94 lines
3.7 KiB
Python
94 lines
3.7 KiB
Python
|
import logging
import os

import psycopg2
import scrapy
from scrapy_splash import SplashRequest
|
||
|
|
||
|
|
||
|
# PostgreSQL connection settings used by the spider.
#
# Every value can be overridden with the environment variable named next to
# it; when the variable is unset the previous hard-coded value is used, so
# existing deployments keep working unchanged.
#
# SECURITY: the fallback credentials below are committed to source control.
# Rotate the password and supply it through OLIVEYOUNG_DB_PASS, then remove
# the literal fallback.
config = {
    "db_host": os.environ.get(
        "OLIVEYOUNG_DB_HOST",
        "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
    ),
    "db_port": os.environ.get("OLIVEYOUNG_DB_PORT", "5432"),
    "db": os.environ.get("OLIVEYOUNG_DB_NAME", "analytics"),
    "db_user": os.environ.get("OLIVEYOUNG_DB_USER", "dbadmin"),
    "db_pass": os.environ.get("OLIVEYOUNG_DB_PASS", "5qCif6eyY3Kmg4z"),
}
|
||
|
|
||
|
|
||
|
class OliveyoungSpider(scrapy.Spider):
    """Scrape product prices from the Olive Young Global home page.

    The home page is fetched through Splash, the products listed in a fixed
    set of home-page sections are extracted, and each product is inserted
    into (or updated in) ``raena_spider_management.oliveyoung_products``.
    """

    name = 'oliveyoung_product'
    # Scrapy expects bare domain names here, not URLs with a scheme, path
    # or query string.
    allowed_domains = ['global.oliveyoung.com']

    # (section label stored in the database, CSS selector of the section).
    # Raw strings keep the CSS escape ``\#`` without relying on Python's
    # deprecated handling of unknown backslash escapes.
    _PRODUCT_SECTIONS = (
        ('Best Sellers', r'#\#tab12'),
        ('MDS PICK', r'#\#tab22'),
        ('K-POP', 'div.main-section:nth-child(6) > div:nth-child(2)'),
        ('Featured', '.main-brand-banner'),
        ('RECOMMENDATION', 'div.main-section:nth-child(9) > div:nth-child(2)'),
        ('FEATURED BRANDS', '#featuredBrands > div:nth-child(2)'),
    )

    # Parameterized statements: values are bound by psycopg2 instead of being
    # interpolated into the SQL text, so scraped text cannot alter the query.
    _SELECT_SQL = """
        select product_section, product_brand, product_name
        from raena_spider_management.oliveyoung_products
        where product_section = %s and product_brand = %s and product_name = %s
    """
    _UPDATE_SQL = """
        update raena_spider_management.oliveyoung_products
        set original_price = %s, discounted_price = %s, updatedat = now()
        where product_section = %s and product_brand = %s and product_name = %s
    """
    _INSERT_SQL = """
        insert into raena_spider_management.oliveyoung_products
            (product_section, product_brand, product_name,
             original_price, discounted_price, createdat, updatedat)
        values (%s, %s, %s, %s, %s, now(), now())
    """

    def start_requests(self):
        """Yield the single home-page request, sent via Splash with ``wait=5``."""
        url = 'https://global.oliveyoung.com/'
        yield SplashRequest(url, self.parse, args={'wait': 5})

    def parse(self, response):
        """Persist every product found in the rendered home page.

        Opens one PostgreSQL connection (settings come from the module-level
        ``config``), writes each extracted product in autocommit mode, and
        always closes the connection, even if extraction or a query fails.
        """
        conn = psycopg2.connect(
            database=config.get('db'),
            user=config.get('db_user'),
            password=config.get('db_pass'),
            host=config.get('db_host'),
            port=config.get('db_port'),
        )
        try:
            logging.info(conn)
            conn.autocommit = True
            with conn.cursor() as cur:
                for row in self._iter_products(response):
                    logging.info("Collecting data for: %s", row[2])
                    self._save_product(cur, row)
        finally:
            conn.close()

    def _iter_products(self, response):
        """Yield (section, brand, name, original_price, discounted_price) per product."""
        for section_name, section_selector in self._PRODUCT_SECTIONS:
            # The featured-brands section uses different markup from the
            # regular product lists.
            if 'FEATURED BRANDS' in section_name:
                product_selector = '.fig-title.ellipsis'
                brand_selector = '.fig-title.ellipsis::text'
            else:
                product_selector = '.wrap-prd-info'
                brand_selector = '.list-thumb-tit::text'

            for section in response.css(section_selector):
                for item in section.css(product_selector):
                    yield (
                        section_name,
                        self._clean(item.css(brand_selector).extract_first("")),
                        self._clean(item.css('.list-thumb-info::text').extract_first("")),
                        item.css('.price-cost::text').extract_first("").strip(),
                        item.css('.prd-list-amountDue::text').extract_first("").strip(),
                    )

    @staticmethod
    def _clean(text):
        """Return *text* without apostrophes and surrounding whitespace.

        Apostrophes were stripped when the SQL was built by string
        formatting; this is kept so lookups still match rows already stored.
        """
        return text.replace("'", "").strip()

    def _save_product(self, cur, row):
        """Update the prices of an existing product row, or insert a new one.

        A row is identified by its (section, brand, name) triple.
        """
        section, brand, product_name, original_price, discounted_price = row
        key = (section, brand, product_name)

        cur.execute(self._SELECT_SQL, key)
        if cur.fetchone():
            cur.execute(self._UPDATE_SQL, (original_price, discounted_price) + key)
        else:
            cur.execute(self._INSERT_SQL, key + (original_price, discounted_price))
|