# raena-crawler-engine/oliveyoung_crawler/spiders/oliveyoung.py
# (file-viewer header: 94 lines, 3.7 KiB, Python)

import scrapy
from scrapy_splash import SplashRequest
import psycopg2
import logging
import os

# Connection settings for the PostgreSQL database the spider writes to.
#
# Every value can be overridden with the matching OLIVEYOUNG_DB_* environment
# variable; the literals below are only fallbacks kept so existing deployments
# keep working unchanged.
#
# SECURITY NOTE: the fallback credentials are committed to source control.
# They should be rotated and the literal defaults removed once the
# environment variables are provisioned everywhere this spider runs.
config = {
    "db_host": os.environ.get(
        "OLIVEYOUNG_DB_HOST",
        "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
    ),
    "db_port": os.environ.get("OLIVEYOUNG_DB_PORT", "5432"),
    "db": os.environ.get("OLIVEYOUNG_DB_NAME", "analytics"),
    "db_user": os.environ.get("OLIVEYOUNG_DB_USER", "dbadmin"),
    "db_pass": os.environ.get("OLIVEYOUNG_DB_PASS", "5qCif6eyY3Kmg4z"),
}
class OliveyoungSpider(scrapy.Spider):
    """Scrape product listings from the Olive Young Global landing page.

    The landing page is fetched through Splash, each configured page section
    is scanned for product entries, and every entry is written to the
    ``raena_spider_management.oliveyoung_products`` table: an existing row
    (matched on section, brand and name) gets its prices refreshed, otherwise
    a new row is inserted.
    """

    name = 'oliveyoung_product'
    # Scrapy's offsite filtering expects bare host names; a URL entry is
    # ignored (with a warning), so the original URL value never whitelisted
    # the site.
    allowed_domains = ['global.oliveyoung.com']

    # (section label stored in the database, CSS selector of its container).
    # Raw strings keep the CSS escape ``\#`` intact without relying on an
    # invalid Python escape sequence; the selector text itself is unchanged.
    _PRODUCT_SECTIONS = (
        ('Best Sellers', r'#\#tab12'),
        ('MDS PICK', r'#\#tab22'),
        ('K-POP', 'div.main-section:nth-child(6) > div:nth-child(2)'),
        ('Featured', '.main-brand-banner'),
        ('RECOMMENDATION', 'div.main-section:nth-child(9) > div:nth-child(2)'),
        ('FEATURED BRANDS', '#featuredBrands > div:nth-child(2)'),
    )

    # All statements use driver-side placeholders so scraped text is never
    # spliced into the SQL string.
    _SELECT_SQL = """
        select 1
        from raena_spider_management.oliveyoung_products
        where product_section = %s and product_brand = %s and product_name = %s
    """
    _UPDATE_SQL = """
        update raena_spider_management.oliveyoung_products
        set original_price = %s, discounted_price = %s, updatedat = now()
        where product_section = %s and product_brand = %s and product_name = %s
    """
    _INSERT_SQL = """
        insert into raena_spider_management.oliveyoung_products
            (product_section, product_brand, product_name,
             original_price, discounted_price, createdat, updatedat)
        values (%s, %s, %s, %s, %s, now(), now())
    """

    def start_requests(self):
        """Yield a single Splash request for the landing page (5 s wait)."""
        url = 'https://global.oliveyoung.com/'
        yield SplashRequest(url, self.parse, args={'wait': 5})

    def parse(self, response):
        """Extract products from every configured section and persist them.

        Args:
            response: The rendered landing-page response.

        Raises:
            psycopg2.Error: If connecting or any statement fails; the
                connection is closed before the error propagates.
        """
        conn = psycopg2.connect(
            database=config.get('db'),
            user=config.get('db_user'),
            password=config.get('db_pass'),
            host=config.get('db_host'),
            port=config.get('db_port'),
        )
        try:
            logging.info(conn)
            # Each statement is committed on its own, so rows written before
            # a failure are kept.
            conn.autocommit = True
            with conn.cursor() as cur:
                for section_name, section_selector in self._PRODUCT_SECTIONS:
                    rows = self._extract_products(
                        response, section_name, section_selector
                    )
                    for row in rows:
                        self._save_product(cur, section_name, *row)
        finally:
            # Release the connection even when scraping or a query raises.
            conn.close()

    @staticmethod
    def _clean(text):
        """Drop apostrophes and surrounding whitespace from scraped text.

        Apostrophes are still removed (although the queries are now
        parameterized) so that lookup keys keep matching rows stored by
        earlier runs.
        """
        return text.replace("'", "").strip()

    def _extract_products(self, response, section_name, section_selector):
        """Yield ``(brand, name, original_price, discounted_price)`` tuples."""
        if 'FEATURED BRANDS' in section_name:
            # The brand carousel uses different markup from product tiles.
            product_selector = '.fig-title.ellipsis'
            brand_selector = '.fig-title.ellipsis::text'
        else:
            product_selector = '.wrap-prd-info'
            brand_selector = '.list-thumb-tit::text'

        for container in response.css(section_selector):
            for item in container.css(product_selector):
                yield (
                    self._clean(item.css(brand_selector).extract_first("")),
                    self._clean(
                        item.css('.list-thumb-info::text').extract_first("")
                    ),
                    item.css('.price-cost::text').extract_first("").strip(),
                    item.css('.prd-list-amountDue::text')
                    .extract_first("")
                    .strip(),
                )

    def _save_product(self, cur, section_name, product_brand, product_name,
                      original_price, discounted_price):
        """Update the matching row's prices, or insert a new row if none exists."""
        logging.info("Collecting data for: %s", product_name)
        key = (section_name, product_brand, product_name)
        cur.execute(self._SELECT_SQL, key)
        if cur.fetchone():
            cur.execute(
                self._UPDATE_SQL, (original_price, discounted_price) + key
            )
        else:
            cur.execute(
                self._INSERT_SQL, key + (original_price, discounted_price)
            )