"""Scrapy spider that records product prices shown on global.oliveyoung.com.

The start page is rendered through Splash, products are read from a fixed
set of page sections, and each product is inserted into (or updated in) the
``raena_spider_management.oliveyoung_products`` PostgreSQL table.
"""

import logging
import os

import psycopg2
import scrapy
from scrapy_splash import SplashRequest

# SECURITY NOTE(review): these credentials are committed in source control.
# The literals are kept only as fallbacks so existing deployments keep
# working; prefer supplying the OLIVEYOUNG_DB_* environment variables, and
# rotate the password and drop the defaults once every deployment does so.
config = {
    "db_host": os.environ.get(
        "OLIVEYOUNG_DB_HOST",
        "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
    ),
    "db_port": os.environ.get("OLIVEYOUNG_DB_PORT", "5432"),
    "db": os.environ.get("OLIVEYOUNG_DB_NAME", "analytics"),
    "db_user": os.environ.get("OLIVEYOUNG_DB_USER", "dbadmin"),
    "db_pass": os.environ.get("OLIVEYOUNG_DB_PASS", "5qCif6eyY3Kmg4z"),
}

# (section label stored in the database, CSS selector of the section container).
# Raw strings keep the literal backslash in ``#\#tab12`` without relying on an
# invalid escape sequence.
_PRODUCT_SECTIONS = [
    ('Best Sellers', r'#\#tab12'),
    ('MDS PICK', r'#\#tab22'),
    ('K-POP', 'div.main-section:nth-child(6) > div:nth-child(2)'),
    ('Featured', '.main-brand-banner'),
    ('RECOMMENDATION', 'div.main-section:nth-child(9) > div:nth-child(2)'),
    ('FEATURED BRANDS', '#featuredBrands > div:nth-child(2)'),
]

# All statements use %s placeholders so psycopg2 quotes the scraped values;
# page text is never interpolated into the SQL string itself.
_SELECT_PRODUCT_SQL = """
    select product_section, product_brand, product_name
    from raena_spider_management.oliveyoung_products
    where product_section = %s
      and product_brand = %s
      and product_name = %s
"""

_UPDATE_PRODUCT_SQL = """
    update raena_spider_management.oliveyoung_products
    set original_price = %s,
        discounted_price = %s,
        updatedat = now()
    where product_section = %s
      and product_brand = %s
      and product_name = %s
"""

_INSERT_PRODUCT_SQL = """
    insert into raena_spider_management.oliveyoung_products(
        product_section, product_brand, product_name,
        original_price, discounted_price, createdat, updatedat)
    values (%s, %s, %s, %s, %s, now(), now())
"""


def _save_product(cur, section, brand, name, original_price, discounted_price):
    """Update the row identified by (section, brand, name), or insert it."""
    key = (section, brand, name)
    cur.execute(_SELECT_PRODUCT_SQL, key)
    if cur.fetchone():
        cur.execute(
            _UPDATE_PRODUCT_SQL, (original_price, discounted_price) + key
        )
    else:
        cur.execute(
            _INSERT_PRODUCT_SQL, key + (original_price, discounted_price)
        )


class OliveyoungSpider(scrapy.Spider):
    """Collect brand, name and prices for products on the Olive Young start page."""

    name = 'oliveyoung_product'
    # Scrapy expects bare domain names here; the original value was a full URL
    # with tracking query parameters, which is not a valid entry.
    allowed_domains = ['global.oliveyoung.com']

    def start_requests(self):
        """Request the start page through Splash, waiting 5 seconds for rendering."""
        url = 'https://global.oliveyoung.com/'
        yield SplashRequest(url, self.parse, args={'wait': 5})

    def parse(self, response):
        """Extract products from each configured section and persist them.

        Opens one autocommit database connection for the whole response and
        always closes it, even if a selector or database call raises.
        """
        conn = psycopg2.connect(
            database=config.get('db'),
            user=config.get('db_user'),
            password=config.get('db_pass'),
            host=config.get('db_host'),
            port=config.get('db_port'),
        )
        try:
            logging.info(conn)
            conn.autocommit = True
            with conn.cursor() as cur:
                for section_name, section_selector in _PRODUCT_SECTIONS:
                    # The featured-brands section uses different markup: the
                    # same element serves as both the item and the brand text.
                    if 'FEATURED BRANDS' in section_name:
                        product_selector = '.fig-title.ellipsis'
                        brand_selector = '.fig-title.ellipsis::text'
                    else:
                        product_selector = '.wrap-prd-info'
                        brand_selector = '.list-thumb-tit::text'

                    for product in response.css(section_selector):
                        for item in product.css(product_selector):
                            # Apostrophes are still stripped so lookups keep
                            # matching rows stored by earlier runs, which
                            # removed them before writing.
                            product_brand = (
                                item.css(brand_selector)
                                .extract_first("")
                                .replace("'", "")
                                .strip()
                            )
                            product_name = (
                                item.css('.list-thumb-info::text')
                                .extract_first("")
                                .replace("'", "")
                                .strip()
                            )
                            original_price = (
                                item.css('.price-cost::text')
                                .extract_first("")
                                .strip()
                            )
                            discounted_price = (
                                item.css('.prd-list-amountDue::text')
                                .extract_first("")
                                .strip()
                            )

                            logging.info("Collecting data for: %s", product_name)
                            _save_product(
                                cur,
                                section_name,
                                product_brand,
                                product_name,
                                original_price,
                                discounted_price,
                            )
        finally:
            conn.close()