commit 3154eec5ab88433cb67ebe9364b798adf0bbe1c6 Author: shariar@raenabeauty.com Date: Wed Jan 24 17:05:07 2024 +0400 first commit diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..3accd96 Binary files /dev/null and b/.DS_Store differ diff --git a/README.md b/README.md new file mode 100644 index 0000000..39af52c --- /dev/null +++ b/README.md @@ -0,0 +1,29 @@ +# README # + +This README would normally document whatever steps are necessary to get your application up and running. + +### What is this repository for? ### + +* Quick summary +* Version +* [Learn Markdown](https://bitbucket.org/tutorials/markdowndemo) + +### How do I get set up? ### + +* Summary of set up +* Configuration +* Dependencies +* Database configuration +* How to run tests +* Deployment instructions + +### Contribution guidelines ### + +* Writing tests +* Code review +* Other guidelines + +### Who do I talk to? ### + +* Repo owner or admin +* Other community or team contact \ No newline at end of file diff --git a/amazon_crawler_engine/amazon_categories.py b/amazon_crawler_engine/amazon_categories.py new file mode 100644 index 0000000..65d2656 --- /dev/null +++ b/amazon_crawler_engine/amazon_categories.py @@ -0,0 +1,194 @@ +import hashlib +import logging +import undetected_chromedriver as webdriver +import psycopg2 +from selenium.webdriver.common.by import By +from pyvirtualdisplay import Display + +from amazon_db_writer import amazon_db_writer +import ssl +ssl._create_default_https_context = ssl._create_unverified_context + + +class amazon_categories: + def __init__(self, config): + self.config = config + self.crawler_name = self.config.get("crawler_name") + self.url = "https://www.amazon.ae/s?rh=n%3A11497859031&ref=lp_11497860031_sar" + self.product_limit = int(self.config.get("product_per_category")) + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Amazon'") + try : self.rce_source_id = self.cur.fetchone()[0] + except: + logging.info("Source tab is empty. Please check. 
Exiting.....") + exit(1) + self.db_writer = amazon_db_writer(config) + + #self.display = Display(visible=0, size=(800, 600)) + #self.display.start() + + def __del__(self): + print("Closing connection.....") + self.conn.close() + #self.display.stop() + + def start_processing(self): + op = webdriver.ChromeOptions() + op.add_argument('--no-sandbox') + op.add_argument('--disable-notifications') + op.add_argument("--lang=en-GB") + #op.headless = True + #driver=webdriver.Chrome(version_main = 113, options=op) + driver=webdriver.Chrome(options=op) + + driver.get(self.url) + + driver.implicitly_wait(10) + + self.get_categories(driver) + + driver.close() + + + def get_categories(self, driver): + + #element = driver.find_elements(By.CSS_SELECTOR,'.bxc-grid__container.bxc-grid__container--width-1500.bxc-grid__mp-gutter-layout') + #sub_cats = element[0].find_elements(By.CSS_SELECTOR,'.bxc-grid__image.bxc-grid__image--light') + sub_cats = driver.find_elements(By.CSS_SELECTOR,'.bxc-grid__image.bxc-grid__image--light') + + + names = ['Perfumes', 'Skin care', 'Hair care', 'Bath & body', 'Makeup', 'Nail care'] + + categories = [] + for sub_cat in sub_cats: + name = sub_cat.find_element(By.TAG_NAME, 'a').get_attribute('aria-label') + if name in names: + link = sub_cat.find_element(By.TAG_NAME, 'a').get_attribute('href') + + category = { + "name": name, + "link": link + } + + categories.append(category) + + print(categories) + self.get_sub_categories(driver, categories) + + def get_sub_categories(self,driver,categories): + + sub_categories = [] + for category in categories: + print("=============== {} ===============".format(category["name"])) + + data = {} + data['parent_category_id'] = 0 + data['rce_source_id'] = self.rce_source_id + data['rce_source_category_id'] = 0 + data['rce_source_status'] = 1 + data['category_name'] = category["name"] + data['category_page_url'] = category["link"] + data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest() + self.db_writer.rce_category(data) + + driver.get(category["link"]) + + ##### Feature Categories + try: + f_cat = driver.find_element(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-title .a-size-extra-large') + if f_cat: + cats_c = driver.find_element(By.CSS_SELECTOR, '.a-section.octopus-pc-category-card-v2-content') + cats = cats_c.find_elements(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-item') + for cat in cats: + cat_name = cat.find_element(By.CSS_SELECTOR, '.a-size-medium.a-color-base.a-text-bold').text + url = cat.find_element(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-category-link').get_attribute("href") + # print('Name: {}, URL: {}'.format(cat_name,url)) + # s_cat = { + # "name": cat_name, + # "link": url + # } + # sub_categories.append(s_cat) + + data = {} + data['parent_category_id'] = 0 + data['rce_source_id'] = self.rce_source_id + data['rce_source_category_id'] = 0 + data['rce_source_status'] = 1 + data['category_name'] = cat_name + data['category_page_url'] = url + data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest() + self.db_writer.rce_category(data) + + try: + sub_cats = cat.find_elements(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-subcategory-link') + + for sub_cat in sub_cats: + s_url = sub_cat.get_attribute('href') + s_title = sub_cat.get_attribute('title') + # print('Title: {}, URL: {}'.format(s_title, s_url)) + # s_cat = { + # "name": s_title, + # "link": s_url + # } + # sub_categories.append(s_cat) + data = {} + 
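
Note: the category-payload block assembled below is repeated for every category level with only the name and URL changing. A minimal sketch of how it could be factored out; `save_category` is a hypothetical helper, not part of this commit, and it assumes the same `amazon_db_writer.rce_category` contract used throughout:

    import hashlib

    def save_category(db_writer, rce_source_id, name, url, parent_category_id=0):
        # Assemble the rce_category payload once; the MD5 of the page URL
        # serves as a stable dedup key across crawls (hypothetical helper).
        data = {
            "parent_category_id": parent_category_id,
            "rce_source_id": rce_source_id,
            "rce_source_category_id": 0,
            "rce_source_status": 1,
            "category_name": name,
            "category_page_url": url,
            "category_page_url_hash": hashlib.md5(url.encode("utf-8")).hexdigest(),
        }
        db_writer.rce_category(data)
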
data['parent_category_id'] = 0 + data['rce_source_id'] = self.rce_source_id + data['rce_source_category_id'] = 0 + data['rce_source_status'] = 1 + data['category_name'] = s_title + data['category_page_url'] = s_url + data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest() + self.db_writer.rce_category(data) + except: + pass + except: + print("Feature Cat not available.") + pass + + ##### Shop by categories + try: + try: + cat_h = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-header') + except: + cat_h = driver.find_element(By.CSS_SELECTOR, '#contentGrid_292470') + pass + if cat_h: + cats_c = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-viewport-row-inner') + cats = cats_c.find_elements(By.TAG_NAME, 'li') + for cat in cats: + cat_name = cat.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-sub-card-title').text + url = cat.find_element(By.TAG_NAME, 'a').get_attribute('href') + # print('Name: {}, URL: {}'.format(cat_name,url)) + # s_cat = { + # "name": cat_name, + # "link": url + # } + # sub_categories.append(s_cat) + data = {} + data['parent_category_id'] = 0 + data['rce_source_id'] = self.rce_source_id + data['rce_source_category_id'] = 0 + data['rce_source_status'] = 1 + data['category_name'] = cat_name + data['category_page_url'] = url + data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest() + self.db_writer.rce_category(data) + except Exception as e: + print('Cat not available') + pass + + print(sub_categories) + + + + + + + + + +# categories = amazon_categories() +# categories.start_processing() \ No newline at end of file diff --git a/amazon_crawler_engine/amazon_category_products.py b/amazon_crawler_engine/amazon_category_products.py new file mode 100644 index 0000000..2e2d175 --- /dev/null +++ b/amazon_crawler_engine/amazon_category_products.py @@ -0,0 +1,186 @@ +import hashlib +import logging +import undetected_chromedriver as webdriver +from selenium.webdriver import ActionChains, Keys +from selenium.webdriver.chrome.service import Service +import psycopg2 +from selenium.webdriver.common.by import By +from amazon_db_writer import amazon_db_writer +from pyvirtualdisplay import Display +from scroller.scroller import smartScroll + + +import ssl +ssl._create_default_https_context = ssl._create_unverified_context + + +class amazon_category_products: + def __init__(self, config): + self.config = config + self.crawler_name = self.config.get("crawler_name") + #self.url = "https://www.amazon.ae/gp/browse.html?node=11497860031&ref_=nav_em_by_all_0_2_11_2" + self.product_limit = int(self.config.get("product_per_category")) + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + sql = "delete from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='"+str(self.crawler_name)+"'" + self.cur.execute(sql) + sql = "select id, category_page_url from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where rce_source_id = 66" + self.cur.execute(sql) + self.categories = self.cur.fetchall() + #self.display = Display(visible=0, size=(800, 600)) + #self.display.start() + + def __del__(self): + print("Closing connection.....") + self.conn.close() + #self.display.stop() + + def start_processing(self): + for category in 
self.categories: + logging.info("======= Fetching products of {}".format(category)) + self.browse_category_page(category) + + + def browse_category_page(self, catagory): + try: + op = webdriver.ChromeOptions() + op.add_argument('--no-sandbox') + op.add_argument('--disable-notifications') + op.add_argument("--lang=en-GB") + #op.headless = True + #driver=webdriver.Chrome(version_main = 113, options=op) + driver=webdriver.Chrome(options=op) + + driver.get(catagory[1]) + + driver.implicitly_wait(10) + + #### Collect section name and section products #### + section_products = self.section_products(driver, catagory[0]) + self.insert_tracker_tab(section_products) + + #### Collect All products #### + self.base_products(driver, catagory[0]) + + + + driver.close() + except Exception as e: + print(e) + + + def section_products(self, driver, catagory): + + elements = driver.find_elements(By.CSS_SELECTOR,".a-size-extra-large.a-color-base.a-text-bold") + section_name = [] + for element in elements: + section_name.append(element.text) + + elements = driver.find_elements(By.CSS_SELECTOR,".a-section.octopus-pc-card-content") + section_products = [] + for element in elements: + objs = element.find_elements(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-item-link') + + urls = [] + for obj in objs: + url = obj.get_attribute("href") + urls.append(url) + section_products.append(urls) + + result = [] + for i in range(len(section_name)): + result.append({ + "catagory": catagory, + "key": section_name[i], + "value": section_products[i] + }) + + return result + + def insert_tracker_tab(self, objs): + + for obj in objs: + category = obj['catagory'] + key = obj['key'] + items = obj['value'] + for item in items: + product_page_url = item + product_page_url_hash = hashlib.md5(product_page_url.encode('utf-8')).hexdigest() + flag = 0 + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where product_page_url = '"+product_page_url+"'" + self.cur.execute(sql) + res = self.cur.fetchall() + + if not res: + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+"(crawler_name,category,keyword,product_page_url,product_page_url_hash,flag) values('"+str(self.crawler_name)+"','"+str(category)+"','"+str(key)+"','"+product_page_url+"','"+product_page_url_hash+"',"+str(flag)+")" + self.cur.execute(sql) + + def base_products(self, driver, catagory): + + try: + smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True) + all_res = driver.find_element(By.CSS_SELECTOR, '#apb-desktop-browse-search-see-all') + all_res.click() + + driver.implicitly_wait(5) + + for i in range(1,16): + items = driver.find_elements(By.CSS_SELECTOR, '.a-size-mini.a-spacing-none.a-color-base.s-line-clamp-4') + + smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True) + + + urls = [] + for item in items: + url = item.find_element(By.TAG_NAME, 'a').get_attribute('href') + urls.append(url) + + result = [{ + "catagory": catagory, + "key": "Base Product Page {}".format(str(i)), + "value": urls + }] + + self.insert_tracker_tab(result) + + try: + driver.find_element(By.CSS_SELECTOR, '.s-pagination-next').click() + driver.implicitly_wait(5) + except: + logging.info("No more page to navigate......") + except: + pass + + + + + +# config = { +# "crawler_name": "raena_crawler_enginer_amazon", +# "crawler_schema": "raena_spider_management", +# "category_tab": "rce_category", +# "tracker_tab": "crawler_tracker", +# "product_tab": "rce_product", +# 
"variant_tab": "rce_product_variant", +# "brand_tab": "rce_brand", +# "reseller_tab": "rce_reseller", +# "reseller_store_tab": "rce_reseller_store", +# "review_tab": "rce_ratings_reviews", +# "review_productmodels_tab": "rce_ratings_reviews_productmodels", +# "review_producttags_tab": "rce_ratings_reviews_producttags", +# "review_tags": "rce_tags", +# "source_tab": "rce_source", +# "product_per_category": "1000", +# "source_category": "11043145", +# "db_user": "postgres", +# "db_pass": "postgres", +# "database": "postgres", +# "db_host": "localhost", +# "db_port": "5444", +# "crawler_main": "1", +# "crawler_slave_no": "" +# } +# amazon_category_products = amazon_category_products(config) +# amazon_category_products.start_processing() \ No newline at end of file diff --git a/amazon_crawler_engine/amazon_crawler.py b/amazon_crawler_engine/amazon_crawler.py new file mode 100644 index 0000000..2554a72 --- /dev/null +++ b/amazon_crawler_engine/amazon_crawler.py @@ -0,0 +1,98 @@ +import logging +import psycopg2 +import json +from datetime import datetime +import smtplib +from email.message import EmailMessage + +from amazon_categories import amazon_categories +from amazon_category_products import amazon_category_products +from amazon_products import amazon_products + + +##### Looger ###### +format = "%(asctime)s: %(message)s" +logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") + +config = {} + +def send_mail(): + + try: + EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5" + EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh" + From = 'data_reporting@raenabeauty.com' + To = 'shariar@raenabeauty.com' + #To = 'shariar@raenabeauty.com' + + html = f''' + + + +
+        <html>
+            <body>
+                <div>
+                    <h2>Amazon Crawler Status</h2>
+                </div>
+                <div>
+                    Error occurred. Please check Amazon Pipeline.
+                </div>
+                <div>
+                    This is system generated mail. Please do not reply
+                </div>
+            </body>
+        </html>
+        '''
+
+        msg = EmailMessage()
+        msg['Subject'] = 'Amazon Crawler Status'
+        msg['From'] = From
+        msg['To'] = To
+        msg.set_content(html, subtype='html')
+
+
+        with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp:
+            smtp.ehlo()
+            smtp.starttls()
+            smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
+            smtp.send_message(msg)
+    except Exception as e:
+        logging.info("Error while sending mail: {}".format(e))
+
+def main():
+    # start = datetime.now()
+    # categories = amazon_categories(config)
+    # categories.start_processing()
+    # end = datetime.now()
+    # logging.info('Total time taken to fetch the categories: {}'.format(str(end-start)))
+    #
+    # start = datetime.now()
+    # products = amazon_category_products(config)
+    # products.start_processing()
+    # end = datetime.now()
+    # logging.info('Total time taken to fetch the category products: {}'.format(str(end-start)))
+
+
+    product_info = amazon_products(config)
+    product_info.start_processing()
+
+    # ###### For test
+    # item = (100, 'raena_crawler_enginer_amazon', '3066', 'Up to 25 AED', 'https://www.amazon.ae/Ross-Massager-Shampoo-Silicone-Bristles/dp/B09JGH1WM3?ref_=Oct_d_oup_d_12149480031_0&pd_rd_w=lfMTW&content-id=amzn1.sym.d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_p=d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_r=C1QM2XCSJDBVMS27JV7E&pd_rd_wg=gkRZv&pd_rd_r=f5af13ee-c6c4-4d8a-8677-cba9cbacdace&pd_rd_i=B09JGH1WM3', '8f0540b5919e176303cf24a1d46b0e1c', 0)
+    # product_info.get_product_info(item)
+
+
+if __name__ == "__main__":
+    logging.info("Starting Amazon Crawler.......")
+    try:
+        logging.info("Loading config file.......")
+        with open("conf.json", "r") as jsonfile:
+            config = json.load(jsonfile)
+        logging.info("Config file loaded.......")
+        print(config)
+
+        main()
+
+    except Exception as e:
+        logging.info("Error: {}".format(e))
+        #logging.info("Cannot load config file. Please check.
Exiting......") + send_mail() + exit(1) \ No newline at end of file diff --git a/amazon_crawler_engine/amazon_db_writer.py b/amazon_crawler_engine/amazon_db_writer.py new file mode 100755 index 0000000..193151d --- /dev/null +++ b/amazon_crawler_engine/amazon_db_writer.py @@ -0,0 +1,589 @@ +import logging +import psycopg2 + +###### Looger ###### +format = "%(asctime)s: %(message)s" +logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") + +class amazon_db_writer: + def __init__(self, config): + self.config = config + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + + def __del__(self): + logging.info("Closing connection.....") + self.conn.close() + + def rce_category(self, data): + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where category_name = '"+str(data['category_name'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + cat_name = data['category_name'].replace("'","''") + cat_url = data['category_page_url'].replace("'","''") + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" (parent_category_id,rce_source_id," \ + "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name) values (" \ + +str(data['parent_category_id'])+","+str(data['rce_source_id'])+", "+str(data['rce_source_category_id'])+", "+str(data['rce_source_status'])+", " \ + "'"+str(cat_url)+"', '"+str(data['category_page_url_hash'])+"', '"+str(cat_name)+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \ + "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \ + "select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \ + "category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \ + "where rce_source_category_id = "+ str(data['rce_source_category_id']) + #logging.info(sql) + + self.cur.execute(sql) + + else: + if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \ + str(data['category_page_url'])==str(res[5]): + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set updatedat=now() " \ + "where category_name = '"+ str(res[7])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set parent_category_id = " \ + ""+str(data['parent_category_id'])+", rce_source_category_id = "+str(data['rce_source_category_id'])+", " \ + "category_name='"+str(cat_name)+"', category_page_url='"+str(cat_url)+"', " \ + "category_page_url_hash='"+str(data['category_page_url_hash'])+"', updatedat=now() 
where " \ + "category_name = '"+ str(res[7])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \ + "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \ + "select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \ + "category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \ + "where category_name = '"+ str(res[7])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_product(self, data): + + data['product_page_url'] = data['product_page_url'].replace("'","''") + data['rce_source_product_name'] = data['rce_source_product_name'].replace("'","''") + data['product_description'] = data['product_description'].replace("'","''") + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url = '"+str(data['product_page_url'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" (rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "product_section,rce_source_id) values("+str(data['rce_source_product_id'])+","+str(data['rce_source_product_status'])+",'"+str(data['product_page_url'])+"'," \ + "'"+str(data['product_page_url_hash'])+"',"+str(data['rce_category_id'])+","+str(data['rce_brand_id'])+","+str(data['rce_store_id'])+"," \ + "'"+str(data['rce_source_product_name'])+"','"+str(data['product_images'])+"','"+str(data['product_description'])+"',"+str(data['product_sold_total'])+"," \ + ""+str(data['product_sold'])+",'"+str(data['product_price_min'])+"','"+str(data['product_price_min_before_discount'])+"','"+str(data['product_price_max'])+"'," \ + "'"+str(data['product_price_max_before_discount'])+"','"+str(data['ratings'])+"','"+str(data['product_section'])+"',"+str(data['rce_source_id'])+")" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \ + 
"product_page_url='"+str(data['product_page_url'])+"'" + #logging.info(sql) + self.cur.execute(sql) + else: + + if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \ + str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \ + str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \ + str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \ + str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \ + str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) \ + and str(data['rce_source_id'])==str(res[21]) \ + and str(data['product_section'])==str(res[22]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set updatedat=now() " \ + "where product_page_url = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set rce_source_product_id="+str(data['rce_source_product_id'])+"," \ + "rce_source_product_status="+str(data['rce_source_product_status'])+",product_page_url='"+str(data['product_page_url'])+"',product_page_url_hash= " \ + "'"+str(data['product_page_url_hash'])+"',rce_category_id="+str(data['rce_category_id'])+",rce_brand_id="+str(data['rce_brand_id'])+"," \ + "rce_store_id="+str(data['rce_store_id'])+",rce_source_product_name='"+str(data['rce_source_product_name'])+"',product_images='"+str(data['product_images'])+"'" \ + ",product_description='"+str(data['product_description'])+"',product_sold_total="+str(data['product_sold_total'])+",product_sold="+str(data['product_sold'])+"," \ + "product_price_min='"+str(data['product_price_min'])+"',product_price_min_before_discount='"+str(data['product_price_min_before_discount'])+"'," \ + "product_price_max='"+str(data['product_price_max'])+"',product_price_max_before_discount='"+str(data['product_price_max_before_discount'])+"',ratings='"+str(data['ratings'])+"'," \ + "product_section='"+str(data['product_section'])+"', updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where product_page_url = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + 
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \ + "product_page_url='"+str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + + def rce_product_variant(self, data): + data['product_variant_name'] = data['product_variant_name'].replace("'","''") + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name = '"+str(data['product_variant_name'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" (rce_source_variant_id,rce_product_id," \ + "product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock) values("+str(data['rce_source_variant_id'])+"," \ + ""+str(data['rce_product_id'])+",'"+str(data['product_variant_name'])+"','"+str(data['product_variant_price'])+"'," \ + "'"+str(data['product_variant_price_before_discount'])+"',"+str(data['product_variant_stock'])+")" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \ + "product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(data['product_variant_name'])+"'" + #logging.info(sql) + self.cur.execute(sql) + + else: + if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \ + str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set updatedat=now() " \ + "where product_variant_name = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set rce_source_variant_id="+str(data['rce_source_variant_id'])+", " \ + "rce_product_id="+str(data['rce_product_id'])+", product_variant_name='"+str(data['product_variant_name'])+"', product_variant_price=" \ + "'"+str(data['product_variant_price'])+"',product_variant_price_before_discount='"+str(data['product_variant_price_before_discount'])+"'," \ + "product_variant_stock="+str(data['product_variant_stock'])+", updatedat=now() where product_variant_name = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \ + 
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(res[3])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + + def rce_brand(self, data): + data['brand_page_url'] = data['brand_page_url'].replace("'","''") + data['brand_name'] = data['brand_name'].replace("'","''") + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url = '"+str(data['brand_page_url'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + if not res: + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" (rce_source_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name) values("+str(data['rce_source_id'])+"," \ + ""+str(data['rce_source_brand_status'])+",'"+str(data['brand_page_url'])+"','"+str(data['brand_page_url_hash'])+"'," \ + "'"+str(data['brand_name'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(data['brand_page_url'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \ + str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set updatedat=now() " \ + "where brand_page_url = '"+ str(res[4])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \ + "rce_source_brand_status="+str(data['rce_source_brand_status'])+", brand_page_url='"+str(data['brand_page_url'])+"', brand_page_url_hash=" \ + "'"+str(data['brand_page_url_hash'])+"',brand_name='"+str(data['brand_name'])+"', updatedat=now() where brand_page_url = '"+ str(res[4])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status, " \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(res[4])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_reseller(self, data): + data['reseller_name'] = data['reseller_name'].replace("'","''") + + sql = "select * from 
"+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" (rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description) values("+str(data['rce_source_id'])+"," \ + ""+str(data['rce_source_reseller_status'])+",'"+str(data['reseller_name'])+"','"+str(data['reseller_average_rating'])+"'," \ + "'"+str(data['reseller_description'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(data['reseller_name'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]) and \ + str(data['reseller_average_rating'])==str(res[5]) and str(data['reseller_description'])==str(res[6]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=now() " \ + "where reseller_name = '"+ str(res[4])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \ + "rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \ + "'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=now() where reseller_name = '"+ str(res[4])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_reseller_store(self, data): + + data['store_page_url'] = data['store_page_url'].replace("'","''") + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" (rce_source_store_status," \ + 
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,rce_source_id) values(" \ + ""+str(data['rce_source_store_status'])+",'"+str(data['store_page_url'])+"','"+str(data['store_page_url_hash'])+"'," \ + "'"+str(data['store_location'])+"', "+str(data['rce_reseller_id'])+", "+str(data['rce_source_id'])+")" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(data['store_page_url'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \ + str(data['store_page_url_hash'])==str(res[4]) and str(data['store_location'])==str(res[5]) and \ + str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=now() " \ + "where store_page_url = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \ + "rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \ + "'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \ + "updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_ratings_reviews(self, data): + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id = "+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + data['username'] = data['username'].replace("'","''") + data['img_url'] = data['img_url'].replace("'","''") + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" (id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating) 
values("+str(data['id'])+","+str(data['rce_product_id'])+"," \ + "'"+str(data['username'])+"','"+str(data['review'])+"','"+str(data['img_url'])+"',"+str(data['review_like_count'])+",'"+str(data['user_tier'])+"'," \ + ""+str(data['shop_id'])+", '"+str(data['video_url'])+"', '"+str(data['rating'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \ + str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \ + str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]): + + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=now() " \ + "where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \ + "username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \ + "'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \ + "shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=now() " \ + "where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(res[1])+" and username ='"+str(data['username'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_ratings_reviews_productmodels(self,data): + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id']) + self.cur.execute(sql) + res = self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \ + "values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')" + #logging.info(sql) + + 
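
Note: every statement in this writer builds SQL by string concatenation, which is why values need the manual `replace("'","''")` escaping and why unusual names can still break a query. A sketch of the same productmodels insert using psycopg2 parameter binding instead (assumes psycopg2 >= 2.7 for the `sql` composition module; `insert_productmodel` is a hypothetical helper, not part of this commit):

    from psycopg2 import sql

    def insert_productmodel(cur, config, data):
        # Identifiers (schema/table) come from config, so compose them with
        # psycopg2.sql; the values travel as bound parameters, so the driver
        # handles all quoting and escaping.
        query = sql.SQL(
            "insert into {}.{} (rce_rating_id, model_id) values (%s, %s)"
        ).format(
            sql.Identifier(config.get("crawler_schema")),
            sql.Identifier(config.get("review_productmodels_tab")),
        )
        cur.execute(query, (data["rce_rating_id"], data["model_id"]))
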
self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \ + "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+"" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=now() " \ + "where rce_rating_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \ + "updatedat=now() where rce_source_store_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \ + "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+"" + #logging.info(sql) + + self.cur.execute(sql) + + + def rce_tags(self,data): + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \ + "values("+str(data['id'])+",'"+str(data['description'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \ + "createdat,updatedat) select id,description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['description'])==str(res[1]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=now() " \ + "where description = '"+ str(res[1])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \ + "updatedat=now() where description = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \ + 
"createdat,updatedat) select id,description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + + def rce_ratings_reviews_producttags(self,data): + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \ + "values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \ + "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_rating_id'])==str(res[1]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=now() " \ + "where rce_rating_id = '"+ str(res[1])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \ + "updatedat=now() where rce_rating_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \ + "createdat,updatedat) select id,description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + + diff --git a/amazon_crawler_engine/amazon_product_adhoc.py b/amazon_crawler_engine/amazon_product_adhoc.py new file mode 100644 index 0000000..be13049 --- /dev/null +++ b/amazon_crawler_engine/amazon_product_adhoc.py @@ -0,0 +1,174 @@ +import hashlib +import logging +import random +import sys +import string +#from selenium import webdriver +import undetected_chromedriver as webdriver +from selenium.webdriver.common.by import By +import psycopg2 +import time +import re +from amazon_db_writer import amazon_db_writer +from datetime import datetime +from pyvirtualdisplay import Display + +import ssl +ssl._create_default_https_context = ssl._create_unverified_context + +class amazon_products_adhoc: + def __init__(self, config): + self.config = config + self.crawler_name = self.config.get("crawler_name") + self.pattern = r'[' + string.punctuation + ']' + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = 
self.conn.cursor() + sql = f"""select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where rce_source_id=66 and product_price_min= '' order by id desc""" + self.cur.execute(sql) + self.items = self.cur.fetchall() + self.db_writer = amazon_db_writer(config) + #self.display = Display(visible=0, size=(800, 600)) + #self.display.start() + + + def __del__(self): + print("Closing connection.....") + self.conn.close() + #self.display.stop() + + def start_processing(self): + op = webdriver.ChromeOptions() + op.add_argument('--no-sandbox') + op.add_argument('--disable-notifications') + op.add_argument("--lang=en-GB") + op.add_argument('--user-data-dir=/home/ec2-user/chrome_cache/') + driver=webdriver.Chrome(options=op) + count = 0 + for item in self.items: + count += 1 + try: + logging.info("============== Getting info for {}/{}: {} ================".format(str(count),str(len(self.items)),str(item))) + start = datetime.now() + + driver.get(item[3]) + self.product_info(driver, item) + + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1 where product_page_url_hash='{item[4]}' + """ + self.cur.execute(sql) + end = datetime.now() + logging.info('Total time taken to fetch the product: {}'.format(str(end-start))) + time.sleep(5) + except Exception as e: + print(e) + driver.close() + + def product_info(self, driver, item): + + data_product = {} + + data_product['rce_source_product_id'] = item[1] + data_product['rce_source_id'] = item[21] + data_product['rce_source_product_status'] = item[2] + data_product['product_page_url'] = item[3] + data_product['product_page_url_hash'] = item[4] + data_product['rce_category_id'] = item[5] + data_product['rce_brand_id'] = item[6] + data_product['rce_store_id'] = item[7] + data_product['rce_source_product_name'] = item[8] + data_product['product_images'] = item[9] + data_product['product_description'] = item[10] + data_product['product_sold_total'] = item[11] + data_product['product_sold'] = item[12] + data_product['product_price_min'] = item[13] + data_product['product_price_min_before_discount'] =item[14] + data_product['product_price_max'] = item[15] + data_product['product_price_max_before_discount'] = item[16] + data_product['ratings'] = item[17] + data_product['product_section'] = item[22] + + + # try: + # data_product['product_price_min'] = (driver.find_element(By.CSS_SELECTOR, '#corePrice_desktop > div > table > tbody > tr:nth-child(2) > td.a-span12 > span.a-price.a-text-price.a-size-medium.apexPriceToPay > span:nth-child(2)').text).replace('AED', '') + # data_product['product_price_max'] = data_product['product_price_min'] + # + # except: + # + # try: + # price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text + # price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text + # + # price = price_whole+"."+price_fraction + # data_product['product_price_min'] = price + # data_product['product_price_max'] = price + # except: + # try: + # data_product['product_price_min'] =(driver.find_element(By.CSS_SELECTOR, '#sns-base-price > div > span.a-price.a-text-price.a-size-medium.apexPriceToPay > span:nth-child(2)').text).replace('AED','') + # data_product['product_price_max'] = data_product['product_price_min'] + # except: + # data_product['product_price_min'] = (driver.find_element(By.CSS_SELECTOR, '#sns-base-price').text).replace('AED','') + # data_product['product_price_max'] = 
data_product['product_price_min'] + # pass + # pass + # + # pass + + try: + data_product['product_price_min'] = (driver.find_element(By.CSS_SELECTOR, '#sns-base-price').text).replace('AED', '') + data_product['product_price_max'] = data_product['product_price_min'] + + except: + price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text + price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text + + price = price_whole+"."+price_fraction + data_product['product_price_min'] = price + data_product['product_price_max'] = price + pass + + print("product_price_min: {}".format(data_product['product_price_min'])) + + try: + data_product['product_price_min_before_discount'] = (driver.find_element(By.CSS_SELECTOR, '.a-text-price').text).replace('AED', '') + data_product['product_price_max_before_discount'] = data_product['product_price_min_before_discount'] + except: + pass + + try: + self.db_writer.rce_product(data_product) + except Exception as e: + logging.info(e) + + + + +config = { + "crawler_name": "raena_crawler_enginer_amazon", + "crawler_schema": "raena_spider_management", + "category_tab": "rce_category", + "tracker_tab": "crawler_tracker", + "product_tab": "rce_product", + "variant_tab": "rce_product_variant", + "brand_tab": "rce_brand", + "reseller_tab": "rce_reseller", + "reseller_store_tab": "rce_reseller_store", + "review_tab": "rce_ratings_reviews", + "review_productmodels_tab": "rce_ratings_reviews_productmodels", + "review_producttags_tab": "rce_ratings_reviews_producttags", + "review_tags": "rce_tags", + "source_tab": "rce_source", + "product_per_category": "1000", + "source_category": "11043145", + "db_user": "dbadmin", + "db_pass": "5qCif6eyY3Kmg4z", + "database": "analytics", + "db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com", + "db_port": "5432", + "crawler_main": "1", + "crawler_slave_no": "" +} + +amazon_products_adhoc = amazon_products_adhoc(config) +amazon_products_adhoc.start_processing() \ No newline at end of file diff --git a/amazon_crawler_engine/amazon_products.py b/amazon_crawler_engine/amazon_products.py new file mode 100755 index 0000000..8e2f128 --- /dev/null +++ b/amazon_crawler_engine/amazon_products.py @@ -0,0 +1,516 @@ +import hashlib +import logging +import random +import sys +import string +import undetected_chromedriver as webdriver +from selenium.webdriver.common.by import By +import psycopg2 +import time +import re +from amazon_db_writer import amazon_db_writer +from datetime import datetime +from pyvirtualdisplay import Display + +import ssl +ssl._create_default_https_context = ssl._create_unverified_context + +class amazon_products: + def __init__(self, config): + self.config = config + self.crawler_name = self.config.get("crawler_name") + self.pattern = r'[' + string.punctuation + ']' + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Amazon'") + self.rce_source_id = self.cur.fetchone()[0] + self.cur.execute("select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='raena_crawler_enginer_amazon' and flag=0 order by id") + self.items = 
self.cur.fetchall()
+        self.db_writer = amazon_db_writer(config)
+        #self.display = Display(visible=0, size=(800, 600))
+        #self.display.start()
+
+
+    def __del__(self):
+        print("Closing connection.....")
+        self.conn.close()
+        #self.display.stop()
+
+    def start_processing(self):
+        count = 0
+        for item in self.items:
+            count += 1
+            try:
+                logging.info("============== Getting info for {}/{}: {} ================".format(str(count),str(len(self.items)),str(item)))
+                start = datetime.now()
+                self.get_product_info(item)
+                end = datetime.now()
+                logging.info('Total time taken to fetch the product: {}'.format(str(end-start)))
+            except Exception as e:
+                print(e)
+
+    def reseller_info(self, driver):
+        try:
+            store_urls = []
+            try:
+                driver.find_element(By.CSS_SELECTOR, '.a-icon.a-icon-arrow.a-icon-small.arrow-icon').click()
+                time.sleep(5)
+
+                offers = driver.find_elements(By.CSS_SELECTOR, '#aod-offer-soldBy')
+
+                for offer in offers:
+                    try:
+                        store_url = offer.find_element(By.CSS_SELECTOR, '.a-fixed-left-grid-col.a-col-right').find_element(By.TAG_NAME, 'a').get_attribute('href')
+                        store_urls.append(store_url)
+                    except:
+                        pass
+            except:
+                try:
+                    store_url = driver.find_element(By.CSS_SELECTOR, '#sellerProfileTriggerId').get_attribute('href')
+                    store_urls.append(store_url)
+                except:
+                    pass
+
+            if store_urls:
+
+                store_urls = list(set(store_urls))
+
+                return_item = ""
+                flag = 0
+
+                for store_url in store_urls:
+                    driver.get(store_url)
+                    driver.implicitly_wait(5)
+
+                    ##### reseller info
+
+                    data_reseller = {}
+                    data_reseller['rce_source_id'] = self.rce_source_id
+                    data_reseller['rce_source_reseller_status'] = 1
+                    data_reseller['reseller_name'] = ""
+                    data_reseller['reseller_average_rating'] = 0.0
+                    data_reseller['reseller_description'] = ""
+
+                    try:
+                        data_reseller['reseller_name'] = driver.find_element(By.CSS_SELECTOR,'#seller-name').text
+                        data_reseller['reseller_name'] = data_reseller['reseller_name'].replace("'","")
+                    except:
+                        pass
+
+                    try:
+                        data_reseller['reseller_average_rating'] = float(driver.find_element(By.CSS_SELECTOR,'#effective-timeperiod-rating-year-description').text)
+                    except:
+                        pass
+
+                    try:
+                        data_reseller['reseller_description'] = driver.find_element(By.CSS_SELECTOR, '#spp-expander-about-seller .a-row').text
+                        data_reseller['reseller_description'] = data_reseller['reseller_description'].replace("'","")
+                    except:
+                        pass
+                    try:
+                        self.db_writer.rce_reseller(data_reseller)
+                    except Exception as e:
+                        logging.info(e)
+
+                    ##### Store info
+
+                    data_reseller_store = {}
+                    data_reseller_store['rce_source_store_status'] = 1
+                    data_reseller_store['store_page_url'] = store_url
+                    data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest()
+                    data_reseller_store['store_location'] = ""
+                    data_reseller_store['rce_reseller_id'] = ""
+                    data_reseller_store['rce_source_id'] = self.rce_source_id
+
+                    try:
+                        self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data_reseller['reseller_name'])+"'")
+                        rce_reseller_id = self.cur.fetchone()
+                        data_reseller_store['rce_reseller_id'] = rce_reseller_id[0]
+                        if flag == 0:
+                            return_item = data_reseller_store['rce_reseller_id']
+                            flag = 1
+                    except:
+                        pass
+
+                    try:
+                        self.db_writer.rce_reseller_store(data_reseller_store)
+                    except Exception as e:
logging.info(e) + + time.sleep(2) + else: + + ##### reseller info + + data_reseller = {} + data_reseller['rce_source_id'] = self.rce_source_id + data_reseller['rce_source_reseller_status'] = 1 + data_reseller['reseller_name'] = "Amazon.ae" + data_reseller['reseller_average_rating'] = 0.0 + data_reseller['reseller_description'] = "" + + + try: + self.db_writer.rce_reseller(data_reseller) + except Exception as e: + logging.info(e) + + ##### Store info + + data_reseller_store = {} + data_reseller_store['rce_source_store_status'] = 1 + data_reseller_store['store_page_url'] = "amazon.ae" + data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest() + data_reseller_store['store_location'] = "" + data_reseller_store['rce_reseller_id'] = "" + data_reseller_store['rce_source_id'] = self.rce_source_id + + try: + self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data_reseller['reseller_name'])+"'") + rce_reseller_id = self.cur.fetchone() + data_reseller_store['rce_reseller_id'] = rce_reseller_id[0] + return_item = data_reseller_store['rce_reseller_id'] + except: + pass + + try: + self.db_writer.rce_reseller_store(data_reseller_store) + except Exception as e: + logging.info(e) + + + + return return_item + + except Exception as e: + print(e) + + def brand_info(self, driver): + data_brand = {} + + data_brand['rce_source_id'] = self.rce_source_id + data_brand['rce_source_brand_status'] = 1 + data_brand['brand_page_url'] = "" + data_brand['brand_page_url_hash'] = "" + data_brand['brand_name'] = "" + + try: + data_brand['brand_page_url'] = driver.find_element(By.CSS_SELECTOR, '#bylineInfo').get_attribute('href') + data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest() + + try: + data_brand['brand_name'] = driver.find_element(By.CSS_SELECTOR, '.po-brand .po-break-word').text + except: + pass + + try: + self.db_writer.rce_brand(data_brand) + except Exception as e: + logging.info(e) + + return data_brand['brand_name'] + except: + pass + + def product_info(self, driver, category, keyword, url, url_hash, brand_name, rce_reseller_id): + data_product = {} + + data_product['rce_source_product_id'] = 0 + data_product['rce_source_id'] = self.rce_source_id + data_product['rce_source_product_status'] = 1 + data_product['product_page_url'] = url.replace("'","''") + data_product['product_page_url_hash'] = url_hash + data_product['rce_category_id'] = category + data_product['rce_brand_id'] = "" + data_product['rce_store_id'] = "" + data_product['rce_source_product_name'] = "" + data_product['product_images'] = "" + data_product['product_description'] = "" + data_product['product_sold_total'] = 0 + data_product['product_sold'] = 0 + data_product['product_price_min'] = "" + data_product['product_price_min_before_discount'] ="" + data_product['product_price_max'] = "" + data_product['product_price_max_before_discount'] = "" + data_product['ratings'] = 0.0 + data_product['product_section'] = keyword + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_name = '"+str(brand_name)+"'" + self.cur.execute(sql) + data_product['rce_brand_id'] = self.cur.fetchone()[0] + except: pass + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+"" + self.cur.execute(sql) + 
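# Editor's sketch (not part of the commit): the id lookups in this method splice
# values straight into SQL strings, which is why names elsewhere need
# .replace("'","''") workarounds. psycopg2 placeholders avoid the quoting
# problem for values; identifiers such as schema/table names still have to be
# formatted in separately:
# sql = "select id from {}.{} where brand_name = %s".format(
#     self.config.get('crawler_schema'), self.config.get('brand_tab'))
# self.cur.execute(sql, (brand_name,))
# row = self.cur.fetchone()  # None when the brand has not been written yet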
+            data_product['rce_store_id'] = self.cur.fetchone()[0]
+        except: pass
+
+        try:
+            rce_source_product_name = driver.find_element(By.CSS_SELECTOR,'#productTitle').text
+            data_product['rce_source_product_name'] = str(re.sub(self.pattern, '', rce_source_product_name)).replace("'","''")
+        except: pass
+
+        try:
+            product_images_element = driver.find_element(By.CSS_SELECTOR, '#magnifierLens')
+            product_images_raw = product_images_element.find_elements(By.TAG_NAME, 'img')
+
+            product_images = []
+            for product_image in product_images_raw:
+                url = product_image.get_attribute('src')
+                product_images.append(url)
+
+            data_product['product_images'] = str(product_images)
+
+        except: pass
+
+        try:
+            description = ""
+            des_rank = ""
+            try:
+                des_raws = driver.find_element(By.CSS_SELECTOR, '.a-unordered-list.a-vertical.a-spacing-mini').find_elements(By.CSS_SELECTOR, '.a-list-item')
+
+                for des_raw in des_raws:
+                    try:
+                        des = des_raw.text
+                        description += des
+                    except:
+                        pass
+            except:
+                pass
+            try:
+                des_rank = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[6]/div[24]/div/ul[1]').find_element(By.CSS_SELECTOR, '.a-list-item').text
+            except:
+                pass
+            data_product['product_description'] = description+des_rank
+        except:
+            pass
+
+        try:
+            price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
+            price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
+
+            price = price_whole+"."+price_fraction
+
+            data_product['product_price_min'] = price
+            data_product['product_price_max'] = price
+        except:
+            pass
+
+        try:
+            data_product['product_price_min_before_discount'] = (driver.find_element(By.CSS_SELECTOR, '.a-text-price').text).replace('AED', '')
+            data_product['product_price_max_before_discount'] = data_product['product_price_min_before_discount']
+        except:
+            pass
+
+        try:
+            data_product['ratings'] = driver.find_element(By.CSS_SELECTOR, '#averageCustomerReviews .a-color-base').text
+        except:
+            pass
+
+        try:
+            self.db_writer.rce_product(data_product)
+        except Exception as e:
+            logging.info(e)
+
+        ### rce_product_variant
+        try:
+            is_variant = driver.find_element(By.CSS_SELECTOR, '.a-unordered-list.a-nostyle.a-button-list.a-declarative.a-button-toggle-group.a-horizontal.a-spacing-top-micro.swatches.swatchesSquare.imageSwatches')
+            if is_variant:
+                variants = is_variant.find_elements(By.TAG_NAME, 'li')
+                #random.shuffle(variants)
+
+                for variant in variants:
+                    variant.click()
+                    data_variant = {}
+
+                    data_variant['rce_source_variant_id'] = 0
+                    data_variant['rce_product_id'] = ""
+                    data_variant['product_variant_name'] = ""
+                    data_variant['product_variant_price'] = ""
+                    data_variant['product_variant_price_before_discount'] = ""
+                    data_variant['product_variant_stock'] = 0
+
+                    try:
+                        sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_name = '"+str(data_product['rce_source_product_name'])+"'"
+                        self.cur.execute(sql)
+                        data_variant['rce_product_id'] = self.cur.fetchone()[0]
+                    except:
+                        pass
+
+                    try:
+                        product_variant_name =
driver.find_element(By.CSS_SELECTOR,'#productTitle').text + data_variant['product_variant_name'] = str(re.sub(self.pattern, '', product_variant_name)).replace("'","''") + except: pass + + try: + d_price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text + d_price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text + + price = d_price_whole+"."+d_price_fraction + + data_variant['product_variant_price'] = price + except: + pass + + try: + data_variant['product_variant_price_before_discount'] = (driver.find_element(By.CSS_SELECTOR, '.a-text-price').text).replace('AED', '') + except: + pass + + try: + self.db_writer.rce_product_variant(data_variant) + except Exception as e: + logging.info(e) + + time.sleep(random.randint(2,5)) + + else: + logging.info('No variant found') + except: + logging.info('No variant found') + pass + + + + def rating_info(self, driver, rce_reseller_id, url_hash): + + try: + driver.find_element(By.CSS_SELECTOR, '#reviews-medley-footer .a-link-emphasis').click() + driver.implicitly_wait(5) + + data_reviews = driver.find_elements(By.CSS_SELECTOR, '.a-section.review.aok-relative') + + + for data in data_reviews: + + data_review = {} + + data_review["id"] = "" + data_review["rce_product_id"] = "" + data_review["username"] = "" + data_review["review"] = "" + data_review["img_url"] = "" + data_review["review_like_count"] = 0 + data_review["user_tier"] = "" + data_review["shop_id"] = 0 + data_review["video_url"] = "" + data_review["rating"] = "" + + try: + sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab') + self.cur.execute(sql) + rating_id = self.cur.fetchone() + + if rating_id[0]==None: + rating_id = 1 + else: + rating_id = int(rating_id[0]) + 1 + + data_review["id"] = rating_id + except: + pass + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url_hash = '"+str(url_hash)+"'" + self.cur.execute(sql) + data_review["rce_product_id"] = self.cur.fetchone()[0] + except: pass + + try: data_review["username"] = data.find_element(By.CSS_SELECTOR, '.a-profile-name').text + except: pass + + try: + data_review["review"] = data.find_element(By.CSS_SELECTOR, '.a-size-base.review-text.review-text-content').text + data_review["review"] = data_review["review"].replace("'","") + except: pass + + try: + rating = data.find_element(By.CSS_SELECTOR, '.a-icon.a-icon-star.review-rating .a-icon-alt').get_attribute("textContent") + data_review["rating"] = rating.replace(' out of 5 stars', '') + except: pass + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+"" + self.cur.execute(sql) + data_review["shop_id"] = self.cur.fetchone()[0] + except: pass + + try: + self.db_writer.rce_ratings_reviews(data_review) + except Exception as e: + logging.info(e) + except: + pass + + + + def get_product_info(self,item): + try: + op = webdriver.ChromeOptions() + op.add_argument('--no-sandbox') + op.add_argument('--disable-notifications') + op.add_argument("--lang=en-GB") + op.add_argument('--user-data-dir=/home/ec2-user/chrome_cache/') + #op.headless = True + driver=webdriver.Chrome(options=op) + + try: + driver.get('https://www.amazon.ae') + time.sleep(3) + except Exception as e: + print(e) + + + + ##### Reseller info ##### + driver.get(item[4]) + driver.implicitly_wait(5) + rce_reseller_id 
= self.reseller_info(driver) + + + + ##### Product Info ##### + driver.get(item[4]) + driver.implicitly_wait(5) + ##### Brand Info + brand_name = self.brand_info(driver) + ##### Product info + self.product_info(driver, item[2], item[3], item[4], item[5], brand_name, rce_reseller_id) + + + ##### Rating Info ##### + driver.get(item[4]) + driver.implicitly_wait(5) + self.rating_info(driver, rce_reseller_id, item[5]) + + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1 where product_page_url_hash='{item[5]}' + """ + self.cur.execute(sql) + + + driver.close() + except Exception as e: + print(e) + driver.close() + diff --git a/amazon_crawler_engine/conf.json b/amazon_crawler_engine/conf.json new file mode 100755 index 0000000..5cd659e --- /dev/null +++ b/amazon_crawler_engine/conf.json @@ -0,0 +1,25 @@ +{ + "crawler_name": "raena_crawler_enginer_amazon", + "crawler_schema": "raena_spider_management", + "category_tab": "rce_category", + "tracker_tab": "crawler_tracker", + "product_tab": "rce_product", + "variant_tab": "rce_product_variant", + "brand_tab": "rce_brand", + "reseller_tab": "rce_reseller", + "reseller_store_tab": "rce_reseller_store", + "review_tab": "rce_ratings_reviews", + "review_productmodels_tab": "rce_ratings_reviews_productmodels", + "review_producttags_tab": "rce_ratings_reviews_producttags", + "review_tags": "rce_tags", + "source_tab": "rce_source", + "product_per_category": "1000", + "source_category": "11043145", + "db_user": "dbadmin", + "db_pass": "5qCif6eyY3Kmg4z", + "database": "analytics", + "db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com", + "db_port": "5432", + "crawler_main": "1", + "crawler_slave_no": "" +} \ No newline at end of file diff --git a/amazon_crawler_engine/test.py b/amazon_crawler_engine/test.py new file mode 100644 index 0000000..bd0f25a --- /dev/null +++ b/amazon_crawler_engine/test.py @@ -0,0 +1,44 @@ + +from selenium import webdriver +from selenium.webdriver.common.by import By +import time + +import ssl +ssl._create_default_https_context = ssl._create_unverified_context + +op = webdriver.ChromeOptions() +op.add_argument('--no-sandbox') +op.add_argument('--disable-notifications') +op.add_argument("--lang=en-GB") +#op.headless = True +driver=webdriver.Chrome( options=op) + + + +driver.get('https://www.noon.com/uae-en/beauty/') + +time.sleep(10) + +element = driver.find_element(By.CSS_SELECTOR, '.componentArea-9') + +title = element.find_element(By.CSS_SELECTOR, '.truncate-title-header').text +products = element.find_elements(By.CSS_SELECTOR, '.sc-kCMKrZ.ealOXE') + +urls = [] +for product in products: + url = product.find_element(By.TAG_NAME, 'a').get_attribute('href') + urls.append(url) + +data = { + "title": title, + "products": urls +} + +print(data) + +driver.close() + + + + + diff --git a/amazon_crawler_engine/test1.py b/amazon_crawler_engine/test1.py new file mode 100644 index 0000000..b1eb402 --- /dev/null +++ b/amazon_crawler_engine/test1.py @@ -0,0 +1,83 @@ +import hashlib +import logging +import sys +import string +import undetected_chromedriver as webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.service import Service +import psycopg2 +import bs4 +from webdriver_manager.chrome import ChromeDriverManager +import random +from bs4 import BeautifulSoup +import json +import time +import gzip +import re +import random +from amazon_db_writer import amazon_db_writer + +import ssl +ssl._create_default_https_context = 
ssl._create_unverified_context + + +def reseller_info(store_url): + + op = webdriver.ChromeOptions() + op.add_argument('--no-sandbox') + op.add_argument('--disable-notifications') + op.add_argument("--lang=en-GB") + #op.headless = True + driver=webdriver.Chrome( options=op) + + driver.get(store_url) + + driver.implicitly_wait(5) + + try: + driver.get(store_url) + driver.implicitly_wait(5) + + ##### reseller info + + avg_rating = driver.find_element(By.CSS_SELECTOR,'#effective-timeperiod-rating-year-description.ratings-reviews').text + + print(avg_rating) + + + + except Exception as e: + print(e) + +config = { + "crawler_name": "raena_crawler_enginer_amazon", + "crawler_schema": "raena_spider_management", + "category_tab": "rce_category", + "tracker_tab": "crawler_tracker", + "product_tab": "rce_product", + "variant_tab": "rce_product_variant", + "brand_tab": "rce_brand", + "reseller_tab": "rce_reseller", + "reseller_store_tab": "rce_reseller_store", + "review_tab": "rce_ratings_reviews", + "review_productmodels_tab": "rce_ratings_reviews_productmodels", + "review_producttags_tab": "rce_ratings_reviews_producttags", + "review_tags": "rce_tags", + "source_tab": "rce_source", + "product_per_category": "1000", + "source_category": "11043145", + "db_user": "postgres", + "db_pass": "postgres", + "database": "postgres", + "db_host": "localhost", + "db_port": "5444", + "crawler_main": "1", + "crawler_slave_no": "" +} +conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port')) +conn.autocommit = True +cur = conn.cursor() +db_writer = amazon_db_writer(config) + + +reseller_info('https://www.amazon.ae/sp?ie=UTF8&seller=A3TFGX22P341AN&isAmazonFulfilled=0&asin=B09BR31PF9&ref_=olp_merch_name_1') \ No newline at end of file diff --git a/amazon_crawler_engine/test2.py b/amazon_crawler_engine/test2.py new file mode 100644 index 0000000..0a2004f --- /dev/null +++ b/amazon_crawler_engine/test2.py @@ -0,0 +1,27 @@ +import hashlib + + +def insert_tracker_tab(objs): + + for obj in objs: + category = obj['catagory'] + key = obj['key'] + items = obj['value'] + for item in items: + product_page_url = item + product_page_url_hash = hashlib.md5(product_page_url.encode('utf-8')).hexdigest() + flag = 0 + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where product_page_url = '"+product_page_url+"'" + self.cur.execute(sql) + res = self.cur.fetchall() + + if not res: + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+"(crawler_name,category,keyword,product_page_url,product_page_url_hash,flag) values('"+str(self.crawler_name)+"','"+category+"','"+key+"','"+product_page_url+"','"+product_page_url_hash+"',"+str(flag)+")" + self.cur.execute(sql) + else: + print("Already Collected. 
Skipping......") + + +section_products = [{'catagory': 3, 'key': 'Hot new releases', 'value': ['https://www.amazon.ae/Ordinary-Glycolic-Acid-Toning-Solution/dp/B0CG149P34?ref_=Oct_d_onr_d_12149483031_0&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CG149P34', 'https://www.amazon.ae/Ordinary-Glycolic-Acid-Toning-Solution/dp/B0CG149P34?ref_=Oct_d_onr_d_12149483031_0&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CG149P34', 'https://www.amazon.ae/Roche-s%C3%A9rum-r%C3%A9novateur-%C3%A9clat-VITAMINE/dp/B0C9WW6F2M?ref_=Oct_d_onr_d_12149483031_1&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0C9WW6F2M', 'https://www.amazon.ae/Roche-s%C3%A9rum-r%C3%A9novateur-%C3%A9clat-VITAMINE/dp/B0C9WW6F2M?ref_=Oct_d_onr_d_12149483031_1&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0C9WW6F2M', 'https://www.amazon.ae/Eucerin-protection-touch-Gel-Cream-SPF50/dp/B0CBVNNSLS?ref_=Oct_d_onr_d_12149483031_2&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CBVNNSLS', 'https://www.amazon.ae/Eucerin-protection-touch-Gel-Cream-SPF50/dp/B0CBVNNSLS?ref_=Oct_d_onr_d_12149483031_2&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CBVNNSLS', 'https://www.amazon.ae/Cool-Divine-Essence-Exotic-Bloom/dp/B0CDCHQX1L?ref_=Oct_d_onr_d_12149483031_3&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CDCHQX1L', 'https://www.amazon.ae/Cool-Divine-Essence-Exotic-Bloom/dp/B0CDCHQX1L?ref_=Oct_d_onr_d_12149483031_3&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CDCHQX1L', 'https://www.amazon.ae/Paulas-Choice-Blackheads-Lines-SKIN-PERFECTING/dp/B0CDV575YT?ref_=Oct_d_onr_d_12149483031_4&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CDV575YT', 'https://www.amazon.ae/Paulas-Choice-Blackheads-Lines-SKIN-PERFECTING/dp/B0CDV575YT?ref_=Oct_d_onr_d_12149483031_4&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CDV575YT', 
'https://www.amazon.ae/Beauty-Joseeon-Relief-Probiotics-1-69fl-oz/dp/B0CGXQXK5B?ref_=Oct_d_onr_d_12149483031_5&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CGXQXK5B', 'https://www.amazon.ae/Beauty-Joseeon-Relief-Probiotics-1-69fl-oz/dp/B0CGXQXK5B?ref_=Oct_d_onr_d_12149483031_5&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CGXQXK5B', 'https://www.amazon.ae/COSRXX-Advanced-Snail-Cream-100g/dp/B0CGDXCCDQ?ref_=Oct_d_onr_d_12149483031_6&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CGDXCCDQ', 'https://www.amazon.ae/COSRXX-Advanced-Snail-Cream-100g/dp/B0CGDXCCDQ?ref_=Oct_d_onr_d_12149483031_6&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CGDXCCDQ', 'https://www.amazon.ae/Ecolyte-Premium-Hand-Liquid-Refill/dp/B0CCPCRW4D?ref_=Oct_d_onr_d_12149483031_7&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CCPCRW4D', 'https://www.amazon.ae/Ecolyte-Premium-Hand-Liquid-Refill/dp/B0CCPCRW4D?ref_=Oct_d_onr_d_12149483031_7&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CCPCRW4D', 'https://www.amazon.ae/Skin1004-Madagascar-Centella-Hyalu-Cica-Water-Fit/dp/B0BZGLNZQZ?ref_=Oct_d_onr_d_12149483031_8&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0BZGLNZQZ', 'https://www.amazon.ae/Skin1004-Madagascar-Centella-Hyalu-Cica-Water-Fit/dp/B0BZGLNZQZ?ref_=Oct_d_onr_d_12149483031_8&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0BZGLNZQZ', 'https://www.amazon.ae/Lip-Sleeping-Mask-laneige-3gr/dp/B0CG9LGH2Z?ref_=Oct_d_onr_d_12149483031_9&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CG9LGH2Z', 'https://www.amazon.ae/Lip-Sleeping-Mask-laneige-3gr/dp/B0CG9LGH2Z?ref_=Oct_d_onr_d_12149483031_9&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CG9LGH2Z', 
'https://www.amazon.ae/Paulas-Choice-Salicylic-Blackheads-Lines-30ml/dp/B0CBXPX5YS?ref_=Oct_d_onr_d_12149483031_10&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CBXPX5YS', 'https://www.amazon.ae/Paulas-Choice-Salicylic-Blackheads-Lines-30ml/dp/B0CBXPX5YS?ref_=Oct_d_onr_d_12149483031_10&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CBXPX5YS', 'https://www.amazon.ae/Roche-Ageing-Routine-Protection-Moisturiser/dp/B0CDLLPVLC?ref_=Oct_d_onr_d_12149483031_11&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CDLLPVLC', 'https://www.amazon.ae/Roche-Ageing-Routine-Protection-Moisturiser/dp/B0CDLLPVLC?ref_=Oct_d_onr_d_12149483031_11&pd_rd_w=Ex782&content-id=amzn1.sym.f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_p=f3b61736-36a6-4d09-982b-b2bcfe2a5d41&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0CDLLPVLC']}, {'catagory': 3, 'key': 'Most wished for', 'value': ['https://www.amazon.ae/Moisturizing-KASTWAVE-Softener-Pedicure-Treatment/dp/B09KGY1TX5?ref_=Oct_d_omwf_d_12149483031_0&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09KGY1TX5', 'https://www.amazon.ae/Moisturizing-KASTWAVE-Softener-Pedicure-Treatment/dp/B09KGY1TX5?ref_=Oct_d_omwf_d_12149483031_0&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09KGY1TX5', 'https://www.amazon.ae/QRxLabs-Glycolic-Salicylic-Allantoin-Calendula/dp/B07JR3QF3T?ref_=Oct_d_omwf_d_12149483031_1&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07JR3QF3T', 'https://www.amazon.ae/QRxLabs-Glycolic-Salicylic-Allantoin-Calendula/dp/B07JR3QF3T?ref_=Oct_d_omwf_d_12149483031_1&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07JR3QF3T', 'https://www.amazon.ae/Cosmo-Naturals-Facial-Massage-Cream/dp/B07QW3F6X6?ref_=Oct_d_omwf_d_12149483031_2&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07QW3F6X6', 'https://www.amazon.ae/Cosmo-Naturals-Facial-Massage-Cream/dp/B07QW3F6X6?ref_=Oct_d_omwf_d_12149483031_2&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07QW3F6X6', 
'https://www.amazon.ae/BANILA-Cleansing-Original-cleansing-millilitre/dp/B07ZCQ88WD?ref_=Oct_d_omwf_d_12149483031_3&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07ZCQ88WD', 'https://www.amazon.ae/BANILA-Cleansing-Original-cleansing-millilitre/dp/B07ZCQ88WD?ref_=Oct_d_omwf_d_12149483031_3&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07ZCQ88WD', 'https://www.amazon.ae/Lakme-Complexion-Care-Cream-Bronze/dp/B01BBNF6NK?ref_=Oct_d_omwf_d_12149483031_4&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B01BBNF6NK', 'https://www.amazon.ae/Lakme-Complexion-Care-Cream-Bronze/dp/B01BBNF6NK?ref_=Oct_d_omwf_d_12149483031_4&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B01BBNF6NK', 'https://www.amazon.ae/Laura-Mercier-Almond-Coconut-Cream/dp/B002HPUT70?ref_=Oct_d_omwf_d_12149483031_5&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B002HPUT70', 'https://www.amazon.ae/Laura-Mercier-Almond-Coconut-Cream/dp/B002HPUT70?ref_=Oct_d_omwf_d_12149483031_5&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B002HPUT70', 'https://www.amazon.ae/Laura-Mercier-Body-Bath-Coconut/dp/B000F5HADA?ref_=Oct_d_omwf_d_12149483031_6&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B000F5HADA', 'https://www.amazon.ae/Laura-Mercier-Body-Bath-Coconut/dp/B000F5HADA?ref_=Oct_d_omwf_d_12149483031_6&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B000F5HADA', 'https://www.amazon.ae/NEEDLY-Cicachid-Chilling-Soothing-Certified/dp/B09K3PL3K6?ref_=Oct_d_omwf_d_12149483031_7&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09K3PL3K6', 'https://www.amazon.ae/NEEDLY-Cicachid-Chilling-Soothing-Certified/dp/B09K3PL3K6?ref_=Oct_d_omwf_d_12149483031_7&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09K3PL3K6', 
'https://www.amazon.ae/Aquaphor-l-f-Repair-Protect-35floz/dp/B008SQSBJK?ref_=Oct_d_omwf_d_12149483031_8&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B008SQSBJK', 'https://www.amazon.ae/Aquaphor-l-f-Repair-Protect-35floz/dp/B008SQSBJK?ref_=Oct_d_omwf_d_12149483031_8&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B008SQSBJK', 'https://www.amazon.ae/l-f-Impurities-Brightens-Minimizes-Cruelty-Free/dp/B078SM5HH6?ref_=Oct_d_omwf_d_12149483031_9&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B078SM5HH6', 'https://www.amazon.ae/l-f-Impurities-Brightens-Minimizes-Cruelty-Free/dp/B078SM5HH6?ref_=Oct_d_omwf_d_12149483031_9&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B078SM5HH6', 'https://www.amazon.ae/Olay-Natural-White-Glowing-Fairness/dp/B009ZCE5JG?ref_=Oct_d_omwf_d_12149483031_10&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B009ZCE5JG', 'https://www.amazon.ae/Olay-Natural-White-Glowing-Fairness/dp/B009ZCE5JG?ref_=Oct_d_omwf_d_12149483031_10&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B009ZCE5JG', 'https://www.amazon.ae/Zayn-Myza-Hyaluronic-Brightness-Combination/dp/B0BFP55SND?ref_=Oct_d_omwf_d_12149483031_11&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0BFP55SND', 'https://www.amazon.ae/Zayn-Myza-Hyaluronic-Brightness-Combination/dp/B0BFP55SND?ref_=Oct_d_omwf_d_12149483031_11&pd_rd_w=wHJmR&content-id=amzn1.sym.a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_p=a2f2fe15-a2f8-4bbc-9c8f-69c0c572137e&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0BFP55SND']}, {'catagory': 3, 'key': 'Top rated', 'value': ['https://www.amazon.ae/COSRX-Advance-Snail-Mucin-Essence/dp/B00PBX3L7K?ref_=Oct_d_otopr_d_12149483031_0&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3L7K', 'https://www.amazon.ae/COSRX-Advance-Snail-Mucin-Essence/dp/B00PBX3L7K?ref_=Oct_d_otopr_d_12149483031_0&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3L7K', 
'https://www.amazon.ae/Beauty-Joseon-Relief-Sun-Probiotics/dp/B09JVNZVH3?ref_=Oct_d_otopr_d_12149483031_1&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09JVNZVH3', 'https://www.amazon.ae/Beauty-Joseon-Relief-Sun-Probiotics/dp/B09JVNZVH3?ref_=Oct_d_otopr_d_12149483031_1&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09JVNZVH3', 'https://www.amazon.ae/COSRX-RLE70245-Acne-Pimple-Patch/dp/B00PBX3TN6?ref_=Oct_d_otopr_d_12149483031_2&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3TN6', 'https://www.amazon.ae/COSRX-RLE70245-Acne-Pimple-Patch/dp/B00PBX3TN6?ref_=Oct_d_otopr_d_12149483031_2&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3TN6', 'https://www.amazon.ae/Himalaya-Purehands-Effectively-Protects-3x250ml/dp/B08LHSBG3R?ref_=Oct_d_otopr_d_12149483031_3&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08LHSBG3R', 'https://www.amazon.ae/Himalaya-Purehands-Effectively-Protects-3x250ml/dp/B08LHSBG3R?ref_=Oct_d_otopr_d_12149483031_3&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08LHSBG3R', 'https://www.amazon.ae/PanOxyl-Foaming-Peroxide-Strength-Antimicrobial/dp/B081KL2QYJ?ref_=Oct_d_otopr_d_12149483031_4&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B081KL2QYJ', 'https://www.amazon.ae/PanOxyl-Foaming-Peroxide-Strength-Antimicrobial/dp/B081KL2QYJ?ref_=Oct_d_otopr_d_12149483031_4&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B081KL2QYJ', 'https://www.amazon.ae/COSRX-Aloe-Soothing-Cream-50ml/dp/B00PBX3FLW?ref_=Oct_d_otopr_d_12149483031_5&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3FLW', 'https://www.amazon.ae/COSRX-Aloe-Soothing-Cream-50ml/dp/B00PBX3FLW?ref_=Oct_d_otopr_d_12149483031_5&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3FLW', 
'https://www.amazon.ae/CeraVe-Resurfacing-Brightening-Niacinamide-Non-Comedogenic/dp/B07VWSN95S?ref_=Oct_d_otopr_d_12149483031_6&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07VWSN95S', 'https://www.amazon.ae/CeraVe-Resurfacing-Brightening-Niacinamide-Non-Comedogenic/dp/B07VWSN95S?ref_=Oct_d_otopr_d_12149483031_6&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07VWSN95S', 'https://www.amazon.ae/Himalaya-Purehands-Liquid-Protect-Germs/dp/B08LHJ1BHY?ref_=Oct_d_otopr_d_12149483031_7&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08LHJ1BHY', 'https://www.amazon.ae/Himalaya-Purehands-Liquid-Protect-Germs/dp/B08LHJ1BHY?ref_=Oct_d_otopr_d_12149483031_7&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08LHJ1BHY', 'https://www.amazon.ae/Derma-Hyaluronic-Sunscreen-Spectrum-Protection/dp/B095CRM8NF?ref_=Oct_d_otopr_d_12149483031_8&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B095CRM8NF', 'https://www.amazon.ae/Derma-Hyaluronic-Sunscreen-Spectrum-Protection/dp/B095CRM8NF?ref_=Oct_d_otopr_d_12149483031_8&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B095CRM8NF', 'https://www.amazon.ae/CeraVe-Facial-Moisturizing-Moisturizer-Packaging/dp/B00F97FHAW?ref_=Oct_d_otopr_d_12149483031_9&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00F97FHAW', 'https://www.amazon.ae/CeraVe-Facial-Moisturizing-Moisturizer-Packaging/dp/B00F97FHAW?ref_=Oct_d_otopr_d_12149483031_9&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00F97FHAW', 'https://www.amazon.ae/TruSkin-Vitamin-Serum-Hyaluronic-Brightening/dp/B01M4MCUAF?ref_=Oct_d_otopr_d_12149483031_10&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B01M4MCUAF', 'https://www.amazon.ae/TruSkin-Vitamin-Serum-Hyaluronic-Brightening/dp/B01M4MCUAF?ref_=Oct_d_otopr_d_12149483031_10&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B01M4MCUAF', 
'https://www.amazon.ae/NEUtrogena-Face-Cream-Hydro-Boost/dp/B07881485K?ref_=Oct_d_otopr_d_12149483031_11&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07881485K', 'https://www.amazon.ae/NEUtrogena-Face-Cream-Hydro-Boost/dp/B07881485K?ref_=Oct_d_otopr_d_12149483031_11&pd_rd_w=7kq0j&content-id=amzn1.sym.ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_p=ae3a53d5-9ee8-4cd2-b16b-b8bd9696c810&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07881485K']}, {'catagory': 3, 'key': 'Most gifted', 'value': ['https://www.amazon.ae/innisfree-Green-Balancing-Lotion-5-41fl/dp/B07Q65ZGLD?ref_=Oct_d_omg_d_12149483031_0&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07Q65ZGLD', 'https://www.amazon.ae/innisfree-Green-Balancing-Lotion-5-41fl/dp/B07Q65ZGLD?ref_=Oct_d_omg_d_12149483031_0&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07Q65ZGLD', 'https://www.amazon.ae/Lux-Anti-Bacterial-Perfumed-Handwash-Refill/dp/B08WC13R29?ref_=Oct_d_omg_d_12149483031_1&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08WC13R29', 'https://www.amazon.ae/Lux-Anti-Bacterial-Perfumed-Handwash-Refill/dp/B08WC13R29?ref_=Oct_d_omg_d_12149483031_1&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08WC13R29', 'https://www.amazon.ae/Neutrogena-Free-Face-Moisture-Normal/dp/B006LXE5OC?ref_=Oct_d_omg_d_12149483031_2&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B006LXE5OC', 'https://www.amazon.ae/Neutrogena-Free-Face-Moisture-Normal/dp/B006LXE5OC?ref_=Oct_d_omg_d_12149483031_2&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B006LXE5OC', 'https://www.amazon.ae/Garnier-Cleanses-Brightens-Clarifies-SkinActive/dp/B07MVV66DH?ref_=Oct_d_omg_d_12149483031_3&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07MVV66DH', 'https://www.amazon.ae/Garnier-Cleanses-Brightens-Clarifies-SkinActive/dp/B07MVV66DH?ref_=Oct_d_omg_d_12149483031_3&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07MVV66DH', 
'https://www.amazon.ae/NIVEA-Moisturising-Cream-Soft-Refreshing/dp/B00ICO1NUC?ref_=Oct_d_omg_d_12149483031_4&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00ICO1NUC', 'https://www.amazon.ae/NIVEA-Moisturising-Cream-Soft-Refreshing/dp/B00ICO1NUC?ref_=Oct_d_omg_d_12149483031_4&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00ICO1NUC', 'https://www.amazon.ae/LOr%C3%A9al-Hyaluron-Expert-Replumping-Moisturizing/dp/B084N5FKSZ?ref_=Oct_d_omg_d_12149483031_5&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B084N5FKSZ', 'https://www.amazon.ae/LOr%C3%A9al-Hyaluron-Expert-Replumping-Moisturizing/dp/B084N5FKSZ?ref_=Oct_d_omg_d_12149483031_5&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B084N5FKSZ', 'https://www.amazon.ae/Neutrogena-Ultra-Sheer-Sunblock-Sunscreen/dp/B082PFY9S7?ref_=Oct_d_omg_d_12149483031_6&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B082PFY9S7', 'https://www.amazon.ae/Neutrogena-Ultra-Sheer-Sunblock-Sunscreen/dp/B082PFY9S7?ref_=Oct_d_omg_d_12149483031_6&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B082PFY9S7', 'https://www.amazon.ae/PONDS-Bright-Beauty-Cream-Night/dp/B09CDVCJDD?ref_=Oct_d_omg_d_12149483031_7&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09CDVCJDD', 'https://www.amazon.ae/PONDS-Bright-Beauty-Cream-Night/dp/B09CDVCJDD?ref_=Oct_d_omg_d_12149483031_7&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09CDVCJDD', 'https://www.amazon.ae/KANZA-Varicose-Chestnut-Phlebitis-Inflammation/dp/B09SQ6VR5G?ref_=Oct_d_omg_d_12149483031_8&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09SQ6VR5G', 'https://www.amazon.ae/KANZA-Varicose-Chestnut-Phlebitis-Inflammation/dp/B09SQ6VR5G?ref_=Oct_d_omg_d_12149483031_8&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09SQ6VR5G', 
'https://www.amazon.ae/Garnier-Wrinkle-Lift-Anti-Ageing-Cream/dp/B00791D32U?ref_=Oct_d_omg_d_12149483031_9&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00791D32U', 'https://www.amazon.ae/Garnier-Wrinkle-Lift-Anti-Ageing-Cream/dp/B00791D32U?ref_=Oct_d_omg_d_12149483031_9&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00791D32U', 'https://www.amazon.ae/Filorga-Flgorga-Nctf-Intensive-Rejuvenating/dp/B01MUA657E?ref_=Oct_d_omg_d_12149483031_10&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B01MUA657E', 'https://www.amazon.ae/Filorga-Flgorga-Nctf-Intensive-Rejuvenating/dp/B01MUA657E?ref_=Oct_d_omg_d_12149483031_10&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B01MUA657E', 'https://www.amazon.ae/Vichy-LiftActiv-Retinol-Concentrate-Serum/dp/B00FNYVQ1A?ref_=Oct_d_omg_d_12149483031_11&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00FNYVQ1A', 'https://www.amazon.ae/Vichy-LiftActiv-Retinol-Concentrate-Serum/dp/B00FNYVQ1A?ref_=Oct_d_omg_d_12149483031_11&pd_rd_w=RyEbJ&content-id=amzn1.sym.d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_p=d27dd2aa-8a5f-4543-a2d5-d3465e839c9d&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00FNYVQ1A']}, {'catagory': 3, 'key': 'Recommended for you', 'value': ['https://www.amazon.ae/Neutrogena-Moisturizer-Water-Normal-Combination/dp/B07MTX97N7?ref_=Oct_d_orecs_d_12149483031_0&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07MTX97N7', 'https://www.amazon.ae/Neutrogena-Moisturizer-Water-Normal-Combination/dp/B07MTX97N7?ref_=Oct_d_orecs_d_12149483031_0&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07MTX97N7', 'https://www.amazon.ae/Beauty-Joseon-Serum-Propolis-Niacinamide/dp/B086VKZZZY?ref_=Oct_d_orecs_d_12149483031_1&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B086VKZZZY', 'https://www.amazon.ae/Beauty-Joseon-Serum-Propolis-Niacinamide/dp/B086VKZZZY?ref_=Oct_d_orecs_d_12149483031_1&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B086VKZZZY', 
'https://www.amazon.ae/Vaseline-Care-Cocoa-Radiant-Body/dp/B0059MUJR8?ref_=Oct_d_orecs_d_12149483031_2&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0059MUJR8', 'https://www.amazon.ae/Vaseline-Care-Cocoa-Radiant-Body/dp/B0059MUJR8?ref_=Oct_d_orecs_d_12149483031_2&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0059MUJR8', 'https://www.amazon.ae/Dettol-Skincare-Liquid-Blossom-Fragrance/dp/B07NF7YQTM?ref_=Oct_d_orecs_d_12149483031_3&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07NF7YQTM', 'https://www.amazon.ae/Dettol-Skincare-Liquid-Blossom-Fragrance/dp/B07NF7YQTM?ref_=Oct_d_orecs_d_12149483031_3&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07NF7YQTM', 'https://www.amazon.ae/Ordinary-Niacinamide-Zinc-Serum-Face/dp/B08BJLFM2W?ref_=Oct_d_orecs_d_12149483031_4&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08BJLFM2W', 'https://www.amazon.ae/Ordinary-Niacinamide-Zinc-Serum-Face/dp/B08BJLFM2W?ref_=Oct_d_orecs_d_12149483031_4&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08BJLFM2W', 'https://www.amazon.ae/CeraVe-Salicylic-Hyaluronic-Niacinamide-Ceramides/dp/B00U1YCRD8?ref_=Oct_d_orecs_d_12149483031_5&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00U1YCRD8', 'https://www.amazon.ae/CeraVe-Salicylic-Hyaluronic-Niacinamide-Ceramides/dp/B00U1YCRD8?ref_=Oct_d_orecs_d_12149483031_5&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00U1YCRD8', 'https://www.amazon.ae/COSRX-Salicylic-Daily-Gentle-Cleanser/dp/B06XHLGL6N?ref_=Oct_d_orecs_d_12149483031_6&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B06XHLGL6N', 'https://www.amazon.ae/COSRX-Salicylic-Daily-Gentle-Cleanser/dp/B06XHLGL6N?ref_=Oct_d_orecs_d_12149483031_6&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B06XHLGL6N', 
'https://www.amazon.ae/PanOxyl-Antimicrobial-Creamy-Benzoyl-Peroxide/dp/B081KLNHSG?ref_=Oct_d_orecs_d_12149483031_7&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B081KLNHSG', 'https://www.amazon.ae/PanOxyl-Antimicrobial-Creamy-Benzoyl-Peroxide/dp/B081KLNHSG?ref_=Oct_d_orecs_d_12149483031_7&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B081KLNHSG', 'https://www.amazon.ae/Ordinary-Niacinamide-irritation-blemish-formula/dp/B07DP5QV9H?ref_=Oct_d_orecs_d_12149483031_8&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07DP5QV9H', 'https://www.amazon.ae/Ordinary-Niacinamide-irritation-blemish-formula/dp/B07DP5QV9H?ref_=Oct_d_orecs_d_12149483031_8&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07DP5QV9H', 'https://www.amazon.ae/Garnier-Anti-Dark-Brighter-Niacinamide-SkinActive/dp/B0979YYXQ2?ref_=Oct_d_orecs_d_12149483031_9&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0979YYXQ2', 'https://www.amazon.ae/Garnier-Anti-Dark-Brighter-Niacinamide-SkinActive/dp/B0979YYXQ2?ref_=Oct_d_orecs_d_12149483031_9&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0979YYXQ2', 'https://www.amazon.ae/Beauty-Joseon-Matte-sun-stick/dp/B0BQVTM1BJ?ref_=Oct_d_orecs_d_12149483031_10&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0BQVTM1BJ', 'https://www.amazon.ae/Beauty-Joseon-Matte-sun-stick/dp/B0BQVTM1BJ?ref_=Oct_d_orecs_d_12149483031_10&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B0BQVTM1BJ', 'https://www.amazon.ae/Minimalist-Niacinamide-Blemishes-Balancing-Clarifying/dp/B08F9MF314?ref_=Oct_d_orecs_d_12149483031_11&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08F9MF314', 'https://www.amazon.ae/Minimalist-Niacinamide-Blemishes-Balancing-Clarifying/dp/B08F9MF314?ref_=Oct_d_orecs_d_12149483031_11&pd_rd_w=4P4Tf&content-id=amzn1.sym.fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_p=fd270888-76d8-4e7b-a7e0-178479d09adf&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08F9MF314']}, {'catagory': 3, 'key': 'Best Seller', 'value': 
['https://www.amazon.ae/COSRX-Advance-Snail-Mucin-Essence/dp/B00PBX3L7K?ref_=Oct_d_obs_d_12149483031_0&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3L7K', 'https://www.amazon.ae/COSRX-Advance-Snail-Mucin-Essence/dp/B00PBX3L7K?ref_=Oct_d_obs_d_12149483031_0&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3L7K', 'https://www.amazon.ae/Beauty-Joseon-Relief-Sun-Probiotics/dp/B09JVNZVH3?ref_=Oct_d_obs_d_12149483031_1&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09JVNZVH3', 'https://www.amazon.ae/Beauty-Joseon-Relief-Sun-Probiotics/dp/B09JVNZVH3?ref_=Oct_d_obs_d_12149483031_1&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B09JVNZVH3', 'https://www.amazon.ae/COSRX-RLE70245-Acne-Pimple-Patch/dp/B00PBX3TN6?ref_=Oct_d_obs_d_12149483031_2&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3TN6', 'https://www.amazon.ae/COSRX-RLE70245-Acne-Pimple-Patch/dp/B00PBX3TN6?ref_=Oct_d_obs_d_12149483031_2&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3TN6', 'https://www.amazon.ae/COSRX-Aloe-Soothing-Cream-50ml/dp/B00PBX3FLW?ref_=Oct_d_obs_d_12149483031_3&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3FLW', 'https://www.amazon.ae/COSRX-Aloe-Soothing-Cream-50ml/dp/B00PBX3FLW?ref_=Oct_d_obs_d_12149483031_3&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00PBX3FLW', 'https://www.amazon.ae/Himalaya-Purehands-Effectively-Protects-3x250ml/dp/B08LHSBG3R?ref_=Oct_d_obs_d_12149483031_4&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08LHSBG3R', 'https://www.amazon.ae/Himalaya-Purehands-Effectively-Protects-3x250ml/dp/B08LHSBG3R?ref_=Oct_d_obs_d_12149483031_4&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08LHSBG3R', 
'https://www.amazon.ae/Neutrogena-Moisturizer-Water-Normal-Combination/dp/B07MTX97N7?ref_=Oct_d_obs_d_12149483031_5&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07MTX97N7', 'https://www.amazon.ae/Neutrogena-Moisturizer-Water-Normal-Combination/dp/B07MTX97N7?ref_=Oct_d_obs_d_12149483031_5&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07MTX97N7', 'https://www.amazon.ae/PanOxyl-Foaming-Peroxide-Strength-Antimicrobial/dp/B081KL2QYJ?ref_=Oct_d_obs_d_12149483031_6&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B081KL2QYJ', 'https://www.amazon.ae/PanOxyl-Foaming-Peroxide-Strength-Antimicrobial/dp/B081KL2QYJ?ref_=Oct_d_obs_d_12149483031_6&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B081KL2QYJ', 'https://www.amazon.ae/CeraVe-Facial-Moisturizing-Moisturizer-Packaging/dp/B00F97FHAW?ref_=Oct_d_obs_d_12149483031_7&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00F97FHAW', 'https://www.amazon.ae/CeraVe-Facial-Moisturizing-Moisturizer-Packaging/dp/B00F97FHAW?ref_=Oct_d_obs_d_12149483031_7&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B00F97FHAW', 'https://www.amazon.ae/Ordinary-Niacinamide-Zinc-Serum-Face/dp/B08BJLFM2W?ref_=Oct_d_obs_d_12149483031_8&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08BJLFM2W', 'https://www.amazon.ae/Ordinary-Niacinamide-Zinc-Serum-Face/dp/B08BJLFM2W?ref_=Oct_d_obs_d_12149483031_8&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B08BJLFM2W', 'https://www.amazon.ae/CeraVe-Resurfacing-Brightening-Niacinamide-Non-Comedogenic/dp/B07VWSN95S?ref_=Oct_d_obs_d_12149483031_9&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07VWSN95S', 'https://www.amazon.ae/CeraVe-Resurfacing-Brightening-Niacinamide-Non-Comedogenic/dp/B07VWSN95S?ref_=Oct_d_obs_d_12149483031_9&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B07VWSN95S', 
'https://www.amazon.ae/TruSkin-Vitamin-Serum-Hyaluronic-Brightening/dp/B01M4MCUAF?ref_=Oct_d_obs_d_12149483031_10&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B01M4MCUAF', 'https://www.amazon.ae/TruSkin-Vitamin-Serum-Hyaluronic-Brightening/dp/B01M4MCUAF?ref_=Oct_d_obs_d_12149483031_10&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B01M4MCUAF', 'https://www.amazon.ae/Derma-Hyaluronic-Sunscreen-Spectrum-Protection/dp/B095CRM8NF?ref_=Oct_d_obs_d_12149483031_11&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B095CRM8NF', 'https://www.amazon.ae/Derma-Hyaluronic-Sunscreen-Spectrum-Protection/dp/B095CRM8NF?ref_=Oct_d_obs_d_12149483031_11&pd_rd_w=mxyW8&content-id=amzn1.sym.cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_p=cc1c6701-2ff2-4094-a4fb-df5f0b147a98&pf_rd_r=65Q4BM6NAVRAFDTAT0C4&pd_rd_wg=YA5Iw&pd_rd_r=fc776482-09be-42af-853d-b8ac4e5ca90d&pd_rd_i=B095CRM8NF']}] +insert_tracker_tab(section_products) \ No newline at end of file diff --git a/amazon_crawler_engine/test_db.py b/amazon_crawler_engine/test_db.py new file mode 100644 index 0000000..aa986ee --- /dev/null +++ b/amazon_crawler_engine/test_db.py @@ -0,0 +1,77 @@ +import hashlib +from amazon_db_writer import amazon_db_writer + +config = { + "crawler_name": "raena_crawler_enginer_amazon", + "crawler_schema": "raena_spider_management", + "category_tab": "rce_category", + "tracker_tab": "crawler_tracker", + "product_tab": "rce_product", + "variant_tab": "rce_product_variant", + "brand_tab": "rce_brand", + "reseller_tab": "rce_reseller", + "reseller_store_tab": "rce_reseller_store", + "review_tab": "rce_ratings_reviews", + "review_productmodels_tab": "rce_ratings_reviews_productmodels", + "review_producttags_tab": "rce_ratings_reviews_producttags", + "review_tags": "rce_tags", + "source_tab": "rce_source", + "product_per_category": "1000", + "source_category": "11043145", + "db_user": "postgres", + "db_pass": "postgres", + "database": "postgres", + "db_host": "localhost", + "db_port": "5444", + "crawler_main": "1", + "crawler_slave_no": "" +} + +db_writer = amazon_db_writer(config) + +data_product = {} + +data_product['rce_source_product_id'] = 0 +data_product['rce_source_id'] = 1 +data_product['rce_source_product_status'] = 1 +data_product['product_page_url'] = 'https://www.amazon.ae/Davidoff-Water-Perfume-Toilette-110ML/dp/B002S8PT8U/?_encoding=UTF8&pd_rd_w=VQ6dh&content-id=amzn1.sym.baa1fbbd-9373-444b-8104-61fa134741c5%3Aamzn1.symc.36bd837a-d66d-47d1-8457-ffe9a9f3ddab&pf_rd_p=baa1fbbd-9373-444b-8104-61fa134741c5&pf_rd_r=6EKKA9QC40Y5MFKGRWYQ&pd_rd_wg=nsmjm&pd_rd_r=6d02ccd2-297c-4b73-8586-a9ac9b355d4a&ref_=pd_gw_ci_mcx_mr_hp_atf_m' +data_product['product_page_url_hash'] = 'bjhgfds867ty3iuhbfew' +data_product['rce_category_id'] = 3 +data_product['rce_brand_id'] = 2 +data_product['rce_store_id'] = 6 +data_product['rce_source_product_name'] = "Hot Water by Davidoff for Men" +data_product['product_images'] = "" +data_product['product_description'] = "Davidoff Hot Water hits you first with it’s fresh spicy aroma owing to the vegetal top notes of wormwood 
and basil. While the o" +data_product['product_sold_total'] = 0 +data_product['product_sold'] = 0 +data_product['product_price_min'] = "99.00" +data_product['product_price_min_before_discount'] ="340.00" +data_product['product_price_max'] = "99.00" +data_product['product_price_max_before_discount'] = "340.00" +data_product['ratings'] = 4.1 +data_product['product_section'] = "Fragrance" + +data_variant = {} + +data_variant['rce_source_variant_id'] = 0 +data_variant['rce_product_id'] = 2 +data_variant['product_variant_name'] = "abc" +data_variant['product_variant_price'] = "67.3" +data_variant['product_variant_price_before_discount'] = "100.90" +data_variant['product_variant_stock'] = 0 + + +data_review = {} + +data_review["id"] = 1 +data_review["rce_product_id"] = 5 +data_review["username"] = "adnan" +data_review["review"] = "very good product" +data_review["img_url"] = "" +data_review["review_like_count"] = 0 +data_review["user_tier"] = "" +data_review["shop_id"] = 2 +data_review["video_url"] = "" +data_review["rating"] = "4.9" + +db_writer.rce_ratings_reviews(data_review) diff --git a/fb_group_member_extraction/Readme.md b/fb_group_member_extraction/Readme.md new file mode 100644 index 0000000..0d81235 --- /dev/null +++ b/fb_group_member_extraction/Readme.md @@ -0,0 +1,9 @@ +1. Log into Facebook and go to the group from which you want to export the members. + +2. Navigate to the “Members“ tab. + +3. Open the Developer console on chrome and paste the code from "chrome_group_export". + +4. Paste the code from "chrome_auto_scroll" to the auto-scroll page. + +5. Download and save the file once the limit (10K) is reached. \ No newline at end of file diff --git a/fb_group_member_extraction/chrome_auto_scroll b/fb_group_member_extraction/chrome_auto_scroll new file mode 100644 index 0000000..25cde4d --- /dev/null +++ b/fb_group_member_extraction/chrome_auto_scroll @@ -0,0 +1,37 @@ +(function() { + var intervalObj = null; + var retry = 0; + var clickHandler = function() { + console.log("Clicked; stopping autoscroll"); + clearInterval(intervalObj); + document.body.removeEventListener("click", clickHandler); + } + function scrollDown() { + var scrollHeight = document.body.scrollHeight, + scrollTop = document.body.scrollTop, + innerHeight = window.innerHeight, + difference = (scrollHeight - scrollTop) - innerHeight + + if (difference > 0) { + window.scrollBy(0, difference); + if (retry > 0) { + retry = 0; + } + console.log("scrolling down more"); + } else { + if (retry >= 3) { + console.log("reached bottom of page; stopping"); + clearInterval(intervalObj); + document.body.removeEventListener("click", clickHandler); + } else { + console.log("[apparenty] hit bottom of page; retrying: " + (retry + 1)); + retry++; + } + } + } + + document.body.addEventListener("click", clickHandler); + + intervalObj = setInterval(scrollDown, 1000); + +})() \ No newline at end of file diff --git a/fb_group_member_extraction/chrome_group_export b/fb_group_member_extraction/chrome_group_export new file mode 100644 index 0000000..2d3279c --- /dev/null +++ b/fb_group_member_extraction/chrome_group_export @@ -0,0 +1 @@ +function exportToCsv(e,t){for(var n="",o=0;o div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1)').find_elements(By.CSS_SELECTOR,'.sc-kCMKrZ.ealOXE') + + urls = [] + for element in elements: + link = element.find_element(By.TAG_NAME, 
'a').get_attribute('href') + urls.append(link) + + result = { + "catagory": '3184', + "key": "Bestsellers", + "value": urls + } + results.append(result) + + # New arrivals + + elements = driver.find_element(By.CSS_SELECTOR, '.componentArea-18 > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1)').find_elements(By.CSS_SELECTOR,'.swiper-slide') + urls = [] + for element in elements: + link = element.find_element(By.TAG_NAME, 'a').get_attribute('href') + urls.append(link) + + result = { + "catagory": '3184', + "key": "New arrivals", + "value": urls + } + results.append(result) + + # Clearance deals + + elements = driver.find_element(By.CSS_SELECTOR, '.componentArea-21 > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1)').find_elements(By.CSS_SELECTOR,'.swiper-slide') + + urls = [] + for element in elements: + link = element.find_element(By.TAG_NAME, 'a').get_attribute('href') + urls.append(link.replace("'","")) + + result = { + "catagory": '3184', + "key": "Clearance deals", + "value": urls + } + results.append(result) + + print(results) + + return results + + def insert_tracker_tab(self, objs): + + for obj in objs: + category = str(obj['catagory']) + key = str(obj['key']) + items = obj['value'] + for item in items: + product_page_url = item + product_page_url_hash = hashlib.md5(product_page_url.encode('utf-8')).hexdigest() + flag = 0 + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where product_page_url = '"+product_page_url+"'" + self.cur.execute(sql) + res = self.cur.fetchall() + + if not res: + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+"(crawler_name,category,keyword,product_page_url,product_page_url_hash,flag) values('"+str(self.crawler_name)+"','"+str(category)+"','"+str(key)+"','"+product_page_url+"','"+product_page_url_hash+"',"+str(flag)+")" + print(sql) + self.cur.execute(sql) + + def base_products(self, driver, catagory): + + try: + for i in range(1,16): + + smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True) + + + # ############## + # SCROLL_PAUSE_TIME = 0.5 + # + # # Get scroll height + # last_height = driver.execute_script("return document.body.scrollHeight") + # + # while True: + # # Scroll down to bottom + # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + # + # # Wait to load page + # time.sleep(SCROLL_PAUSE_TIME) + # + # # Calculate new scroll height and compare with last scroll height + # new_height = driver.execute_script("return document.body.scrollHeight") + # if new_height == last_height: + # break + # last_height = new_height + # ############# + + items = driver.find_element(By.CSS_SELECTOR, '.sc-810b5658-7.upghB.grid').find_elements(By.CSS_SELECTOR,'.sc-ff3f80d5-0.iBVDAS.wrapper.productContainer') + + #smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True) + + + urls = [] + for item in items: + url = item.find_element(By.TAG_NAME, 'a').get_attribute('href') + urls.append(url) + + result = [{ + "catagory": catagory, + "key": "Base Product Page {}".format(str(i)), + "value": urls + }] + + self.insert_tracker_tab(result) + + try: + driver.find_elements(By.CSS_SELECTOR, 
'.arrowLink')[1].click() + html = driver.find_element(By.TAG_NAME, 'html') + html.send_keys(Keys.HOME) + driver.implicitly_wait(5) + except: + logging.info("No more page to navigate......") + break + + + + except Exception as e: + print(e) + pass + + + + + +config = { + "crawler_name": "raena_crawler_enginer_noon", + "crawler_schema": "raena_spider_management", + "category_tab": "rce_category", + "tracker_tab": "crawler_tracker_noon", + "product_tab": "rce_product", + "variant_tab": "rce_product_variant", + "brand_tab": "rce_brand", + "reseller_tab": "rce_reseller", + "reseller_store_tab": "rce_reseller_store", + "review_tab": "rce_ratings_reviews", + "review_productmodels_tab": "rce_ratings_reviews_productmodels", + "review_producttags_tab": "rce_ratings_reviews_producttags", + "review_tags": "rce_tags", + "source_tab": "rce_source", + "product_per_category": "1000", + "source_category": "11043145", + "db_user": "dbadmin", + "db_pass": "5qCif6eyY3Kmg4z", + "database": "analytics", + "db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com", + "db_port": "5432", + "crawler_main": "1", + "crawler_slave_no": "" +} +noon_category_products = noon_category_products(config) +noon_category_products.start_processing() \ No newline at end of file diff --git a/noon_crawler_engine/noon_crawler.py b/noon_crawler_engine/noon_crawler.py new file mode 100644 index 0000000..4020d70 --- /dev/null +++ b/noon_crawler_engine/noon_crawler.py @@ -0,0 +1,115 @@ +import logging +import psycopg2 +import json +from datetime import datetime +import smtplib +from email.message import EmailMessage + +import requests + +from noon_products import noon_products + + +##### Looger ###### +format = "%(asctime)s: %(message)s" +logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") + +config = {} + +def slack_notification(message): + webhook_url = "https://hooks.slack.com/services/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" + slack_data = {"text": "Issue occurred on Noon Crawler. Error: " + str(message)} + + response = requests.post( + webhook_url, data=json.dumps(slack_data), + headers={"Content-Type": "application/json"} + ) + + if response.status_code != 200: + raise ValueError( + f"Request to Slack returned an error {response.status_code}, {response.text}" + ) + +def send_mail(): + + try: + EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5" + EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh" + From = 'data_reporting@raenabeauty.com' + To = 'shariar@raenabeauty.com' + #To = 'shariar@raenabeauty.com' + + html = f''' + + + +
+ <html>
+     <body>
+         <h3>Amazon Crawler Status</h3>
+         <p>Error occurred. Please check Amazon Pipeline.</p>
+         <p>This is system generated mail. Please do not reply</p>
+     </body>
+ </html>
+ + + ''' + + msg = EmailMessage() + msg['Subject'] = 'Amazon Crawler Status' + msg['From'] = From + msg['To'] = To + msg.set_content(html, subtype='html') + + + with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp: + smtp.ehlo() + smtp.starttls() + smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD) + smtp.send_message(msg) + except Exception as e: + logging.info("Error while sending mail: {}".format(e)) +def main(): + # start = datetime.now() + # categories = amazon_categories(config) + # categories.start_processing() + # end = datetime.now() + # logging.info('Total time taken to fetch the categories: {}'.format(str(end-start))) + # + # start = datetime.now() + # products = amazon_category_products(config) + # products.start_processing() + # end = datetime.now() + # logging.info('Total time taken to fetch the category products: {}'.format(str(end-start))) + + + product_info = noon_products(config) + product_info.start_processing() + + # ###### For test + # item = (100, 'raena_crawler_enginer_amazon', '3066', 'Up to 25 AED', 'https://www.amazon.ae/Ross-Massager-Shampoo-Silicone-Bristles/dp/B09JGH1WM3?ref_=Oct_d_oup_d_12149480031_0&pd_rd_w=lfMTW&content-id=amzn1.sym.d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_p=d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_r=C1QM2XCSJDBVMS27JV7E&pd_rd_wg=gkRZv&pd_rd_r=f5af13ee-c6c4-4d8a-8677-cba9cbacdace&pd_rd_i=B09JGH1WM3', '8f0540b5919e176303cf24a1d46b0e1c', 0) + # product_info.get_product_info(item) + + +if __name__ == "__main__": + logging.info("Starting Shopee Crawler.......") + try: + logging.info("Loading config file.......") + with open("conf.json", "r") as jsonfile: + config = json.load(jsonfile) + logging.info("Config file loaded.......") + print(config) + + main() + + #raise Exception("Sorry, no numbers below zero") + + except Exception as e: + logging.info("Error: ".format(e)) + #logging.info("Cannot load config file. Please check. 
Exiting......") + #send_mail() + slack_notification(e) + exit(1) \ No newline at end of file diff --git a/noon_crawler_engine/noon_db_writer.py b/noon_crawler_engine/noon_db_writer.py new file mode 100755 index 0000000..260d0f0 --- /dev/null +++ b/noon_crawler_engine/noon_db_writer.py @@ -0,0 +1,590 @@ +import logging +import psycopg2 + +###### Looger ###### +format = "%(asctime)s: %(message)s" +logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") + +class noon_db_writer: + def __init__(self, config): + self.config = config + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + + def __del__(self): + logging.info("Closing connection.....") + self.conn.close() + + def rce_category(self, data): + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where category_name = '"+str(data['category_name'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + cat_name = data['category_name'].replace("'","''") + cat_url = data['category_page_url'].replace("'","''") + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" (parent_category_id,rce_source_id," \ + "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name) values (" \ + +str(data['parent_category_id'])+","+str(data['rce_source_id'])+", "+str(data['rce_source_category_id'])+", "+str(data['rce_source_status'])+", " \ + "'"+str(cat_url)+"', '"+str(data['category_page_url_hash'])+"', '"+str(cat_name)+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \ + "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \ + "select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \ + "category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \ + "where rce_source_category_id = "+ str(data['rce_source_category_id']) + #logging.info(sql) + + self.cur.execute(sql) + + else: + if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \ + str(data['category_page_url'])==str(res[5]): + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set updatedat=now() " \ + "where category_name = '"+ str(res[7])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set parent_category_id = " \ + ""+str(data['parent_category_id'])+", rce_source_category_id = "+str(data['rce_source_category_id'])+", " \ + "category_name='"+str(cat_name)+"', category_page_url='"+str(cat_url)+"', " \ + "category_page_url_hash='"+str(data['category_page_url_hash'])+"', 
updatedat=now() where " \ + "category_name = '"+ str(res[7])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \ + "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \ + "select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \ + "category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \ + "where category_name = '"+ str(res[7])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_product(self, data): + + data['product_page_url'] = data['product_page_url'].replace("'","") + data['rce_source_product_name'] = data['rce_source_product_name'].replace("'","") + data['product_description'] = data['product_description'].replace("'","") + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url = '"+str(data['product_page_url'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" (rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "product_section,rce_source_id) values("+str(data['rce_source_product_id'])+","+str(data['rce_source_product_status'])+",'"+str(data['product_page_url'])+"'," \ + "'"+str(data['product_page_url_hash'])+"',"+str(data['rce_category_id'])+","+str(data['rce_brand_id'])+","+str(data['rce_store_id'])+"," \ + "'"+str(data['rce_source_product_name'])+"','"+str(data['product_images'])+"','"+str(data['product_description'])+"',"+str(data['product_sold_total'])+"," \ + ""+str(data['product_sold'])+",'"+str(data['product_price_min'])+"','"+str(data['product_price_min_before_discount'])+"','"+str(data['product_price_max'])+"'," \ + "'"+str(data['product_price_max_before_discount'])+"','"+str(data['ratings'])+"','"+str(data['product_section'])+"',"+str(data['rce_source_id'])+")" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \ + 
"product_page_url='"+str(data['product_page_url'])+"'" + #logging.info(sql) + self.cur.execute(sql) + else: + + if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \ + str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \ + str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \ + str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \ + str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \ + str(data['product_price_max'])==str(res[15]) \ + and str(data['product_price_max_before_discount'])==str(res[16]) \ + and str(data['ratings'])==str(res[17]) and str(data['rce_source_id'])==str(res[21]) and \ + str(data['product_section'])==str(res[22]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set updatedat=now() " \ + "where product_page_url = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set rce_source_product_id="+str(data['rce_source_product_id'])+"," \ + "rce_source_product_status="+str(data['rce_source_product_status'])+",product_page_url='"+str(data['product_page_url'])+"',product_page_url_hash= " \ + "'"+str(data['product_page_url_hash'])+"',rce_category_id="+str(data['rce_category_id'])+",rce_brand_id="+str(data['rce_brand_id'])+"," \ + "rce_store_id="+str(data['rce_store_id'])+",rce_source_product_name='"+str(data['rce_source_product_name'])+"',product_images='"+str(data['product_images'])+"'" \ + ",product_description='"+str(data['product_description'])+"',product_sold_total="+str(data['product_sold_total'])+",product_sold="+str(data['product_sold'])+"," \ + "product_price_min='"+str(data['product_price_min'])+"',product_price_min_before_discount='"+str(data['product_price_min_before_discount'])+"'," \ + "product_price_max='"+str(data['product_price_max'])+"',product_price_max_before_discount='"+str(data['product_price_max_before_discount'])+"',ratings='"+str(data['ratings'])+"'," \ + "product_section='"+str(data['product_section'])+"', updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where product_page_url = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + 
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \ + "product_page_url='"+str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + + def rce_product_variant(self, data): + data['product_variant_name'] = data['product_variant_name'].replace("'","''") + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name = '"+str(data['product_variant_name'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" (rce_source_variant_id,rce_product_id," \ + "product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock) values("+str(data['rce_source_variant_id'])+"," \ + ""+str(data['rce_product_id'])+",'"+str(data['product_variant_name'])+"','"+str(data['product_variant_price'])+"'," \ + "'"+str(data['product_variant_price_before_discount'])+"',"+str(data['product_variant_stock'])+")" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \ + "product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(data['product_variant_name'])+"'" + #logging.info(sql) + self.cur.execute(sql) + + else: + if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \ + str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set updatedat=now() " \ + "where product_variant_name = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set rce_source_variant_id="+str(data['rce_source_variant_id'])+", " \ + "rce_product_id="+str(data['rce_product_id'])+", product_variant_name='"+str(data['product_variant_name'])+"', product_variant_price=" \ + "'"+str(data['product_variant_price'])+"',product_variant_price_before_discount='"+str(data['product_variant_price_before_discount'])+"'," \ + "product_variant_stock="+str(data['product_variant_stock'])+", updatedat=now() where product_variant_name = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \ + 
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(res[3])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + + def rce_brand(self, data): + data['brand_page_url'] = data['brand_page_url'].replace("'","''") + data['brand_name'] = data['brand_name'].replace("'","''") + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url = '"+str(data['brand_page_url'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + if not res: + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" (rce_source_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name) values("+str(data['rce_source_id'])+"," \ + ""+str(data['rce_source_brand_status'])+",'"+str(data['brand_page_url'])+"','"+str(data['brand_page_url_hash'])+"'," \ + "'"+str(data['brand_name'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(data['brand_page_url'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \ + str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set updatedat=now() " \ + "where brand_page_url = '"+ str(res[4])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \ + "rce_source_brand_status="+str(data['rce_source_brand_status'])+", brand_page_url='"+str(data['brand_page_url'])+"', brand_page_url_hash=" \ + "'"+str(data['brand_page_url_hash'])+"',brand_name='"+str(data['brand_name'])+"', updatedat=now() where brand_page_url = '"+ str(res[4])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status, " \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(res[4])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_reseller(self, data): + data['reseller_name'] = data['reseller_name'].replace("'","''") + + sql = "select * from 
"+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" (rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description) values("+str(data['rce_source_id'])+"," \ + ""+str(data['rce_source_reseller_status'])+",'"+str(data['reseller_name'])+"','"+str(data['reseller_average_rating'])+"'," \ + "'"+str(data['reseller_description'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(data['reseller_name'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if data['rce_source_id']==res[1] and str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]) and \ + str(data['reseller_average_rating'])==str(res[5]) and str(data['reseller_description'])==str(res[6]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=now() " \ + "where reseller_name = '"+ str(res[4])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \ + "rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \ + "'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=now() where reseller_name = '"+ str(res[4])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_reseller_store(self, data): + + data['store_page_url'] = data['store_page_url'].replace("'","''") + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" (rce_source_store_status," \ + 
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,rce_source_id) values(" \ + ""+str(data['rce_source_store_status'])+",'"+str(data['store_page_url'])+"','"+str(data['store_page_url_hash'])+"'," \ + "'"+str(data['store_location'])+"', "+str(data['rce_reseller_id'])+", "+str(data['rce_source_id'])+")" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(data['store_page_url'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \ + str(data['store_page_url_hash'])==str(res[4]) and str(data['store_location'])==str(res[5]) and \ + str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=now() " \ + "where store_page_url = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \ + "rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \ + "'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \ + "updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_ratings_reviews(self, data): + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id = "+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + data['username'] = data['username'].replace("'","''") + data['img_url'] = data['img_url'].replace("'","''") + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" (id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating) 
values("+str(data['id'])+","+str(data['rce_product_id'])+"," \ + "'"+str(data['username'])+"','"+str(data['review'])+"','"+str(data['img_url'])+"',"+str(data['review_like_count'])+",'"+str(data['user_tier'])+"'," \ + ""+str(data['shop_id'])+", '"+str(data['video_url'])+"', '"+str(data['rating'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \ + str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \ + str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]): + + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=now() " \ + "where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \ + "username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \ + "'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \ + "shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=now() " \ + "where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(res[1])+" and username ='"+str(data['username'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_ratings_reviews_productmodels(self,data): + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id']) + self.cur.execute(sql) + res = self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \ + "values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')" + #logging.info(sql) + + 
self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \ + "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+"" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=now() " \ + "where rce_rating_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \ + "updatedat=now() where rce_source_store_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \ + "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+"" + #logging.info(sql) + + self.cur.execute(sql) + + + def rce_tags(self,data): + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \ + "values("+str(data['id'])+",'"+str(data['description'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \ + "createdat,updatedat) select id,description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['description'])==str(res[1]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=now() " \ + "where description = '"+ str(res[1])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \ + "updatedat=now() where description = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \ + 
"createdat,updatedat) select id,description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + + def rce_ratings_reviews_producttags(self,data): + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \ + "values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \ + "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_rating_id'])==str(res[1]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=now() " \ + "where rce_rating_id = '"+ str(res[1])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \ + "updatedat=now() where rce_rating_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \ + "createdat,updatedat) select id,description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + + diff --git a/noon_crawler_engine/noon_products.py b/noon_crawler_engine/noon_products.py new file mode 100755 index 0000000..7f11ecd --- /dev/null +++ b/noon_crawler_engine/noon_products.py @@ -0,0 +1,426 @@ +import hashlib +import json +import logging +import random +import sys +import string +import psycopg2 +import time +import re + +import requests + +from noon_db_writer import noon_db_writer +from datetime import datetime +from noon_raw_product import get_product_info_raw + +class noon_products: + def __init__(self, config): + self.config = config + self.crawler_name = self.config.get("crawler_name") + self.pattern = r'[' + string.punctuation + ']' + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Noon'") + self.rce_source_id = self.cur.fetchone()[0] + 
self.cur.execute("select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='raena_crawler_enginer_noon' and flag=0") + self.items = self.cur.fetchall() + self.db_writer = noon_db_writer(config) + #self.display = Display(visible=0, size=(800, 600)) + #self.display.start() + + + def __del__(self): + print("Closing connection.....") + self.conn.close() + #self.display.stop() + + def slack_notification(message): + webhook_url = "https://hooks.slack.com/services/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" + slack_data = {"text": "Issue occurred on Noon Crawler. Error: " + str(message)} + + response = requests.post( + webhook_url, data=json.dumps(slack_data), + headers={"Content-Type": "application/json"} + ) + + if response.status_code != 200: + raise ValueError( + f"Request to Slack returned an error {response.status_code}, {response.text}" + ) + + def start_processing(self): + count = 0 + for item in self.items: + count += 1 + try: + logging.info("============== Getting info for {}/{}: {} ================".format(str(count),str(len(self.items)),str(item))) + start = datetime.now() + self.get_product_info(item) + end = datetime.now() + logging.info('Total time taken to fetch the product: {}'.format(str(end-start))) + # sleeptime = random.randint(20,50) + # logging.info("Sleeping for {} sec".format(str(sleeptime))) + # time.sleep(sleeptime) + time.sleep(5) + except Exception as e: + print(e) + self.slack_notification(e) + + def reseller_info(self, data): + try: + stores = data["product"]["variants"][0]["offers"] + + if stores: + + return_item = "" + flag = 0 + + for store in stores: + + ##### reseller info + + data_reseller = {} + data_reseller['rce_source_id'] = self.rce_source_id + data_reseller['rce_source_reseller_status'] = 1 + data_reseller['reseller_name'] = "" + data_reseller['reseller_average_rating'] = 0.0 + data_reseller['reseller_description'] = "" + + try: + data_reseller['reseller_name'] = store["store_name"] + data_reseller['reseller_name'] = data_reseller['reseller_name'].replace("'","") + except: + pass + + try: + data_reseller['reseller_average_rating'] = float(store["partner_ratings_sellerlab"]["partner_rating"]) + except: + pass + + + try: + self.db_writer.rce_reseller(data_reseller) + except Exception as e: + logging.info(e) + + ##### Store info + + data_reseller_store = {} + data_reseller_store['rce_source_store_status'] = 1 + data_reseller_store['store_page_url'] = "" + data_reseller_store['store_page_url_hash'] = "" + data_reseller_store['store_location'] = "" + data_reseller_store['rce_reseller_id'] = "" + data_reseller_store['rce_source_id'] = self.rce_source_id + + try: + data_reseller_store['store_page_url'] = "https://www.noon.com/uae-en/seller/" + store["store_code"] + data_reseller_store['store_page_url'] = data_reseller_store['store_page_url'].replace("'","") + + data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest() + except: + pass + + try: + self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data_reseller['reseller_name'])+"'") + rce_reseller_id = self.cur.fetchone() + data_reseller_store['rce_reseller_id'] = rce_reseller_id[0] + if flag == 0: + return_item = data_reseller_store['rce_reseller_id'] + flag = 1 + except: + pass + + try: + self.db_writer.rce_reseller_store(data_reseller_store) + except Exception as e: + logging.info(e) + + return return_item 
+
+        except Exception as e:
+            print(e)
+
+    def brand_info(self, data):
+        data_brand = {}
+
+        data_brand['rce_source_id'] = self.rce_source_id
+        data_brand['rce_source_brand_status'] = 1
+        data_brand['brand_page_url'] = ""
+        data_brand['brand_page_url_hash'] = ""
+        data_brand['brand_name'] = ""
+
+        try:
+            data_brand['brand_page_url'] = "https://www.noon.com/uae-en/" + data["product"]["brand_code"]
+            data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest()
+
+            try:
+                data_brand['brand_name'] = data["product"]["brand"]
+                data_brand['brand_name'] = data_brand['brand_name'].replace("'","")
+            except:
+                pass
+
+            try:
+                self.db_writer.rce_brand(data_brand)
+            except Exception as e:
+                logging.info(e)
+
+            return data_brand['brand_name']
+        except:
+            pass
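+
+    # Illustrative sketch (hypothetical helper, not referenced by the original code):
+    # the methods in this class guard every nested lookup in data["product"][...] with
+    # a bare try/except. A small helper like this expresses the same "default on any
+    # missing key or index" behaviour in one place, e.g.
+    # self._deep_get(data, "product", "variants", 0, "offers", 0, "sale_price").
+    @staticmethod
+    def _deep_get(obj, *keys, default=None):
+        for key in keys:
+            try:
+                obj = obj[key]
+            except (KeyError, IndexError, TypeError):
+                return default
+        return obj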
= float(data["product"]["product_rating"]["value"]) + #print(data_product['ratings']) + except: + pass + + try: + self.db_writer.rce_product(data_product) + except Exception as e: + logging.info(e) + + ### rce_product_variant + try: + variants = data["product"]["groups"][0]["options"] + if variants: + + for variant in variants: + + data_variant = {} + + data_variant['rce_source_variant_id'] = 0 + data_variant['rce_product_id'] = "" + data_variant['product_variant_name'] = "" + data_variant['product_variant_price'] = 0 + data_variant['product_variant_price_before_discount'] = 0 + data_variant['product_variant_stock'] = 0 + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_name = '"+str(data_product['rce_source_product_name'])+"'" + self.cur.execute(sql) + data_variant['rce_product_id'] = self.cur.fetchone()[0] + except: + pass + + try: + product_variant_name = variant["name"] + data_variant['product_variant_name'] = str(re.sub(self.pattern, '', product_variant_name)).replace("'","''") + except: pass + + + try: + self.db_writer.rce_product_variant(data_variant) + except Exception as e: + logging.info(e) + + time.sleep(random.randint(2,5)) + + else: + logging.info('No variant found') + except: + logging.info('No variant found') + pass + + + + def rating_info(self, data, rce_reseller_id, url_hash): + + try: + data_reviews = [] + data_reviews_ar = [] + data_reviews_en = [] + + try: + if data["product"]["reviews"]["comments"]["ar"]["reviews"]: + data_reviews_ar = data["product"]["reviews"]["comments"]["ar"]["reviews"] + data_reviews.extend(data_reviews_ar) + except: + pass + + try: + if data["product"]["reviews"]["comments"]["en"]["reviews"]: + data_reviews_en = data["product"]["reviews"]["comments"]["en"]["reviews"] + data_reviews.extend(data_reviews_en) + except: + pass + + + for review in data_reviews: + + data_review = {} + + data_review["id"] = "" + data_review["rce_product_id"] = "" + data_review["username"] = "" + data_review["review"] = "" + data_review["img_url"] = "" + data_review["review_like_count"] = 0 + data_review["user_tier"] = "" + data_review["shop_id"] = 0 + data_review["video_url"] = "" + data_review["rating"] = "" + + try: + sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab') + self.cur.execute(sql) + rating_id = self.cur.fetchone() + + if rating_id[0]==None: + rating_id = 1 + else: + rating_id = int(rating_id[0]) + 1 + + data_review["id"] = rating_id + except: + pass + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url_hash = '"+str(url_hash)+"'" + self.cur.execute(sql) + data_review["rce_product_id"] = self.cur.fetchone()[0] + except: pass + + try: data_review["username"] = review["displayName"] + except: pass + + try: + try: + title = review["title"] + except: + pass + + try: + comment = review["comment"] + except: + pass + + data_review["review"] = title + comment + data_review["review"] = data_review["review"].replace("'","") + except: pass + + try: + data_review["review_like_count"] = review["helpfulCount"] + except: + pass + + try: + data_review["rating"] = review["rating"] + except: pass + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+"" + self.cur.execute(sql) + data_review["shop_id"] = self.cur.fetchone()[0] + except: pass + + try: + 
+                    self.db_writer.rce_ratings_reviews(data_review)
+                except Exception as e:
+                    logging.info(e)
+        except:
+            pass
+
+    def get_product_info(self,item):
+        try:
+
+            data = get_product_info_raw(item[4])
+
+            ##### Reseller info #####
+            rce_reseller_id = self.reseller_info(data)
+
+            ##### Product Info #####
+            ##### Brand Info
+            brand_name = self.brand_info(data)
+            ##### Product info
+            self.product_info(data, item[2], item[3], item[4], item[5], brand_name, rce_reseller_id)
+
+            ##### Rating Info #####
+            self.rating_info(data, rce_reseller_id, item[5])
+
+            sql = f"""
+            update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1 where product_page_url_hash='{item[5]}'
+            """
+            self.cur.execute(sql)
+
+        except Exception as e:
+            print(e)
diff --git a/noon_crawler_engine/noon_raw_product.py b/noon_crawler_engine/noon_raw_product.py
new file mode 100644
index 0000000..73c12a5
--- /dev/null
+++ b/noon_crawler_engine/noon_raw_product.py
@@ -0,0 +1,62 @@
+import json
+import logging
+import requests
+# import random
+# import string
+# import uuid
+# import time
+# import jwt
+from urllib.parse import urlparse, quote
+
+##### Logger ######
+format = "%(asctime)s: %(message)s"
+logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
+
+def get_product_info_raw(url):
+
+    #parsed_url = urlparse(url)
+    parsed_url = url.replace("noon.com/uae-en/", "noon.com/_svc/catalog/api/v3/u/")
+    print(parsed_url)
+    encoded_url = quote(parsed_url, safe='')
+
+    api_url = 'http://localhost:3090/rcs/v1/noon/'
+
+    print(url)
+    print(api_url+encoded_url)
+    response = requests.request("GET", api_url+encoded_url)
+
+    logging.info(response)
+
+    data = json.loads(response.text)
+
+    return data['data']
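+
+# Illustrative sketch (not part of the original module): get_product_info_raw issues a
+# single blocking GET with no retries, so one failed proxy call aborts the product. A
+# thin wrapper with an illustrative attempt count and delay; `import time` is local
+# because the module-level import above is commented out:
+def get_product_info_raw_with_retries(url, attempts=3, delay=5):
+    import time
+    for attempt in range(1, attempts + 1):
+        try:
+            return get_product_info_raw(url)
+        except Exception as e:
+            logging.info("Attempt %s/%s failed for %s: %s", attempt, attempts, url, e)
+            if attempt == attempts:
+                raise
+            time.sleep(delay)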
+
+# def generate_sentry_trace():
+#     trace_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=32))
+#     span_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=16))
+#     sampling_decision = random.randint(0, 1)
+#
+#     sentry_trace = f'{trace_id}-{span_id}-{sampling_decision}'
+#     return sentry_trace
+#
+# def generate_x_visitor_id():
+#     x_visitor_id = str(uuid.uuid4())
+#     return x_visitor_id
+#
+# def generate_cookie():
+#     payload = {
+#         'raId': 'd1e3f451135d40958672d78da1f8c612',
+#         'iat': int(time.time()),
+#         'exp': int(time.time()+60)
+#     }
+#     # Generate the cookie string without a secret key
+#     cookie = jwt.encode(payload, '', algorithm='HS256')
+#
+#     return cookie
+
+# url = 'https://www.noon.com/uae-en/niacinamide-10-and-zinc-1-clear-30ml/N23772548A/p/?o=cbd635fab2298abe'
+# #
+# print(get_product_info_raw(url))
\ No newline at end of file
diff --git a/noon_crawler_engine/test.py b/noon_crawler_engine/test.py
new file mode 100644
index 0000000..302d6a7
--- /dev/null
+++ b/noon_crawler_engine/test.py
@@ -0,0 +1,30 @@
+import hashlib
+import logging
+#import undetected_chromedriver as webdriver
+from selenium import webdriver
+from selenium.webdriver import ActionChains, Keys
+from selenium.webdriver.chrome.service import Service
+import psycopg2
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from noon_db_writer import noon_db_writer
+from pyvirtualdisplay import Display
+from scroller.scroller import smartScroll
+import time
+
+import ssl
+ssl._create_default_https_context = ssl._create_unverified_context
+
+driver = webdriver.Firefox()
+
+driver.get('https://www.noon.com/uae-en/beauty/')
+driver.implicitly_wait(5)
+
+elements = driver.find_element(By.XPATH, '//*[@id="__next"]/div/section/div/div/div[23]/div/div/div/div/div/div/div/div/div[2]/div[1]/div').find_elements(By.CSS_SELECTOR,'.swiper-slide')
+
+for element in elements:
+    link = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
+    print(link)
+
+driver.close()
\ No newline at end of file
diff --git a/noon_crawler_engine/test_slack.py b/noon_crawler_engine/test_slack.py
new file mode 100644
index 0000000..248725d
--- /dev/null
+++ b/noon_crawler_engine/test_slack.py
@@ -0,0 +1,20 @@
+import requests
+import json
+
+def slack_notification(message):
+    webhook_url = "https://hooks.slack.com/services/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"
+    slack_data = {"text": message}
+
+    response = requests.post(
+        webhook_url, data=json.dumps(slack_data),
+        headers={"Content-Type": "application/json"}
+    )
+
+    if response.status_code != 200:
+        raise ValueError(
+            f"Request to Slack returned an error {response.status_code}, {response.text}"
+        )
+
+message = "Hello from Python!"
+slack_notification(message)
diff --git a/oliveyoung_crawler/__init__.py b/oliveyoung_crawler/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/oliveyoung_crawler/items.py b/oliveyoung_crawler/items.py
new file mode 100644
index 0000000..63e674d
--- /dev/null
+++ b/oliveyoung_crawler/items.py
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+class RaenaCrawlerItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
diff --git a/oliveyoung_crawler/middlewares.py b/oliveyoung_crawler/middlewares.py
new file mode 100644
index 0000000..7fb5e92
--- /dev/null
+++ b/oliveyoung_crawler/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+class RaenaCrawlerSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+class RaenaCrawlerDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/oliveyoung_crawler/pipelines.py b/oliveyoung_crawler/pipelines.py
new file mode 100644
index 0000000..7868475
--- /dev/null
+++ b/oliveyoung_crawler/pipelines.py
@@ -0,0 +1,18 @@
+# pipelines.py
+import json
+
+class OliveYoungPipeline:
+    def __init__(self):
+        self.file = None
+
+    def open_spider(self, spider):
+        self.file = open('output.json', 'w')
+
+    def close_spider(self, spider):
+        self.file.close()
+
+    def process_item(self, item, spider):
+        line = json.dumps(item) + "\n"
+        self.file.write(line)
+        return item
diff --git a/oliveyoung_crawler/requirements.txt b/oliveyoung_crawler/requirements.txt
new file mode 100644
index 0000000..072220f
--- /dev/null
+++ b/oliveyoung_crawler/requirements.txt
@@ -0,0 +1,111 @@
+appdirs==1.4.4
+appnope @ file:///opt/concourse/worker/volumes/live/4f734db2-9ca8-4d8b-5b29-6ca15b4b4772/volume/appnope_1606859466979/work
+argon2-cffi @ file:///opt/conda/conda-bld/argon2-cffi_1645000214183/work
+argon2-cffi-bindings @ file:///opt/concourse/worker/volumes/live/c6f9b05d-dc80-4dbc-7473-70bfcb66883c/volume/argon2-cffi-bindings_1644569703264/work
+attrs @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_33k1uces4n/croot/attrs_1668696162258/work
+Automat==22.10.0
+backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
+bleach @ file:///opt/conda/conda-bld/bleach_1641577558959/work
+brotlipy==0.7.0
+certifi @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_477u68wvzm/croot/certifi_1671487773341/work/certifi
+cffi @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_1b0qzba5nr/croot/cffi_1670423213150/work
+charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
+colorama @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_f5t80kwp9l/croot/colorama_1672386533201/work
+ConfigUpdater @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_495uyr_0u4/croot/configupdater_1668698019809/work
+constantly==15.1.0
+cryptography @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_19cvzxmeb9/croot/cryptography_1677533085498/work
+cssselect==1.2.0
+debugpy @ file:///opt/concourse/worker/volumes/live/32b11d06-4d64-4ec8-497a-cf4fc97343d2/volume/debugpy_1637091821874/work
+decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
+defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work
+entrypoints @ file:///opt/concourse/worker/volumes/live/194c0a28-55ce-4e83-6a87-0d9f2e06ab2c/volume/entrypoints_1649926487944/work
+fake-useragent==1.2.1
+Faker==18.13.0
+fastjsonschema @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b5c1gee32t/croots/recipe/python-fastjsonschema_1661368622875/work
+filelock==3.12.2
+hyperlink @ file:///tmp/build/80754af9/hyperlink_1610130746837/work
+idna @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_00jf0h4zbt/croot/idna_1666125573348/work
+imagesize @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_4a6ed1be-fe30-4d6a-91d4-f867600caa0be5_dxzvt/croots/recipe/imagesize_1657179500955/work
+importlib-metadata @ file:///opt/concourse/worker/volumes/live/4e1a3384-472f-4bcb-7776-cb0076aaea40/volume/importlib-metadata_1648562431336/work
+importlib-resources @ file:///tmp/build/80754af9/importlib_resources_1625135880749/work
+incremental==22.10.0
+ipykernel @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_05yte6zd0k/croots/recipe/ipykernel_1662361808878/work
+ipython @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b9echyik_d/croots/recipe/ipython_1659529861316/work
+ipython-genutils @ file:///tmp/build/80754af9/ipython_genutils_1606773439826/work
+itemadapter==0.8.0
+itemloaders==1.1.0
+jedi @ file:///opt/concourse/worker/volumes/live/c9d2fa99-8bc1-4572-41e7-6beba6391441/volume/jedi_1644315238822/work
+Jinja2 @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_6adj7x0ejx/croot/jinja2_1666908137966/work
+jmespath==1.0.1
+jsonschema @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_21cqeq1xnk/croot/jsonschema_1676558686956/work
+jupyter_client @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_41tzpfqkok/croots/recipe/jupyter_client_1661848920196/work
+jupyter_core @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_fc_0us_ta7/croot/jupyter_core_1668084443574/work
+jupyterlab-pygments @ file:///tmp/build/80754af9/jupyterlab_pygments_1601490720602/work
+langcodes @ file:///opt/conda/conda-bld/langcodes_1643477751144/work
+lxml==4.9.3
+MarkupSafe @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_d4a9444f-bd4c-4043-b47d-cede33979b0fve7bm42r/croots/recipe/markupsafe_1654597878200/work
+matplotlib-inline @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_9ddl71oqte/croots/recipe/matplotlib-inline_1662014471815/work
+mccabe @ file:///opt/conda/conda-bld/mccabe_1644221741721/work
+mistune==0.8.4
+nbclient @ file:///opt/concourse/worker/volumes/live/2b77047f-e15a-4d19-54ac-7d87d20b74de/volume/nbclient_1650308375803/work
+nbconvert @ file:///opt/concourse/worker/volumes/live/84c159ef-8fac-4372-7b64-25f831ab7aec/volume/nbconvert_1624479064764/work
+nbformat @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_2daun1fill/croot/nbformat_1670352339504/work
+nest-asyncio @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_64pfm74mxq/croot/nest-asyncio_1672387129786/work
+nose @ file:///opt/conda/conda-bld/nose_1642704612149/work
+notebook @ file:///opt/concourse/worker/volumes/live/f984e24b-6ef4-4a5b-55be-c5db1417e27a/volume/notebook_1621528337539/work
+packaging @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_bet5qdixgt/croot/packaging_1671697440883/work
+pandocfilters @ file:///opt/conda/conda-bld/pandocfilters_1643405455980/work
+parsel==1.8.1
+parso @ file:///tmp/build/80754af9/parso_1617223946239/work
+pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
+pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
+pkgutil_resolve_name @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_c9l5hym8w0/croots/recipe/pkgutil-resolve-name_1661463329338/work
+prometheus-client @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_19kjbndib7/croots/recipe/prometheus_client_1659455105394/work
+prompt-toolkit @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_82emz7mook/croot/prompt-toolkit_1672387300396/work
+Protego==0.2.1
+psutil @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_c9b604bf-685f-47f6-8304-238e4e70557e1o7mmsot/croots/recipe/psutil_1656431274701/work
+psycopg2-binary==2.9.7
+ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
+py @ file:///tmp/build/80754af9/py_1607971587848/work
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pycodestyle @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_a7riaf725h/croot/pycodestyle_1674267226642/work
+pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
+PyDispatcher==2.0.7
+pyflakes @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_a87qrne4ps/croot/pyflakes_1674165135821/work
+Pygments @ file:///opt/conda/conda-bld/pygments_1644249106324/work
+pyOpenSSL @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_6dweji2whw/croot/pyopenssl_1677607689781/work
+pyrsistent @ file:///opt/concourse/worker/volumes/live/24b7a9ab-37d8-463c-575f-69184f9cfbc8/volume/pyrsistent_1636111022304/work
+PySocks @ file:///opt/concourse/worker/volumes/live/ef943889-94fc-4539-798d-461c60b77804/volume/pysocks_1605305801690/work
+python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
+pyzmq @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_15f7a459-ad98-422b-b8da-cbf1f626e2115nt0ocwy/croots/recipe/pyzmq_1657724193704/work
+queuelib==1.6.2
+requests @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_516b78ce-034d-4395-b9b5-1d78c2847384qtnol99l/croots/recipe/requests_1657734628886/work
+requests-file @ file:///Users/ktietz/demo/mc3/conda-bld/requests-file_1629455781986/work
+Scrapy==2.9.0
+scrapy-fake-useragent==1.4.4
+scrapy-rotating-proxies==0.6.2
+scrapy-splash==0.9.0
+Send2Trash @ file:///tmp/build/80754af9/send2trash_1632406701022/work
+service-identity==21.1.0
+six @ file:///tmp/build/80754af9/six_1644875935023/work
+snowballstemmer @ file:///tmp/build/80754af9/snowballstemmer_1637937080595/work
+sphinxcontrib-devhelp @ file:///home/ktietz/src/ci/sphinxcontrib-devhelp_1611920923094/work
+sphinxcontrib-jsmath @ file:///home/ktietz/src/ci/sphinxcontrib-jsmath_1611920942228/work
+sphinxcontrib-qthelp @ file:///home/ktietz/src/ci/sphinxcontrib-qthelp_1611921055322/work
+sphinxcontrib-serializinghtml @ file:///tmp/build/80754af9/sphinxcontrib-serializinghtml_1624451540180/work
+terminado @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_18_p3gbeio/croot/terminado_1671751835656/work
+testpath @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_aaf4aec7-dbb6-43d6-9707-824338b4efc82yrt6xjp/croots/recipe/testpath_1655908558843/work
+tldextract==3.4.4
+toml @ file:///tmp/build/80754af9/toml_1616166611790/work
+tornado @ file:///opt/concourse/worker/volumes/live/d531d395-893c-4ca1-6a5f-717b318eb08c/volume/tornado_1606942307627/work
+traitlets @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_0dtilxc0bw/croot/traitlets_1671143889152/work
+Twisted==22.10.0
+typing==3.7.4.3
+typing_extensions @ file:///opt/conda/conda-bld/typing_extensions_1647553014482/work
+urllib3 @ file:///opt/conda/conda-bld/urllib3_1643638302206/work
+w3lib==2.1.1
+wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
+webencodings==0.5.1
+zipp @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b71z79bye2/croot/zipp_1672387125902/work
+zope.interface==6.0
diff --git a/oliveyoung_crawler/run_spider.sh b/oliveyoung_crawler/run_spider.sh
new file mode 100644
index 0000000..cb93038
--- /dev/null
+++ b/oliveyoung_crawler/run_spider.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+docker rm splash-local
+
+docker pull scrapinghub/splash
+
+docker run --name splash-local -p 8050:8050 -d scrapinghub/splash
+
+sleep 10
+
+scrapy crawl oliveyoung_product
+
+sleep 10
+
+scrapy crawl tiktok_hashtag
+
+docker stop splash-local
+docker rm splash-local
\ No newline at end of file
diff --git a/oliveyoung_crawler/settings.py b/oliveyoung_crawler/settings.py
new file mode 100644
index 0000000..b1d406d
--- /dev/null
+++ b/oliveyoung_crawler/settings.py
@@ -0,0 +1,117 @@
+# Scrapy settings for raena_crawler project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "raena_crawler"
+
+SPIDER_MODULES = ["raena_crawler.spiders"]
+NEWSPIDER_MODULE = "raena_crawler.spiders"
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = "raena_crawler (+http://www.yourdomain.com)"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+CONCURRENT_REQUESTS = 10
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+#    "Accept-Language": "en",
+#}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    "raena_crawler.middlewares.RaenaCrawlerSpiderMiddleware": 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    "raena_crawler.middlewares.RaenaCrawlerDownloaderMiddleware": 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    'raena_crawler.pipelines.OliveYoungPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = "httpcache"
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
+
+SPLASH_URL = 'http://localhost:8050'
+DOWNLOADER_MIDDLEWARES = {
+    'scrapy_splash.SplashCookiesMiddleware': 723,
+    'scrapy_splash.SplashMiddleware': 725,
+    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
+    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
+    'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
+    'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
+    'rotating_proxies.middlewares.RotatingProxyMiddleware': 610,
+    'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
+}
+# NOTE: rotating_proxies is enabled above, but no ROTATING_PROXY_LIST (or
+# ROTATING_PROXY_LIST_PATH) is defined anywhere in this file, so the middleware
+# has no proxies to rotate.
+SPIDER_MIDDLEWARES = {
+    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
+}
+DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
+
+FAKEUSERAGENT_PROVIDERS = [
+    'scrapy_fake_useragent.providers.FakeUserAgentProvider',  # This is the first provider we'll try
+    'scrapy_fake_useragent.providers.FakerProvider',  # If FakeUserAgentProvider fails, we'll use faker to generate a user-agent string for us
+    'scrapy_fake_useragent.providers.FixedUserAgentProvider',  # Fall back to USER_AGENT value
+]
+
+USER_AGENT = 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'
diff --git a/oliveyoung_crawler/spiders/__init__.py b/oliveyoung_crawler/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/oliveyoung_crawler/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/oliveyoung_crawler/spiders/oliveyoung.py b/oliveyoung_crawler/spiders/oliveyoung.py
new file mode 100644
index 0000000..d0b951f
--- /dev/null
+++ b/oliveyoung_crawler/spiders/oliveyoung.py
@@ -0,0 +1,94 @@
+import scrapy
+from scrapy_splash import SplashRequest
+import psycopg2
+import logging
+
+config = {
+    "db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
+    "db_port": "5432",
+    "db": "analytics",
+    "db_user": "dbadmin",
+    "db_pass": "5qCif6eyY3Kmg4z"
+}
+
+class OliveyoungSpider(scrapy.Spider):
+    name = 'oliveyoung_product'
+    # allowed_domains takes bare domain names, not full URLs.
+    allowed_domains = ['global.oliveyoung.com']
+
+    def start_requests(self):
+        url = 'https://global.oliveyoung.com/'
+        yield SplashRequest(url, self.parse, args={'wait': 5})
+
+    def parse(self, response):
+
+        conn = psycopg2.connect(database=config.get('db'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
+        logging.info(conn)
+        conn.autocommit = True
+        cur = conn.cursor()
+
+        product_sections = [
+            ('Best Sellers','#\#tab12'),
+            ('MDS PICK','#\#tab22'),
+            ('K-POP','div.main-section:nth-child(6) > div:nth-child(2)'),
+            ('Featured','.main-brand-banner'),
+            ('RECOMMENDATION','div.main-section:nth-child(9) > div:nth-child(2)'),
+            ('FEATURED BRANDS', '#featuredBrands > div:nth-child(2)')
+        ]
+
+        for product_section in product_sections:
+
+            products = response.css(str(product_section[1]))
+
+            product_selector = '.wrap-prd-info'
+            brand_selector = '.list-thumb-tit::text'
+
+            if 'FEATURED BRANDS' in product_section[0]:
+                product_selector = '.fig-title.ellipsis'
+                brand_selector = '.fig-title.ellipsis::text'
+
+            for product in products:
+                items = product.css(product_selector)
+                for item in items:
+
+                    product_brand = (item.css(brand_selector).extract_first("")).replace("'","").strip()
+                    product_name = item.css('.list-thumb-info::text').extract_first("").replace("'","").strip()
+                    original_price = item.css('.price-cost::text').extract_first("").strip()
+                    discounted_price = item.css('.prd-list-amountDue::text').extract_first("").strip()
+
+                    logging.info("Collecting data for: {}".format(product_name))
+ + sql = f""" + select product_section,product_brand,product_name from raena_spider_management.oliveyoung_products where product_section='{product_section[0]}' and product_brand='{product_brand}' and product_name='{product_name}' + """ + + #logging.info(sql) + + cur.execute(sql) + + res = cur.fetchone() + + if res: + + sql = f""" + update raena_spider_management.oliveyoung_products set original_price='{original_price}', + discounted_price='{discounted_price}', updatedat=now() + where product_section='{product_section[0]}' and product_brand='{product_brand}' and product_name='{product_name}' + """ + #logging.info(sql) + + cur.execute(sql) + + else: + + sql = f""" + insert into raena_spider_management.oliveyoung_products(product_section,product_brand,product_name,original_price,discounted_price,createdat,updatedat) + values('{product_section[0]}','{product_brand}','{product_name}','{original_price}','{discounted_price}',now(),now()) + """ + #logging.info(sql) + + cur.execute(sql) + + conn.close() \ No newline at end of file diff --git a/oliveyoung_crawler/spiders/oliveyoung_bk.py b/oliveyoung_crawler/spiders/oliveyoung_bk.py new file mode 100644 index 0000000..fd1d6ec --- /dev/null +++ b/oliveyoung_crawler/spiders/oliveyoung_bk.py @@ -0,0 +1,63 @@ +# oliveyoung.py +import scrapy +import requests + + +class OliveYoungSpider(scrapy.Spider): + name = 'oliveyoung_bk' + start_urls = [ + 'https://global.oliveyoung.com/?gad=1&gclid=CjwKCAjwq4imBhBQEiwA9Nx1Bi5w7mSF9wgKTFqfX37hyG_c3ocYHldGoXbIX1XfYKQQFxLOPECJCxoCxpEQAvD_BwE'] + + def parse(self, response): + sections = { + "Best Sellers": "//div[@class='slick-slider-customized']/div[contains(@class,'slick-slide')]", + # "MD's Pick": "//section[@id='md_pick']/div[@class='item']/div[@class='product-item']", + # "Featured Brands": "//section[@id='brand_list']/div[@class='product-item']", + # "K-Pop": "//section[@id='kpop_list']/div[@class='product-item']", + # "INNISFREE": "//section[@id='brand_zone']/div[contains(@class,'brand-inn-store')]//div[" + # "@class='product-item']", + # "Recommendation": "//section[@id='recommendation']/div[contains(@class,'product-item')]", + } + + # Extract data from each section + for section_name, section_xpath in sections.items(): + products = response.xpath(section_xpath) + for product in products: + brand_name = product.xpath(".//span[@class='brand']/text()").get() + product_name = product.xpath(".//span[@class='name']/text()").get() + price = product.xpath(".//span[@class='num']/text()").get() + + if brand_name: + yield { + "brand_name": brand_name.strip(), + "product_name": product_name.strip(), + "price": price.strip(), + "section": section_name, + } + + # # Generate hashtags for each brand name + # hashtags = [word.lower() for word in brand_name.split()] + # hashtags = '#'.join(hashtags) + # yield { + # "brand_name": brand_name.strip(), + # "hashtags": f"#{hashtags}", + # } + # + # # Fetch views data from TikTok API using tiktok_api.py + # views_all, views = get_hashtag_views(hashtags) + # yield { + # "brand_name": brand_name.strip(), + # "hashtags": f"#{hashtags}", + # "views_all": views_all, + # "views": views, + # } + + +def get_hashtag_views(hashtag): + url = f'https://ads.tiktok.com/creative_radar_api/v1/popular_trend/hashtag/detail?period=7&hashtag_name={hashtag}&country_code=IS' + headers = { + # Add the headers from the CURL request here + } + response = requests.get(url, headers=headers) + data = response.json() + return data.get('hashtag', {}).get('video_views_all', 0), data.get('hashtag', 
diff --git a/oliveyoung_crawler/spiders/tiktok_hashtag.py b/oliveyoung_crawler/spiders/tiktok_hashtag.py
new file mode 100644
index 0000000..cd53eaf
--- /dev/null
+++ b/oliveyoung_crawler/spiders/tiktok_hashtag.py
@@ -0,0 +1,103 @@
+import scrapy
+import psycopg2
+import logging
+import time
+import random
+
+config = {
+    "db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
+    "db_port": "5432",
+    "db": "analytics",
+    "db_user": "dbadmin",
+    "db_pass": "5qCif6eyY3Kmg4z"
+}
+
+class TiktokHashtag(scrapy.Spider):
+    name = 'tiktok_hashtag'
+    start_urls = ['https://ads.tiktok.com/business/creativecenter/hashtag/beautyofjoseon/pc/en?countryCode=ID&period=7']
+
+    def start_requests(self):
+
+        conn = psycopg2.connect(database=config.get('db'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
+        logging.info(conn)
+        conn.autocommit = True
+        cur = conn.cursor()
+
+        # Each brand yields two hashtag candidates, e.g. 'Beauty of Joseon' ->
+        # 'beautyofjoseon' (non-word characters stripped) and 'beauty_of_joseon'
+        # (words joined with underscores).
+        sql = f"""
+        select distinct product_brand, hashtag from (
+        select distinct product_brand, replace(regexp_replace(lower(product_brand), '[^\w]+','','g'),' ','') hashtag
+        from raena_spider_management.oliveyoung_products
+        union
+        select distinct product_brand, replace(regexp_replace(lower(product_brand), '[^\w]+',' ','g'),' ','_') hashtag
+        from raena_spider_management.oliveyoung_products) a
+        order by product_brand
+        """
+
+        logging.info(sql)
+
+        cur.execute(sql)
+
+        brands = cur.fetchall()
+
+        logging.info(brands)
+
+        for brand in brands:
+            url_hashtag = "https://ads.tiktok.com/business/creativecenter/hashtag/"+brand[1]+"/pc/en?countryCode=ID&period=7"
+
+            yield scrapy.Request(url_hashtag, self.get_hashtag_info, meta={'meta': brand})
+            time.sleep(random.randint(10,20))
+
+    def get_hashtag_info(self, response):
+
+        logging.info("Collecting hashTag info")
+
+        conn = psycopg2.connect(database=config.get('db'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
+        conn.autocommit = True
+        cur = conn.cursor()
+
+        brand = response.meta.get('meta')
+
+        post_last7days = "0"
+        post_overall = "0"
+        view_last7days = "0"
+        view_overall = "0"
+
+        try:
+            post_last7days = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[1]/div/div[1]/span[1]/text()').get()
+            post_overall = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[1]/div/div[3]/span[1]/text()').get()
+
+            view_last7days = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[2]/div/div[1]/span[1]/text()').get()
+            view_overall = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[2]/div/div[3]/span[1]/text()').get()
+
+        except:
+            pass
+
+        sql = f"""
+        select product_brand,brand_hashtag from raena_spider_management.oliveyoung_brand_hashtag
+        where product_brand='{brand[0]}' and brand_hashtag='{brand[1]}'
+        """
+
+        cur.execute(sql)
+        res = cur.fetchone()
+
+        if res:
+
+            sql = f"""
+            update raena_spider_management.oliveyoung_brand_hashtag set posts='{post_last7days}', posts_total='{post_overall}',
+            views='{view_last7days}', views_overall='{view_overall}', updatedat=now()
+            where product_brand='{brand[0]}' and brand_hashtag='{brand[1]}'
+            """
+
+            cur.execute(sql)
+
+        else:
+
+            sql = f"""
+            insert into raena_spider_management.oliveyoung_brand_hashtag(product_brand,brand_hashtag,posts,posts_total,views,views_overall,createdat,updatedat)
+            values('{brand[0]}','{brand[1]}','{post_last7days}','{post_overall}','{view_last7days}','{view_overall}',now(),now())
+            """
+
+            cur.execute(sql)
+
+        conn.close()
diff --git a/shopee_crawler_engine/Readme.md b/shopee_crawler_engine/Readme.md
new file mode 100644
index 0000000..cd2c28f
--- /dev/null
+++ b/shopee_crawler_engine/Readme.md
@@ -0,0 +1,76 @@
+***Run:***
+1. Change the config according to the crawler type.
+2. Run "python shopee_crawler.py".
+
+***Config for Master:***
+
+config = {
+"crawler_name": "raena_crawler_enginer_shopee",
+"crawler_schema": "raena_spider_management",
+"category_tab": "rce_category",
+"tracker_tab": "crawler_tracker",
+"product_tab": "rce_product",
+"variant_tab": "rce_product_variant",
+"brand_tab": "rce_brand",
+"reseller_tab": "rce_reseller",
+"reseller_store_tab": "rce_reseller_store",
+"review_tab": "rce_ratings_reviews",
+"product_per_category": "136",
+"source_category": "11043145",
+"db_user": "crawler",
+"db_pass": "4Z063Zp9Aczv",
+"database": "raena_db",
+"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
+"db_port": "5432",
+"crawler_main": "1",
+"crawler_slave_no": ""
+}
+
+***Config for Slave01:***
+
+config = {
+"crawler_name": "raena_crawler_enginer_shopee",
+"crawler_schema": "raena_spider_management",
+"category_tab": "rce_category",
+"tracker_tab": "crawler_tracker",
+"product_tab": "rce_product",
+"variant_tab": "rce_product_variant",
+"brand_tab": "rce_brand",
+"reseller_tab": "rce_reseller",
+"reseller_store_tab": "rce_reseller_store",
+"review_tab": "rce_ratings_reviews",
+"product_per_category": "136",
+"source_category": "11043145",
+"db_user": "crawler",
+"db_pass": "4Z063Zp9Aczv",
+"database": "raena_db",
+"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
+"db_port": "5432",
+"crawler_main": "0",
+"crawler_slave_no": "1"
+}
+
+***Config for Slave02:***
+
+config = {
+"crawler_name": "raena_crawler_enginer_shopee",
+"crawler_schema": "raena_spider_management",
+"category_tab": "rce_category",
+"tracker_tab": "crawler_tracker",
+"product_tab": "rce_product",
+"variant_tab": "rce_product_variant",
+"brand_tab": "rce_brand",
+"reseller_tab": "rce_reseller",
+"reseller_store_tab": "rce_reseller_store",
+"review_tab": "rce_ratings_reviews",
+"product_per_category": "136",
+"source_category": "11043145",
+"db_user": "crawler",
+"db_pass": "4Z063Zp9Aczv",
+"database": "raena_db",
+"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
+"db_port": "5432",
+"crawler_main": "0",
+"crawler_slave_no": "2"
+}
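+
+***Loading the config:***
+
+The entry points read these settings as a plain dict; conf.json in this directory carries the same keys. A minimal sketch of loading it (the actual wiring in shopee_crawler.py may differ):
+
+import json
+
+with open("conf.json") as f:
+    config = json.load(f)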
"db_pass": "4Z063Zp9Aczv", + "database": "raena_db", + "db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com", + "db_port": "5432", + "crawler_main": "1", + "crawler_slave_no": "" +} \ No newline at end of file diff --git a/shopee_crawler_engine/requirments.txt b/shopee_crawler_engine/requirments.txt new file mode 100644 index 0000000..04d9a40 --- /dev/null +++ b/shopee_crawler_engine/requirments.txt @@ -0,0 +1,147 @@ +alembic==1.9.3 +anyio==3.6.2 +apache-airflow==2.5.1 +apache-airflow-providers-amazon==7.2.0 +apache-airflow-providers-common-sql==1.3.3 +apache-airflow-providers-ftp==3.3.1 +apache-airflow-providers-http==4.1.1 +apache-airflow-providers-imap==3.1.1 +apache-airflow-providers-sqlite==3.3.1 +apispec==3.3.2 +argcomplete==1.12.3 +asn1crypto==1.5.1 +attrs==22.2.0 +Babel==2.11.0 +beautifulsoup4==4.11.2 +blinker==1.5 +boto3==1.26.69 +botocore==1.29.69 +cached-property==1.5.2 +cachelib==0.9.0 +cattrs==22.2.0 +certifi==2022.12.7 +cffi==1.15.1 +chardet==3.0.4 +charset-normalizer==3.0.1 +click==8.1.3 +clickclick==20.10.2 +colorama==0.4.6 +colorlog==4.0.2 +configparser==3.5.3 +ConfigUpdater==3.1.1 +connexion==2.14.2 +cron-descriptor==1.2.35 +croniter==0.3.37 +cryptography==39.0.1 +decorator==5.1.1 +defusedxml==0.7.1 +Deprecated==1.2.13 +dill==0.3.6 +dnspython==2.3.0 +docutils==0.19 +email-validator==1.3.1 +exceptiongroup==1.1.0 +Flask==2.2.2 +Flask-Admin==1.5.4 +Flask-AppBuilder==4.1.4 +Flask-Babel==1.0.0 +Flask-Caching==2.0.2 +Flask-JWT-Extended==4.4.4 +Flask-Login==0.6.2 +Flask-OpenID==1.3.0 +Flask-Session==0.4.0 +Flask-SQLAlchemy==2.5.1 +flask-swagger==0.2.14 +Flask-WTF==1.1.1 +funcsigs==1.0.2 +future==0.18.3 +graphviz==0.20.1 +greenlet==2.0.2 +gunicorn==20.1.0 +h11==0.14.0 +httpcore==0.16.3 +httpx==0.23.3 +idna==2.10 +importlib-resources==1.5.0 +inflection==0.5.1 +iso8601==1.1.0 +itsdangerous==2.1.2 +Jinja2==3.1.2 +jmespath==0.10.0 +json-merge-patch==0.2 +jsonpath-ng==1.5.3 +jsonschema==3.2.0 +lazy-object-proxy==1.4.3 +linkify-it-py==2.0.0 +lockfile==0.12.2 +lxml==4.9.2 +Mako==1.2.4 +Markdown==3.4.1 +markdown-it-py==2.1.0 +MarkupSafe==2.1.2 +marshmallow==3.19.0 +marshmallow-enum==1.5.1 +marshmallow-oneofschema==3.0.1 +marshmallow-sqlalchemy==0.23.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mypy-boto3-appflow==1.26.53 +mypy-boto3-rds==1.26.47 +mypy-boto3-redshift-data==1.26.30 +natsort==8.2.0 +numpy==1.24.2 +packaging==23.0 +pandas==1.5.3 +pathspec==0.9.0 +pendulum==2.1.2 +piapy==0.2.0 +pluggy==1.0.0 +ply==3.11 +prison==0.2.1 +protobuf==4.21.12 +psutil==5.9.4 +pycparser==2.21 +Pygments==2.14.0 +PyJWT==2.6.0 +pyrsistent==0.19.3 +python-daemon==2.3.2 +python-dateutil==2.8.2 +python-dotenv==0.21.1 +python-nvd3==0.15.0 +python-slugify==8.0.0 +python3-openid==3.2.0 +pytz==2022.7.1 +pytzdata==2020.1 +PyYAML==6.0 +redshift-connector==2.0.910 +requests==2.28.2 +requests-toolbelt==0.10.1 +rfc3986==1.5.0 +rich==13.3.1 +s3transfer==0.6.0 +scramp==1.4.4 +setproctitle==1.3.2 +six==1.16.0 +sniffio==1.3.0 +soupsieve==2.3.2.post1 +SQLAlchemy==1.4.9 +SQLAlchemy-JSONField==1.0.1.post0 +sqlalchemy-redshift==0.8.12 +SQLAlchemy-Utils==0.40.0 +sqlparse==0.4.3 +swagger-ui-bundle==0.0.9 +tabulate==0.8.10 +tenacity==8.2.1 +termcolor==2.2.0 +text-unidecode==1.3 +thrift==0.16.0 +typing_extensions==4.4.0 +tzlocal==1.5.1 +uc-micro-py==1.0.1 +unicodecsv==0.14.1 +urllib3==1.25.11 +watchtower==2.0.1 +Werkzeug==2.2.2 +wrapt==1.14.1 +WTForms==2.3.3 +zope.deprecation==4.4.0 diff --git a/shopee_crawler_engine/shopee_category_products.py 
diff --git a/shopee_crawler_engine/shopee_category_products.py b/shopee_crawler_engine/shopee_category_products.py
new file mode 100644
index 0000000..f9f3d56
--- /dev/null
+++ b/shopee_crawler_engine/shopee_category_products.py
@@ -0,0 +1,177 @@
+import hashlib
+import logging
+
+from selenium import webdriver
+from selenium.webdriver import ActionChains, Keys
+from selenium.webdriver.chrome.service import Service
+import psycopg2
+from selenium.webdriver.common.by import By
+import bs4
+from webdriver_manager.chrome import ChromeDriverManager
+import random
+from bs4 import BeautifulSoup
+import json
+import time
+
+class shopee_category_products:
+
+    def __init__(self, config):
+        self.config = config
+        self.crawler_name = self.config.get("crawler_name")
+        self.url = "https://shopee.co.id/"
+        self.product_limit = int(self.config.get("product_per_category"))
+        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
+        self.conn.autocommit = True
+        self.cur = self.conn.cursor()
+        sql = "delete from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='"+str(self.crawler_name)+"'"
+        self.cur.execute(sql)
+
+    def __del__(self):
+        print("Closing connection.....")
+        self.conn.close()
+
+    def browse_category_page(self):
+        op = webdriver.ChromeOptions()
+        height = str(random.randint(640,1280))
+        width = str(random.randint(1024,1920))
+        op.add_argument("window-size="+width+","+height+"")
+        op.add_experimental_option("useAutomationExtension", False)
+        op.add_argument('--no-sandbox')
+        op.add_argument('--disable-notifications')
+        op.add_argument("--lang=en-GB")
+        op.headless = True
+        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
+
+        driver.get("https://shopee.co.id")
+        time.sleep(5)
+        cat = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div[3]/div[2]/div[1]/div/div/div[2]/div/div[1]/ul/li[2]/div/a[2]/div')
+        ActionChains(driver).move_to_element(cat).double_click().perform()
+        time.sleep(10)
+        driver.execute_script("document.body.style.zoom='15%'")
+        time.sleep(10)
+
+        filters = driver.find_elements(By.CLASS_NAME, 'shopee-sort-by-options__option')
+        for filter in filters:
+            if filter.text == 'Terlaris':
+                logging.info("Sorting data by top sales.......")
+                driver.execute_script("arguments[0].click();", filter)
+                time.sleep(5)
+
+        lim = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div[1]/div[4]/div[2]/div/div[1]/div[2]/div/span[2]').text
+
+        cat = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div[1]/div[4]/div[1]/div[1]/div/div/div[1]/a').text
+        print("Collecting products for category: {}".format(str(cat)))
+        pg_cnt = 1
+        print("Collecting data for page: {}".format(str(pg_cnt)))
+
+        cnt = 0
+        skip = 0
+        cnt, skip = self.get_product(driver.page_source, cat, cnt, skip)
+        for i in range(int(lim)-1):
+            pg_cnt += 1
+            next = driver.find_element(By.XPATH,'/html/body/div[1]/div/div[2]/div/div[1]/div[4]/div[2]/div/div[1]/div[2]/button[2]')
+            driver.execute_script("arguments[0].click();", next)
+            time.sleep(5)
+            print("Collecting data for page: {}".format(str(pg_cnt)))
+            cnt, skip = self.get_product(driver.page_source, cat, cnt, skip)
+            if cnt >= self.product_limit:
+                break
+
+        more_cat = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div[4]/div[1]/div[1]/div/div/div[2]/div/div[1]/div')
+        driver.execute_script("arguments[0].click();", more_cat)
+
+        time.sleep(10)
elements = driver.find_elements(By.CLASS_NAME, 'shopee-category-list__sub-category') + + for element in elements: + driver.execute_script("arguments[0].click();", element) + time.sleep(5) + filters = driver.find_elements(By.CLASS_NAME, 'shopee-sort-by-options__option') + for filter in filters: + if filter.text == 'Terlaris': + logging.info("Sorting data by top sales.......") + driver.execute_script("arguments[0].click();", filter) + time.sleep(5) + lim = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div[3]/div[2]/div/div[1]/div[2]/div/span[2]').text + + + print("Collecting products for subcategory: {}".format(str(element.text))) + pg_cnt = 1 + print("Collecting data for page: {}".format(str(pg_cnt))) + + cnt = 0 + skip = 0 + cnt, skip = self.get_product(driver.page_source, element.text, cnt, skip) + for i in range(int(lim)-1): + pg_cnt += 1 + next = driver.find_element(By.XPATH,'/html/body/div[1]/div/div[2]/div/div/div[3]/div[2]/div/div[1]/div[2]/button[2]') + driver.execute_script("arguments[0].click();", next) + time.sleep(5) + print("Collecting data for page: {}".format(str(pg_cnt))) + cnt, skip = self.get_product(driver.page_source, element.text, cnt, skip) + if cnt >=self.product_limit: + break + + time.sleep(random.randint(20,35)) + + def get_product(self, page_source, cat, cnt_main, skip_main): + try: + #Fetch page source + data = page_source + time.sleep(5) + + #Fetch data from page source + try: + soup = bs4.BeautifulSoup(data,features="lxml") + all_product = soup.find_all('div',{'class':"col-xs-2-4 shopee-search-item-result__item"}) + + cnt = cnt_main + skip = skip_main + + for product in all_product: + + try: + + product_link_element = product.find('a') + product_page_url = product_link_element.get('href') + product_page_url = ("https://shopee.co.id"+product_page_url).replace("'","''") + product_page_url_hash = hashlib.md5(product_page_url.encode('utf-8')).hexdigest() + + + ids = ((product_page_url.split('-i.')[1]).split('?')[0]).split('.') + itemid = ids[1] + shopid = ids[0] + flag = 0 + + #print("itemid: {}; shopid: {}".format(str(itemid), str(shopid))) + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where itemid='"+itemid+"' and shopid='"+shopid+"'" + self.cur.execute(sql) + res = self.cur.fetchall() + + if not res: + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+"(crawler_name,keyword,shopid,itemid,product_page_url,product_page_url_hash,flag) values('"+str(self.crawler_name)+"','"+str(cat)+"',"+str(shopid)+","+str(itemid)+",'"+product_page_url+"','"+product_page_url_hash+"',"+str(flag)+")" + self.cur.execute(sql) + + cnt += 1 + if cnt >=self.product_limit: + break + + #conn.commit() + else: + #print("Already collected. Skipping") + skip += 1 + except Exception as e: + print("ERROR: {}".format(str(e))) + + print("Total Items: {}\nTotal Collected: {}\nTotal Skipped: {}".format(str(len(all_product)),str(cnt), str(skip))) + return cnt, skip + except Exception as e: + print("Error: {}".format(str(e))) + + except: + print("ERROR: Data cannot be collected.") +
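A note on the id parsing in get_product above: Shopee product URLs embed both ids as ...-i.<shopid>.<itemid>, so the chained splits recover them positionally. A worked illustration with a made-up URL:

    url = "https://shopee.co.id/Contoh-Produk-i.12345.67890?sp_atk=abc"
    ids = ((url.split('-i.')[1]).split('?')[0]).split('.')
    shopid, itemid = ids[0], ids[1]   # -> '12345', '67890'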
cur.execute(sql) + if not slave01: + sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave01',0)" + cur.execute(sql) + else: + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'" + cur.execute(sql) + + + if not slave02: + sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave02',0)" + cur.execute(sql) + else: + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'" + cur.execute(sql) + + get_sub_category() + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=1 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'" + cur.execute(sql) + + get_category_products(cur, slave01, slave02) + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'" + cur.execute(sql) + + get_products_info() + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'" + cur.execute(sql) + else: + if res[2]==0: + if not slave01: + sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave01',0)" + cur.execute(sql) + else: + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'" + cur.execute(sql) + + + if not slave02: + sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave02',0)" + cur.execute(sql) + else: + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'" + cur.execute(sql) + + + get_sub_category() + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=1 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'" + cur.execute(sql) + + get_category_products(cur, slave01, slave02) + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'" + cur.execute(sql) + + get_products_info() + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'" + cur.execute(sql) + elif res[2]==1: + get_category_products(cur, slave01, slave02) + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'" + cur.execute(sql) + + get_products_info() + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'" + cur.execute(sql) + elif res[2]==2: + get_products_info() + sql = "update 
"+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'" + cur.execute(sql) + elif res[2]==3: + + if slave01[2]==2 and slave02[2]==2: + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'" + cur.execute(sql) + main() + else: + logging.info("Slaves are working.....") + + conn.close() + + conn.close() + +def crawler_slave1(): + conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port')) + conn.autocommit = True + cur = conn.cursor() + + sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'" + cur.execute(sql) + res = cur.fetchone() + + if res: + if res[2]==1: + get_products_info() + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'" + cur.execute(sql) + else: + logging.info("Slave02 or Master are working.....") + + + + conn.close() + +def crawler_slave2(): + conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port')) + conn.autocommit = True + cur = conn.cursor() + + sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'" + cur.execute(sql) + res = cur.fetchone() + + if res: + if res[2]==1: + get_products_info() + sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'" + cur.execute(sql) + else: + logging.info("Slave01 or Master are working.....") + + conn.close() + +if __name__ == "__main__": + logging.info("Starting Shopee Crawler.......") + try: + logging.info("Loading config file.......") + with open("conf.json", "r") as jsonfile: + config = json.load(jsonfile) + logging.info("Config file loaded.......") + + main() + + except Exception as e: + #logging.info("Error: ".format(e)) + logging.info("Cannot load cofig file. Please check. 
Exiting......") + exit(1) + + + + diff --git a/shopee_crawler_engine/shopee_db_writer.py b/shopee_crawler_engine/shopee_db_writer.py new file mode 100755 index 0000000..55eff14 --- /dev/null +++ b/shopee_crawler_engine/shopee_db_writer.py @@ -0,0 +1,587 @@ +import logging +import psycopg2 + +###### Looger ###### +format = "%(asctime)s: %(message)s" +logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") + +class shopee_db_writer: + def __init__(self, config): + self.config = config + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + + def __del__(self): + logging.info("Closing connection.....") + self.conn.close() + + def rce_category(self, data): + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where rce_source_category_id = "+str(data['rce_source_category_id']) + self.cur.execute(sql) + res = self.cur.fetchone() + + cat_name = data['category_name'].replace("'","''") + cat_url = data['category_page_url'].replace("'","''") + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" (parent_category_id,rce_source_id," \ + "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name) values (" \ + +str(data['parent_category_id'])+","+str(data['rce_source_id'])+", "+str(data['rce_source_category_id'])+", "+str(data['rce_source_status'])+", " \ + "'"+str(cat_url)+"', '"+str(data['category_page_url_hash'])+"', '"+str(cat_name)+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \ + "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \ + "select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \ + "category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \ + "where rce_source_category_id = "+ str(data['rce_source_category_id']) + #logging.info(sql) + + self.cur.execute(sql) + + else: + if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \ + str(data['category_page_url'])==str(res[5]): + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set updatedat=now() " \ + "where rce_source_category_id = "+ str(res[3]) + logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set parent_category_id = " \ + ""+str(data['parent_category_id'])+", rce_source_category_id = "+str(data['rce_source_category_id'])+", " \ + "category_name='"+str(cat_name)+"', category_page_url='"+str(cat_url)+"', " \ + "category_page_url_hash='"+str(data['category_page_url_hash'])+"', updatedat=now() where " \ + 
"rce_source_category_id = "+ str(res[3]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \ + "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \ + "select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \ + "category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \ + "where rce_source_category_id = "+ str(res[3]) + #logging.info(sql) + + self.cur.execute(sql) + + def rce_product(self, data): + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_id = "+str(data['rce_source_product_id']) + self.cur.execute(sql) + res = self.cur.fetchone() + + data['product_page_url'] = data['product_page_url'].replace("'","''") + data['rce_source_product_name'] = data['rce_source_product_name'].replace("'","''") + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" (rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "ships_from) values("+str(data['rce_source_product_id'])+","+str(data['rce_source_product_status'])+",'"+str(data['product_page_url'])+"'," \ + "'"+str(data['product_page_url_hash'])+"',"+str(data['rce_category_id'])+","+str(data['rce_brand_id'])+","+str(data['rce_store_id'])+"," \ + "'"+str(data['rce_source_product_name'])+"','"+str(data['product_images'])+"','"+str(data['product_description'])+"',"+str(data['product_sold_total'])+"," \ + ""+str(data['product_sold'])+",'"+str(data['product_price_min'])+"','"+str(data['product_price_min_before_discount'])+"','"+str(data['product_price_max'])+"'," \ + "'"+str(data['product_price_max_before_discount'])+"','"+str(data['ratings'])+"','"+str(data['ships_from'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "ships_from,createdat,updatedat) select id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "ships_from,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \ + "rce_source_product_id="+str(data['rce_source_product_id'])+"" + #logging.info(sql) + self.cur.execute(sql) + else: + + if str(data['rce_source_product_id'])==str(res[1]) and 
str(data['rce_source_product_status'])==str(res[2]) and \ + str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \ + str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \ + str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \ + str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \ + str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) and \ + str(data['ships_from'])==str(res[18]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set updatedat=now() " \ + "where rce_source_product_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set rce_source_product_id="+str(data['rce_source_product_id'])+"," \ + "rce_source_product_status="+str(data['rce_source_product_status'])+",product_page_url='"+str(data['product_page_url'])+"',product_page_url_hash= " \ + "'"+str(data['product_page_url_hash'])+"',rce_category_id="+str(data['rce_category_id'])+",rce_brand_id="+str(data['rce_brand_id'])+"," \ + "rce_store_id="+str(data['rce_store_id'])+",rce_source_product_name='"+str(data['rce_source_product_name'])+"',product_images='"+str(data['product_images'])+"'" \ + ",product_description='"+str(data['product_description'])+"',product_sold_total="+str(data['product_sold_total'])+",product_sold="+str(data['product_sold'])+"," \ + "product_price_min='"+str(data['product_price_min'])+"',product_price_min_before_discount='"+str(data['product_price_min_before_discount'])+"'," \ + "product_price_max='"+str(data['product_price_max'])+"',product_price_max_before_discount='"+str(data['product_price_max_before_discount'])+"',ratings='"+str(data['ratings'])+"'," \ + "ships_from='"+str(data['ships_from'])+"', updatedat=now() where rce_source_product_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "ships_from,createdat,updatedat) select id,rce_source_product_id," \ + "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \ + "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \ + "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \ + "ships_from,createdat,updatedat from 
"+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \ + "rce_source_product_id="+str(res[1])+"" + #logging.info(sql) + self.cur.execute(sql) + + + def rce_product_variant(self, data): + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where rce_source_variant_id = "+str(data['rce_source_variant_id']) + self.cur.execute(sql) + res = self.cur.fetchone() + + data['product_variant_name'] = data['product_variant_name'].replace("'","''") + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" (rce_source_variant_id,rce_product_id," \ + "product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock) values("+str(data['rce_source_variant_id'])+"," \ + ""+str(data['rce_product_id'])+",'"+str(data['product_variant_name'])+"','"+str(data['product_variant_price'])+"'," \ + "'"+str(data['product_variant_price_before_discount'])+"',"+str(data['product_variant_stock'])+")" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \ + "product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where rce_source_variant_id="+str(data['rce_source_variant_id'])+"" + #logging.info(sql) + self.cur.execute(sql) + + else: + if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \ + str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set updatedat=now() " \ + "where rce_source_variant_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set rce_source_variant_id="+str(data['rce_source_variant_id'])+", " \ + "rce_product_id="+str(data['rce_product_id'])+", product_variant_name='"+str(data['product_variant_name'])+"', product_variant_price=" \ + "'"+str(data['product_variant_price'])+"',product_variant_price_before_discount='"+str(data['product_variant_price_before_discount'])+"'," \ + "product_variant_stock="+str(data['product_variant_stock'])+", updatedat=now() where rce_source_variant_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \ + "product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where rce_source_variant_id="+str(res[1])+"" + #logging.info(sql) + + self.cur.execute(sql) + + + def rce_brand(self, data): + sql = "select * from 
"+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id = "+str(data['rce_source_brand_id']) + self.cur.execute(sql) + res = self.cur.fetchone() + + data['brand_page_url'] = data['brand_page_url'].replace("'","''") + data['brand_name'] = data['brand_name'].replace("'","''") + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" (rce_source_id,rce_source_brand_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name) values("+str(data['rce_source_id'])+","+str(data['rce_source_brand_id'])+"," \ + ""+str(data['rce_source_brand_status'])+",'"+str(data['brand_page_url'])+"','"+str(data['brand_page_url_hash'])+"'," \ + "'"+str(data['brand_name'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id="+str(data['rce_source_brand_id'])+"" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_brand_id'])==str(res[2]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \ + str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set updatedat=now() " \ + "where rce_source_brand_id = "+ str(res[2]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set rce_source_id="+str(data['rce_source_id'])+", rce_source_brand_id="+str(data['rce_source_brand_id'])+", " \ + "rce_source_brand_status="+str(data['rce_source_brand_status'])+", brand_page_url='"+str(data['brand_page_url'])+"', brand_page_url_hash=" \ + "'"+str(data['brand_page_url_hash'])+"',brand_name='"+str(data['brand_name'])+"', updatedat=now() where rce_source_brand_id = "+ str(res[2]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_id,rce_source_brand_status," \ + "brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select * from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id="+str(res[2])+"" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_reseller(self, data): + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id = "+str(data['rce_source_reseller_id']) + self.cur.execute(sql) + res = self.cur.fetchone() + + data['reseller_name'] = data['reseller_name'].replace("'","''") + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" 
(rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate) values("+str(data['rce_source_id'])+","+str(data['rce_source_reseller_id'])+"," \ + ""+str(data['rce_source_reseller_status'])+",'"+str(data['reseller_name'])+"','"+str(data['reseller_average_rating'])+"'," \ + ""+str(data['reseller_follower_count'])+",'"+str(data['reseller_response_rate'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat) select id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id="+str(data['rce_source_reseller_id'])+"" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_reseller_id'])==str(res[2]) and str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]) and \ + str(data['reseller_average_rating'])==str(res[5]) and str(data['reseller_follower_count'])==str(res[7]) and str(data['reseller_response_rate'])==str(res[8]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=now() " \ + "where rce_source_reseller_id = "+ str(res[2]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+",rce_source_reseller_id="+str(data['rce_source_reseller_id'])+", " \ + "rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \ + "'"+str(data['reseller_average_rating'])+"',reseller_follower_count='"+str(data['reseller_follower_count'])+"', reseller_response_rate=" \ + "'"+str(data['reseller_response_rate'])+"', updatedat=now() where rce_source_reseller_id = "+ str(res[2]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat) select id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id="+str(res[2]) + #logging.info(sql) + + self.cur.execute(sql) + + def rce_reseller_store(self, data): + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id = "+str(data['rce_source_store_id']) + self.cur.execute(sql) + 
res = self.cur.fetchone() + + data['store_page_url'] = data['store_page_url'].replace("'","''") + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" (rce_source_store_id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id) values("+str(data['rce_source_store_id'])+"," \ + ""+str(data['rce_source_store_status'])+",'"+str(data['store_page_url'])+"','"+str(data['store_page_url_hash'])+"'," \ + "'"+str(data['store_location'])+"', "+str(data['rce_reseller_id'])+")" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat) select id,rce_source_store_id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id="+str(data['rce_source_store_id'])+"" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_store_id'])==str(res[1]) and str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \ + str(data['store_page_url_hash'])==str(res[4]) and str(data['store_location'])==str(res[5]) and str(data['rce_reseller_id'])==str(res[6]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=now() " \ + "where rce_source_store_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set rce_source_store_id="+str(data['rce_source_store_id'])+", " \ + "rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \ + "'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \ + "updatedat=now() where rce_source_store_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat) select id,rce_source_store_id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id="+str(res[1])+"" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_ratings_reviews(self, data): + + data['username'] = data['username'].replace("'","''") + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id = "+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() +
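+ # The replace("'","''") calls in this method hand-escape values before they
+ # are concatenated into SQL. psycopg2's parameter binding would drop both the
+ # escaping and the injection risk; a sketch only (schema and review_tab stand
+ # in for the self.config.get(...) lookups), not the API this writer uses:
+ #   self.cur.execute(
+ #       "select * from " + schema + "." + review_tab +
+ #       " where rce_product_id = %s and username = %s",
+ #       (data['rce_product_id'], data['username']))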
data['img_url'] = data['img_url'].replace("'","''") + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" (id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating) values("+str(data['id'])+","+str(data['rce_product_id'])+"," \ + "'"+str(data['username'])+"','"+str(data['review'])+"','"+str(data['img_url'])+"',"+str(data['review_like_count'])+",'"+str(data['user_tier'])+"'," \ + ""+str(data['shop_id'])+", '"+str(data['video_url'])+"', '"+str(data['rating'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \ + str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \ + str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]): + + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=now() " \ + "where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \ + "username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \ + "'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \ + "shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=now() " \ + "where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \ + "review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(res[1])+" and username ='"+str(data['username'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_ratings_reviews_productmodels(self,data): + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id']) + self.cur.execute(sql) + res = 
self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \ + "values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \ + "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+"" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=now() " \ + "where rce_rating_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \ + "updatedat=now() where rce_rating_id = "+ str(res[1]) + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \ + "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+"" + #logging.info(sql) + + self.cur.execute(sql) + + + def rce_tags(self,data): + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \ + "values("+str(data['id'])+",'"+str(data['description'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \ + "createdat,updatedat) select id,description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['description'])==str(res[1]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=now() " \ + "where description = '"+ str(res[1])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \ + "updatedat=now() where description = '"+ str(res[1])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \ + "createdat,updatedat) select id,description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'" + #logging.info(sql) + + self.cur.execute(sql) +
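+ # Every method in this writer does a select followed by a branch into insert
+ # or update. On PostgreSQL 9.5+ the pair could collapse into one upsert; a
+ # hedged sketch for rce_tags above, assuming a unique constraint on
+ # description, which the current schema is not known to define:
+ #   self.cur.execute(
+ #       "insert into " + schema + "." + tags_tab + " (id, description)"
+ #       " values (%s, %s)"
+ #       " on conflict (description) do update set updatedat = now()",
+ #       (data['id'], data['description']))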
+ def rce_ratings_reviews_producttags(self,data): + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + + if not res: + + sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \ + "values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')" + #logging.info(sql) + + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \ + "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_rating_id'])==str(res[1]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=now() " \ + "where rce_rating_id = '"+ str(res[1])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \ + "updatedat=now() where rce_rating_id = '"+ str(res[1])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \ + "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(res[1])+"'" + #logging.info(sql) + + self.cur.execute(sql) + +
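shopee_products.py below never parses the rendered product page; it drives the URL through selenium-wire and lifts Shopee's internal JSON answers straight out of driver.requests, gunzipping bodies whenever the content-encoding header says so. The interception pattern in miniature, using the same endpoint as get_raw_product:

    for request in driver.requests:                  # selenium-wire records all traffic
        if request.response and '/api/v4/item/get?item' in request.url:
            body = request.response.body
            if request.response.headers.get('content-encoding'):
                body = gzip.decompress(body).decode()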
self.config.get("crawler_name") + self.pattern = r'[' + string.punctuation + ']' + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Shopee'") + self.rce_source_id = self.cur.fetchone()[0] + self.db_writer = shopee_db_writer(config) + + def __del__(self): + print("Closing connection.....") + self.conn.close() + + def get_raw_product(self, url): + op = webdriver.ChromeOptions() + hight = str(random.randint(640,1280)) + width = str(random.randint(1024,1920)) + op.add_argument("window-size="+width+","+hight+"") + op.add_experimental_option("useAutomationExtension", False) + op.add_argument('--no-sandbox') + op.add_argument('--disable-notifications') + op.add_argument("--lang=en-GB") + op.add_argument("--log-level=3") + op.headless = True + driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op) + + driver.get(url) + time.sleep(10) + + iteminfo = "" + shopinfo = "" + ratinginfo = "" + + try: + + for request in driver.requests: + if request.response: + if '/api/v4/item/get?item' in request.url: + encoding = request.response.headers.get('content-encoding') + if encoding: + iteminfo = gzip.decompress(request.response.body).decode() + else: + iteminfo = request.response.body + if '/api/v4/product/get_shop_info?shopid' in request.url: + encoding = request.response.headers.get('content-encoding') + if encoding: + shopinfo = gzip.decompress(request.response.body).decode() + else: + shopinfo = request.response.body + if '/api/v2/item/get_ratings' in request.url: + if encoding: + ratinginfo = gzip.decompress(request.response.body).decode() + else: + ratinginfo = request.response.body + except: + pass + + driver.close() + + return iteminfo, shopinfo, ratinginfo + + + def product_info(self, data_item, item): + + ### rce_brand + + data_brand = {} + + data_brand['rce_source_id'] = self.rce_source_id + data_brand['rce_source_brand_id'] = "" + data_brand['rce_source_brand_status'] = 1 + data_brand['brand_page_url'] = "" + data_brand['brand_page_url_hash'] = "" + data_brand['brand_name'] = "" + + try: + data_brand['rce_source_brand_id'] = data_item['data']['brand_id'] + data_brand['brand_page_url'] = "https://shopee.co.id/search?brands=" + str(data_item['data']['brand_id']) + data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest() + + try: + brand_name = data_item['data']['brand'] + data_brand['brand_name'] = re.sub(self.pattern, '', brand_name) + except: pass + + self.db_writer.rce_brand(data_brand) + + except: pass + + ### rce_product + + data_product = {} + + data_product['rce_source_product_id'] = item[3] #itemid + data_product['rce_source_product_status'] = 1 + data_product['product_page_url'] = item[4] #product page url + data_product['product_page_url_hash'] = item[5] #product page url hash + data_product['rce_category_id'] = "" + data_product['rce_brand_id'] = "" + data_product['rce_store_id'] = "" + data_product['rce_source_product_name'] = "" + data_product['product_images'] = "" + data_product['product_description'] = "" + data_product['product_sold_total'] = "" + data_product['product_sold'] = "" + data_product['product_price_min'] = "" + 
data_product['product_price_min_before_discount'] ="" + data_product['product_price_max'] = "" + data_product['product_price_max_before_discount'] = "" + data_product['ratings'] = "" + data_product['ships_from'] = "" + + try: + keyword = item[1] + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where lower(category_name) = lower('"+keyword+"')" + self.cur.execute(sql) + data_product['rce_category_id'] = self.cur.fetchone()[0] + except: pass + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id = "+str(data_brand['rce_source_brand_id']) + self.cur.execute(sql) + data_product['rce_brand_id'] = self.cur.fetchone()[0] + except: pass + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id = "+str(item[2])+"" + self.cur.execute(sql) + data_product['rce_store_id'] = self.cur.fetchone()[0] + except: pass + + try: + rce_source_product_name = data_item['data']['name'] + data_product['rce_source_product_name'] = str(re.sub(self.pattern, '', rce_source_product_name)) + except: pass + + try: + product_images = str(data_item["data"]["images"]) + data_product['product_images'] = str(product_images.replace("'","")) + except: pass + + try: + product_description = str(data_item["data"]["description"]) + data_product['product_description'] = str(re.sub(self.pattern, '', product_description)) + except: pass + + try: data_product['product_sold_total'] = str(data_item["data"]["historical_sold"]) + except: pass + + try: data_product['product_sold'] = str(data_item["data"]["sold"]) + except: pass + + try: data_product['product_price_min'] = str(data_item["data"]["price_min"]) + except: pass + + try: data_product['product_price_min_before_discount'] = str(data_item["data"]["price_min_before_discount"]) + except: pass + + try: data_product['product_price_max'] = str(data_item["data"]["price_max"]) + except: pass + + try: data_product['product_price_max_before_discount'] = str(data_item["data"]["price_max_before_discount"]) + except: pass + + try: data_product['ratings'] = str(data_item["data"]["item_rating"]["rating_star"]) + except: pass + + try: data_product['ships_from'] = str(data_item["data"]["shop_location"]) + except: pass + + self.db_writer.rce_product(data_product) + + ### rce_product_variant + data_variant = {} + + data_variant['rce_source_variant_id'] = "" + data_variant['rce_product_id'] = "" + data_variant['product_variant_name'] = "" + data_variant['product_variant_price'] = "" + data_variant['product_variant_price_before_discount'] = "" + data_variant['product_variant_stock'] = "" + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_id = "+str(data_product['rce_source_product_id']) + self.cur.execute(sql) + data_variant['rce_product_id'] = self.cur.fetchone()[0] + except: + pass + + try: + MODELS = data_item["data"]["models"] + + for i in MODELS: + try: + data_variant['rce_source_variant_id'] = str(i["modelid"]) + + try: + product_variant_name = str(i["name"]) + data_variant['product_variant_name'] = re.sub(self.pattern, '', product_variant_name) + except: pass + + try: data_variant['product_variant_price'] = str(i["price"]) + except: pass + + try: data_variant['product_variant_price_before_discount'] = str(i["price_before_discount"]) + except: pass + + try: data_variant['product_variant_stock'] = str(i["stock"]) + 
except: pass + + self.db_writer.rce_product_variant(data_variant) + + except: pass + except: pass + + + + + def reseller_info(self, data_shop, item): + + data_reseller = {} + + data_reseller['rce_source_id'] = self.rce_source_id + data_reseller['rce_source_reseller_id'] = "" + data_reseller['rce_source_reseller_status'] = 1 + data_reseller['reseller_name'] = "" + data_reseller['reseller_average_rating'] = "" + data_reseller['reseller_follower_count'] = "" + data_reseller['reseller_response_rate'] = "" + + + try: + data_reseller['rce_source_reseller_id'] = str(data_shop["data"]["userid"]) + except: pass + + try: + reseller_name = str(data_shop["data"]["name"]) + data_reseller['reseller_name'] = re.sub(self.pattern, '', reseller_name) + except: pass + + try: data_reseller['reseller_average_rating'] = str(data_shop["data"]["rating_star"]) + except: pass + + try: data_reseller['reseller_follower_count'] = str(data_shop["data"]["follower_count"]) + except: pass + + try: data_reseller['reseller_response_rate'] = str(data_shop["data"]["response_rate"]) + except: pass + + self.db_writer.rce_reseller(data_reseller) + + + data_reseller_store = {} + + data_reseller_store['rce_source_store_id'] = item[2] + data_reseller_store['rce_source_store_status'] = 1 + data_reseller_store['store_page_url'] = "" + data_reseller_store['store_page_url_hash'] = "" + data_reseller_store['store_location'] = "" + data_reseller_store['rce_reseller_id'] = "" + + try: + username = str(data_shop["data"]["account"]["username"]) + data_reseller_store['store_page_url'] = "https://shopee.co.id/"+username + data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest() + except: + pass + + try: data_reseller_store['store_location'] = str(data_shop["data"]["shop_location"]) + except: pass + + try: + self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id = "+str(data_reseller['rce_source_reseller_id'])) + rce_reseller_id = self.cur.fetchone() + data_reseller_store['rce_reseller_id'] = rce_reseller_id[0] + except: + pass + + + self.db_writer.rce_reseller_store(data_reseller_store) + + + def rating_info(self, data_rating, item): + + sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab') + self.cur.execute(sql) + rating_id = self.cur.fetchone() + + if rating_id[0]==None: + rating_id = 1 + else: + rating_id = int(rating_id[0]) + 1 + + for data in data_rating['data']['ratings']: + + data_review = {} + + data_review["id"] = rating_id + data_review["rce_product_id"] = "" + data_review["username"] = "" + data_review["review"] = "" + data_review["img_url"] = "" + data_review["review_like_count"] = "" + data_review["user_tier"] = "" + data_review["shop_id"] = item[2] + data_review["video_url"] = "" + data_review["rating"] = "" + + try: + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_id = "+str(item[3]) + self.cur.execute(sql) + data_review["rce_product_id"] = self.cur.fetchone()[0] + except: pass + + try: data_review["username"] = str(data['author_username']) + except: pass + + try: + review = str(data['comment']) + review = review.replace(",", " ") + review = review.replace("'", " ") + + comments = list(review.split("\n")) + + for comment_items in range(len(comments)): + temp_comment = re.sub('[^a-zA-Z0-9\: ]([a-zA-Z\:]+)?\s{0,2}[^a-zA-Z0-9\: ]?', ' ', comments[comment_items]) + if not 
re.match('[A-Za-z0-9\s*]*\s*(\:)\s*[A-Za-z0-9\s*]*', temp_comment): + data_review["review"] = data_review["review"] + (comments[comment_items]) + except: pass + + try: data_review["img_url"] = str(data['images']).replace("'","").replace("[","").replace("]","") + except: pass + + try: + if data['like_count']: + data_review["review_like_count"] = str(data['like_count']) + else: + data_review["review_like_count"]=0 + except: pass + + try: data_review["user_tier"] = str(data['loyalty_info']['tier_text']) + except: pass + + try: + rce_video_url = [] + for urls in data["videos"]: + rce_video_url.append(urls["url"]) + + data_review["video_url"] = str(",".join(rce_video_url)) + except: pass + + try: data_review["rating"] = str(data['rating_star']) + except: pass + + self.db_writer.rce_ratings_reviews(data_review) + + sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where id="+str(data_review['id']) + + self.cur.execute(sql) + res = self.cur.fetchall() + if res: + + data_review_product_model = {} + + data_review_product_model["rce_rating_id"] = rating_id + data_review_product_model["model_id"] = "" + + try: + product_models = [] + for models in data["product_items"]: + product_models.append(models["modelid"]) + + data_review_product_model["model_id"] = str(product_models).replace("[","").replace("]","") + + self.db_writer.rce_ratings_reviews_productmodels(data_review_product_model) + + except: pass + + + if data['tags']: + rce_tags_list = [] + for tags in data["tags"]: + + sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab') + self.cur.execute(sql) + tag_id = self.cur.fetchone() + + if tag_id[0]==None: + tag_id = 1 + else: + tag_id = int(tag_id[0]) + 1 + + data_tags = {} + + data_tags['id'] = tag_id + data_tags['description'] = tags["tag_description"] + + self.db_writer.rce_tags(data_tags) + + rce_tags_list.append(tags["tag_description"]) + + rce_tags_list = str(rce_tags_list).replace('[','').replace(']','') + + + tags_id_query = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description in (" + str(rce_tags_list) + ")" + self.cur.execute(tags_id_query) + tags_id_query = self.cur.fetchall() + rce_tag_ids = str(tags_id_query) + rce_tag_ids = rce_tag_ids.replace("[", "") + rce_tag_ids = rce_tag_ids.replace("]", "") + rce_tag_ids = rce_tag_ids.replace("(", "") + rce_tag_ids = rce_tag_ids.replace(")", "") + rce_tag_ids = rce_tag_ids.replace(",,", ",") + rce_tag_ids = rce_tag_ids.rstrip(",") + + data_review_product_tags = {} + + data_review_product_tags['rce_rating_id'] = rating_id + data_review_product_tags['tag_ids'] = rce_tag_ids + + self.db_writer.rce_ratings_reviews_producttags(data_review_product_tags) + + + + def get_shopee_products(self): + + crawler_main = int(self.config.get('crawler_main')) + crawler_slave_no = int(self.config.get('crawler_slave_no')) if self.config.get('crawler_slave_no') else None + + sql = None + + if crawler_main: + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where flag=0 and crawler_name='"+self.config.get('crawler_name')+"' " \ + "and keyword in ('Perawatan & Kecantikan','Perawatan Tubuh','Perawatan Tangan','Perawatan Kaki','Perawatan Kuku','Perawatan Rambut','Perawatan Pria'," \ + "'Parfum & Wewangian') order by id" + else: + if crawler_slave_no == 1: + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where flag=0 and 
crawler_name='"+self.config.get('crawler_name')+"' " \ + "and keyword in ('Kosmetik Wajah','Kosmetik Mata','Kosmetik Bibir','Pembersih Make Up','Aksesoris Make Up','Alat Perawatan Wajah','Alat Pelangsing Tubuh') order by id" + elif crawler_slave_no ==2: + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where flag=0 and crawler_name='"+self.config.get('crawler_name')+"' " \ + "and keyword in ('Alat Penghilang Bulu Rambut','Alat Rambut','Perawatan Wajah','Treatment Mata','Treatment Bibir','Paket & Set Kecantikan','Kecantikan Lainnya') order by id" + + + if sql: + self.cur.execute(sql) + + items = self.cur.fetchall() + + logging.info("Total Item found: {}".format(str(len(items)))) + + for item in items: + self.crawl_shopee_products(item) + time.sleep(random.randint(15,25)) + else: + logging.info("SQL not generated. Please check if Master or Slaves are working correctly.") + sys.exit(1) + + + + def crawl_shopee_products(self,item, flag=0): + + logging.info("Collecting info for itemid="+str(item[3])+" and shopid="+str(item[2])) + + iteminfo, shopinfo, ratinginfo = self.get_raw_product(item[4]) + + try: + + data_item = json.loads(iteminfo) + data_shop = json.loads(shopinfo) + data_rating = json.loads(ratinginfo) + + X = None + Y = None + Z = None + try : X = data_item["data"] + except: pass + try : Y = data_shop["data"] + except: pass + try : Z = data_rating["data"] + except: pass + + if not X or not Y or not Z: + if flag == 0: + print("Data is NULL. Retrying..... Itemid: {}, Shopid: {}".format(str(item[3]),str(item[2]))) + self.crawl_shopee_products(item, flag=1) + else: + print("Data is NULL. Skipping") + pass + else: + try: + self.reseller_info(data_shop,item) + except Exception as e: + logging.info("Reseller info: "+ str(e)) + pass + + try: + self.product_info(data_item,item) + except Exception as e: + logging.info("Product info: "+ str(e)) + pass + + try: + self.rating_info(data_rating,item) + except Exception as e: + logging.info("Rating info: "+ str(e)) + + except Exception as e: + logging.info("Data not parsable..... 
Skipping....") + #self.crawl_shopee_products(item, flag=1) + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" set flag=1 where itemid="+str(item[3])+" and shopid="+str(item[2])+" and crawler_name='"+self.config.get('crawler_name')+"'" + logging.info(sql) + self.cur.execute(sql) + + diff --git a/shopee_crawler_engine/shopee_sub_categories.py b/shopee_crawler_engine/shopee_sub_categories.py new file mode 100644 index 0000000..d81187e --- /dev/null +++ b/shopee_crawler_engine/shopee_sub_categories.py @@ -0,0 +1,107 @@ +import hashlib +from selenium import webdriver +from selenium.webdriver.chrome.service import Service +import psycopg2 +from webdriver_manager.chrome import ChromeDriverManager +import random +from bs4 import BeautifulSoup +import json +import logging +from shopee_db_writer import shopee_db_writer + +###### Looger ###### +format = "%(asctime)s: %(message)s" +logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") + +class shopee_sub_categories: + def __init__(self, config): + logging.info("Loading Sub Categories of Beauty & Care.........") + self.config = config + self.url = "https://shopee.co.id/api/v4/pages/get_category_tree" + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Shopee'") + try : self.rce_source_id = self.cur.fetchone()[0] + except: + logging.info("Source tab is empty. Please check. Exiting.....") + exit(1) + self.db_writer = shopee_db_writer(config) + + def __del__(self): + logging.info("Closing connection.....") + self.conn.close() + + def get_sub_categories(self): + op = webdriver.ChromeOptions() + hight = str(random.randint(640,1280)) + width = str(random.randint(1024,1920)) + op.add_argument("window-size="+width+","+hight+"") + op.add_experimental_option("useAutomationExtension", False) + op.add_argument('--no-sandbox') + op.add_argument('--disable-notifications') + op.headless = True + driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op) + + driver.get(self.url) + + self.page_source = driver.page_source + + self.parse() + + def parse(self): + soup = BeautifulSoup(self.page_source,features="html.parser") + + all_cat = json.loads(soup.body.text)['data']['category_list'] + + for cat in all_cat: + if cat['catid'] == int(self.config.get('source_category')): + self.sub_cats = cat['children'] + data = {} + data['parent_category_id'] = cat['parent_catid'] + data['rce_source_id'] = self.rce_source_id + data['rce_source_category_id'] = cat['catid'] + data['rce_source_status'] = 1 + data['category_name'] = cat['display_name'] + data['category_page_url'] = self.get_url(name=data['category_name'], pcatid=data['rce_source_category_id']) + data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest() + self.db_writer.rce_category(data) + + self.process_sub_categories() + + + def process_sub_categories(self): + + for sub_cat in self.sub_cats: + data = {} + data['parent_category_id'] = sub_cat['parent_catid'] + data['rce_source_id'] = self.rce_source_id + data['rce_source_category_id'] = sub_cat['catid'] + data['rce_source_status'] = 1 + data['category_name'] = sub_cat['display_name'] + 
+            data['category_page_url'] = self.get_url(name=data['category_name'], pcatid=data['parent_category_id'], ccatid=data['rce_source_category_id'])
+            data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
+            self.db_writer.rce_category(data)
+
+    def get_url(self, name, pcatid=None, ccatid=None):
+        uri = name.split('& ')
+        uri = ''.join(uri)
+        uri = uri.split(' ')
+        uri = '-'.join(uri)
+
+        url = 'https://shopee.co.id/' + uri
+
+        if not ccatid:
+            url = url + '-cat.' + str(pcatid)
+        else:
+            url = url + '-cat.' + str(pcatid) + '.' + str(ccatid)
+
+        return url
diff --git a/tokopedia_crawler_engine/.gitignore b/tokopedia_crawler_engine/.gitignore
new file mode 100644
index 0000000..592dcee
--- /dev/null
+++ b/tokopedia_crawler_engine/.gitignore
@@ -0,0 +1 @@
+conf.json
\ No newline at end of file
diff --git a/tokopedia_crawler_engine/Readme.md b/tokopedia_crawler_engine/Readme.md
new file mode 100644
index 0000000..bad6730
--- /dev/null
+++ b/tokopedia_crawler_engine/Readme.md
@@ -0,0 +1,13 @@
+### Run: ###
+* Run "python tokopedia_crawler.py"
+
+### Configuration: ###
+* Ensure that the tables have already been created.
+* cp conf.json.sample conf.json
+* Install the Zyte certificate - https://docs.zyte.com/smart-proxy-manager/next-steps/fetching-https-pages-with-smart-proxy.html#fetching-https-pages-with-smart-proxy
+
+### Notes: ###
+* A cronjob can be set up to run the 'Master' every minute.
+* It is expected to capture all product URLs in ~107 minutes.
+* It makes only 2 API calls per minute (3 in the first minute) to prevent IP blocking.
+* Any number of slaves can be added.
\ No newline at end of file
diff --git a/tokopedia_crawler_engine/conf.json.sample b/tokopedia_crawler_engine/conf.json.sample
new file mode 100644
index 0000000..4c9a23c
--- /dev/null
+++ b/tokopedia_crawler_engine/conf.json.sample
@@ -0,0 +1,28 @@
+{
+    "crawler_name": "raena_crawler_engine_tokopedia",
+    "crawler_target": "Tokopedia",
+    "crawler_target_url": "https://www.tokopedia.com/",
+    "crawler_schema": "raena_spider_management",
+    "category_tab": "rce_category",
+    "tracker_tab": "crawler_tracker",
+    "product_tab": "rce_product",
+    "variant_tab": "rce_product_variant",
+    "brand_tab": "rce_brand",
+    "reseller_tab": "rce_reseller",
+    "reseller_store_tab": "rce_reseller_store",
+    "review_tab": "rce_ratings_reviews",
+    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
+    "review_producttags_tab": "rce_ratings_reviews_producttags",
+    "review_tags": "rce_tags",
+    "source_tab": "rce_source",
+    "product_per_category": "120",
+    "source_category": "61",
+    "proxy_url": "http://59e7e01ebdf54a6585c7db8824efa1e8:@proxy.crawlera.com:8011/",
+    "db_user": "",
+    "db_pass": "",
+    "database": "raena_db",
+    "db_host": "localhost",
+    "db_port": "5432",
+    "crawler_main": "1",
+    "crawler_slave_no": ""
+}
diff --git a/tokopedia_crawler_engine/tokopedia_api.py b/tokopedia_crawler_engine/tokopedia_api.py
new file mode 100644
index 0000000..e8f3ee0
--- /dev/null
+++ b/tokopedia_crawler_engine/tokopedia_api.py
@@ -0,0 +1,31 @@
+import requests
+from pathlib import Path
+from tokopedia_config import Config
+
+class api():
+    config = Config().get()
+
+    def post(self, url, payload):
+        try:
+            response = requests.post(url, payload)
+            return response.json()
+        except:
+            return []
+
+    def postProxy(self, url, payload, headers):
+        path = Path.cwd()
+        proxyUrl = self.config.get('proxy_url')
+        # print(data)
+        try:
+            response = requests.post(url,
+                data=payload,
+                headers=headers,
+                proxies={
+                    "http": proxyUrl,
+                    "https": proxyUrl,
+                },
+                verify=f'{path}/zyte-proxy-ca.crt'
+            )
+            return response.json()
+        except:
+            return []
diff --git a/tokopedia_crawler_engine/tokopedia_config.py b/tokopedia_crawler_engine/tokopedia_config.py
new file mode 100644
index 0000000..834e315
--- /dev/null
+++ b/tokopedia_crawler_engine/tokopedia_config.py
@@ -0,0 +1,25 @@
+import json
+from tokopedia_logger import logger
+
+class Config():
+    config = None
+
+    def __new__(cls, *args, **kw):
+        if not hasattr(cls, '_instance'):
+            orig = super(Config, cls)
+            cls._instance = orig.__new__(cls, *args, **kw)
+        return cls._instance
+
+    def __init__(self):
+        if not self.config:
+            try:
+                logger.info("Loading config file...")
+                with open("conf.json", "r") as jsonfile:
+                    self.config = json.load(jsonfile)
+                logger.info("Config file loaded.")
+            except Exception as e:
+                logger.error("Cannot load config file. Please check. Exiting......")
+                exit(1)
+
+    def get(self):
+        return self.config
diff --git a/tokopedia_crawler_engine/tokopedia_crawler.py b/tokopedia_crawler_engine/tokopedia_crawler.py
new file mode 100644
index 0000000..68049c5
--- /dev/null
+++ b/tokopedia_crawler_engine/tokopedia_crawler.py
@@ -0,0 +1,43 @@
+from tokopedia_logger import logger
+from tokopedia_db_writer import DB
+from tokopedia_config import Config
+from tokopedia_sub_categories import TokopediaSubCategories
+from tokopedia_db_migrations import db_migrations
+from tokopedia_product_list import ProductList
+from tokopedia_products import Products
+
+def checkSource():
+    config = Config().get()
+    table = config.get("crawler_schema") + "." + config.get("source_tab")
+    query = "select id from " + table + " where source_name='Tokopedia'"
+    data = DB().fetchone(query)
+    if not data:
+        logger.error("Please create source in " + table)
+        exit(1)
+
+def runMainCrawler():
+    db_migrations()
+    checkSource()
+    TokopediaSubCategories()
+    ProductList()
+
+def runSlaveCrawler():
+    config = Config().get()
+    try:
+        int(config.get('crawler_slave_no'))
+    except:
+        logger.error("Please set slave number")
+        exit(1)
+    Products()
+
+def main():
+    config = Config().get()
+    isMainCrawler = bool(int(config.get('crawler_main')))
+
+    if isMainCrawler:
+        runMainCrawler()
+    else:
+        runSlaveCrawler()
+
+if __name__ == "__main__":
+    main()
diff --git a/tokopedia_crawler_engine/tokopedia_db_migrations.py b/tokopedia_crawler_engine/tokopedia_db_migrations.py
new file mode 100644
index 0000000..8ecf0ca
--- /dev/null
+++ b/tokopedia_crawler_engine/tokopedia_db_migrations.py
@@ -0,0 +1,109 @@
+from tokopedia_logger import logger
+from tokopedia_db_writer import DB
+from tokopedia_config import Config
+
+class db_migrations():
+    config = Config().get()
+
+    def __init__(self):
+        logger.info('Running database migrations')
+        self.updateSource()
+        self.updateCategoryColumn()
+        self.alterCrawlerTracker()
+        self.alterProductTab()
+        self.alterResellerStoreTab()
+        logger.info('Database migrations completed')
+
+    def updateSource(self):
+        table = f'{self.config.get("crawler_schema")}.{self.config.get("source_tab")}'
+        target = self.config.get("crawler_target")
+        target_url = self.config.get("crawler_target_url")
+        query = f'''INSERT INTO {table} (source_name, source_main_url)
+            SELECT '{target}', '{target_url}'
+            WHERE
+                NOT EXISTS (
+                    SELECT id FROM {table} WHERE source_name = '{target}'
+                );'''
+        try:
+            DB().execute_query(query)
+        except:
+            logger.error(f'Problem while creating source in {table}')
+            exit(1)
+
+    def updateCategoryColumn(self):
+        table =
f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}' + aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("category_tab")}' + query = f'Alter table {table} ADD COLUMN IF NOT EXISTS category_slug character varying UNIQUE' + aud_query = f'Alter table {aud_table} ADD COLUMN IF NOT EXISTS category_slug character varying UNIQUE' + try: + DB().execute_query(query) + DB().execute_query(aud_query) + except: + logger.error(f'Problem while updating column in {table}') + exit(1) + + def alterCrawlerTracker(self): + table = f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}' + query = f''' + ALTER TABLE {table} + ADD CONSTRAINT unique_product_page_url UNIQUE (product_page_url); + ''' + try: + DB().execute_query(query) + except: + # This might be the reason of a silent error + pass + + def alterProductTab(self): + table = f'{self.config.get("crawler_schema")}.{self.config.get("product_tab")}' + aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("product_tab")}' + query = f'Alter table {table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;' + aud_query = f'Alter table {aud_table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;' + + constraint_query = f''' + ALTER TABLE {table} + ADD CONSTRAINT product_source_id_ukey UNIQUE (rce_source_product_id, rce_source_id); + ''' + + try: + DB().execute_query(query + aud_query) + except: + logger.error(f'Problem while updating column in {table}') + exit(1) + + try: + DB().execute_query(constraint_query) + except: + pass + + def alterResellerStoreTab(self): + table = f'{self.config.get("crawler_schema")}.{self.config.get("reseller_store_tab")}' + aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("reseller_store_tab")}' + query = f'Alter table {table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;' + aud_query = f'Alter table {aud_table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;' + + constraint_query = f''' + ALTER TABLE {table} + ADD CONSTRAINT store_source_id_ukey UNIQUE (rce_source_store_id, rce_source_id); + ''' + + aud_constraint_query = f''' + ALTER TABLE {aud_table} + ADD CONSTRAINT aud_store_source_id_ukey UNIQUE (rce_source_store_id, rce_source_id); + ''' + + try: + DB().execute_query(query + aud_query) + except: + logger.error(f'Problem while updating column in {table}') + exit(1) + + try: + DB().execute_query(constraint_query) + except: + pass + + try: + DB().execute_query(aud_constraint_query) + except: + pass diff --git a/tokopedia_crawler_engine/tokopedia_db_writer.py b/tokopedia_crawler_engine/tokopedia_db_writer.py new file mode 100644 index 0000000..bc1f224 --- /dev/null +++ b/tokopedia_crawler_engine/tokopedia_db_writer.py @@ -0,0 +1,60 @@ +from tokopedia_config import Config +from tokopedia_logger import logger +import psycopg2 + +class DBConnector: + def __init__(self): + config = Config().get() + self.host = config.get('db_host') + self.database = config.get('database') + self.user = config.get('db_user') + self.password = config.get('db_pass') + self.port = config.get('db_port') + self.dbconn = None + + def create_connection(self): + return psycopg2.connect( + database=self.database, + user=self.user, + password=self.password, + host=self.host, + port=self.port + ) + + def __enter__(self): + self.dbconn = self.create_connection() + return self.dbconn + + def __exit__(self, exc_type, exc_val, exc_tb): + self.dbconn.close() + +class DB(object): + connection = None + + def __new__(cls, *args, **kw): + if not hasattr(cls, '_instance'): + orig = 
super(DB, cls) + cls._instance = orig.__new__(cls, *args, **kw) + return cls._instance + + def get_connection(self): + if not self.connection: + self.connection = DBConnector().create_connection() + return self.connection + + def execute_query(self, query): + connection = self.get_connection() + connection.autocommit = True + try: + cursor = connection.cursor() + except psycopg2.ProgrammingError: + connection = self.get_connection() + cursor = connection.cursor() + cursor.execute(query) + return cursor + + def fetchone(self, query): + return self.execute_query(query).fetchone() + + def fetchall(self, query): + return self.execute_query(query).fetchall() diff --git a/tokopedia_crawler_engine/tokopedia_logger.py b/tokopedia_crawler_engine/tokopedia_logger.py new file mode 100644 index 0000000..a527054 --- /dev/null +++ b/tokopedia_crawler_engine/tokopedia_logger.py @@ -0,0 +1,7 @@ +import logging + +###### Logger ###### +format = "%(asctime)s: %(message)s" +logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") + +logger = logging.getLogger("tokopedia") diff --git a/tokopedia_crawler_engine/tokopedia_product_list.py b/tokopedia_crawler_engine/tokopedia_product_list.py new file mode 100644 index 0000000..d221643 --- /dev/null +++ b/tokopedia_crawler_engine/tokopedia_product_list.py @@ -0,0 +1,108 @@ +import json +from tokopedia_db_writer import DB +from tokopedia_logger import logger +from tokopedia_config import Config +from tokopedia_api import api + +class ProductList(): + config = Config().get() + sourceId = None + + def __init__(self): + self.sourceId = self.getSourceId() + self.get() + + def getSourceId(self): + table = f'{self.config.get("crawler_schema")}.{self.config.get("source_tab")}' + query = f'select id from {table} where source_name=\'Tokopedia\'' + data = DB().fetchone(query) + return data[0] + + # fetch 1 row of category which does not have rce_source_status set + # Make api call to fetch 120 products and store in crawler_tracker + def getCategoryIdentifier(self): + table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}' + query = f""" + SELECT category_slug FROM {table} + WHERE rce_source_id = {self.sourceId} and rce_source_status is null + ORDER BY id ASC + Limit 1 + """ + try: + data = DB().fetchone(query) + return data[0] if data else None + except: + return None + + def getProductList(self, identifier, page): + url = 'https://gql.tokopedia.com/graphql/SearchProductQuery' + params = f"ob=&page={page}&start={1 + (page-1)*60}&identifier={identifier}&sc=2266&user_id=0&rows=60&source=directory&device=desktop&related=true&st=product&safe_search=false" + payload = json.dumps([{ + "operationName": "SearchProductQuery", + "variables": { + "params": params + }, + "query": "query SearchProductQuery($params: String) {\n CategoryProducts: searchProduct(params: $params) {\n data: products {\n id\n url\n }\n }\n }\n" + }]) + data = api().post(url, payload) + return data + + def processData(self, data1, data2): + crawler_name = self.config.get("crawler_name") + data = None + try: + rootData1 = data1[0]["data"]["CategoryProducts"]["data"] + rootData2 = data2[0]["data"]["CategoryProducts"]["data"] + data = rootData1 + rootData2 + except: + data = [] + + proccessedData = list(map(lambda x: (f"'{crawler_name}'", f'\'{x["url"]}\''), data)) + return proccessedData + + @staticmethod + def convertToString(n, delimeter = ','): + return delimeter.join(n) + + def updateTracker(self, rawData): + table = 
f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}' + data = f"({self.convertToString(map(self.convertToString, rawData), '),(')})" + query = f""" + INSERT INTO {table}(crawler_name, product_page_url) + VALUES {data} + ON CONFLICT (product_page_url) DO Nothing; + """ + try: + DB().execute_query(query) + return True + except: + logger.info(f'Error while inserting data in {table}') + return False + + def updateCategoryTableRow(self, identifier): + table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}' + query = f""" + Update {table} + SET rce_source_status = 1 + WHERE category_slug='{identifier}' + """ + try: + data = DB().execute_query(query) + except: + logger.error(f'Something went wrong while updating {table}') + + def get(self): + identifier = self.getCategoryIdentifier() + if not identifier: + logger.info("All the categories are processed, no task left for master") + return + + data1 = self.getProductList(identifier, 1) + data2 = self.getProductList(identifier, 2) + processedData = self.processData(data1, data2) + isDataInserted = self.updateTracker(processedData) + + if isDataInserted: + self.updateCategoryTableRow(identifier) + + logger.info(f'All the URLs are fetched for the following category identifier - {identifier}') diff --git a/tokopedia_crawler_engine/tokopedia_products.py b/tokopedia_crawler_engine/tokopedia_products.py new file mode 100644 index 0000000..af78df3 --- /dev/null +++ b/tokopedia_crawler_engine/tokopedia_products.py @@ -0,0 +1,284 @@ +import json +import re +import html +from tokopedia_db_writer import DB +from tokopedia_logger import logger +from tokopedia_config import Config +from tokopedia_api import api + +class Products(): + config = Config().get() + sourceId = None + slaveId = None + + def __init__(self): + self.sourceId = self.getSourceId() + self.slaveId = self.config.get('crawler_slave_no') + self.get() + + def getSourceId(self): + table = f'{self.config.get("crawler_schema")}.{self.config.get("source_tab")}' + query = f'select id from {table} where source_name=\'Tokopedia\'' + data = DB().fetchone(query) + return data[0] + + def fetchProductRowId(self): + table = f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}' + crawlerName = self.config.get("crawler_name") + query = f''' + SELECT id FROM {table} + WHERE crawler_name = '{crawlerName}' + AND flag is null + ORDER BY id ASC + LIMIT 1 + ''' + try: + data = DB().fetchone(query) + return data[0] if data else None + except: + return None + + def lockProductRow(self, rowId): + table = f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}' + query = f''' + UPDATE {table} + set flag={self.slaveId} + WHERE id={rowId} + ''' + try: + DB().execute_query(query) + except: + logger.error(f'Some error while locking product row for slave({self.slaveId})') + exit(1) + + def unlockProductRow(self, rowId): + table = f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}' + query = f''' + UPDATE {table} + set flag=null + WHERE id={rowId} + ''' + try: + DB().execute_query(query) + except: + logger.error(f'Some error while unlocking product row for slave({self.slaveId})') + exit(1) + + def removeProductRow(self, rowId): + table = f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}' + query = f''' + Delete from {table} + WHERE id={rowId} + ''' + try: + DB().execute_query(query) + except: + logger.error(f'Some error while deleting product row for slave({self.slaveId})') + exit(1) + + def getProductUrl(self, 
rowId): + table = f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}' + query = f''' + SELECT product_page_url FROM {table} + WHERE id = {rowId} + ''' + try: + data = DB().fetchone(query) + return data[0] if data else None + except: + return None + + def getProductUrlInfo(self, productUrl): + try: + x = re.search("https?:\/\/.*?\/(.*)\/(.*)[\?,$]", productUrl) + return { + 'shopDomain': x.groups()[0], + 'productKey': x.groups()[1], + 'shopUrl': f'https://www.tokopedia.com/{x.groups()[0]}' + } + except: + return { + 'shopDomain': '', + 'productKey': '', + 'shopUrl': 'https://www.tokopedia.com/' + } + + def getProductData(self, productUrl): + shopInfo = self.getProductUrlInfo(productUrl) + url = 'https://gql.tokopedia.com/graphql/PDPGetLayoutQuery' + payload = json.dumps([{ + "operationName": "PDPGetLayoutQuery", + "variables": { + "shopDomain": shopInfo['shopDomain'], + "productKey": shopInfo['productKey'], + "layoutID": "", + "apiVersion": 1, + "userLocation": { + "cityID": "176", + "addressID": "0", + "districtID": "2274", + "postalCode": "", + "latlon": "" + }, + "extParam": "" + }, + "query": "fragment ProductVariant on pdpDataProductVariant {\n errorCode\n parentID\n defaultChild\n sizeChart\n totalStockFmt\n variants {\n productVariantID\n variantID\n name\n identifier\n option {\n picture {\n urlOriginal: url\n urlThumbnail: url100\n __typename\n }\n productVariantOptionID\n variantUnitValueID\n value\n hex\n stock\n __typename\n }\n __typename\n }\n children {\n productID\n price\n priceFmt\n optionID\n optionName\n productName\n productURL\n picture {\n urlOriginal: url\n urlThumbnail: url100\n __typename\n }\n stock {\n stock\n isBuyable\n stockWordingHTML\n minimumOrder\n maximumOrder\n __typename\n }\n isCOD\n isWishlist\n campaignInfo {\n campaignID\n campaignType\n campaignTypeName\n campaignIdentifier\n background\n discountPercentage\n originalPrice\n discountPrice\n stock\n stockSoldPercentage\n startDate\n endDate\n endDateUnix\n appLinks\n isAppsOnly\n isActive\n hideGimmick\n isCheckImei\n minOrder\n __typename\n }\n thematicCampaign {\n additionalInfo\n background\n campaignName\n icon\n __typename\n }\n __typename\n }\n __typename\n}\n\nfragment ProductMedia on pdpDataProductMedia {\n media {\n type\n urlOriginal: URLOriginal\n urlThumbnail: URLThumbnail\n urlMaxRes: URLMaxRes\n videoUrl: videoURLAndroid\n prefix\n suffix\n description\n variantOptionID\n __typename\n }\n videos {\n source\n url\n __typename\n }\n __typename\n}\n\nfragment ProductHighlight on pdpDataProductContent {\n name\n price {\n value\n currency\n __typename\n }\n campaign {\n campaignID\n campaignType\n campaignTypeName\n campaignIdentifier\n background\n percentageAmount\n originalPrice\n discountedPrice\n originalStock\n stock\n stockSoldPercentage\n threshold\n startDate\n endDate\n endDateUnix\n appLinks\n isAppsOnly\n isActive\n hideGimmick\n __typename\n }\n thematicCampaign {\n additionalInfo\n background\n campaignName\n icon\n __typename\n }\n stock {\n useStock\n value\n stockWording\n __typename\n }\n variant {\n isVariant\n parentID\n __typename\n }\n wholesale {\n minQty\n price {\n value\n currency\n __typename\n }\n __typename\n }\n isCashback {\n percentage\n __typename\n }\n isTradeIn\n isOS\n isPowerMerchant\n isWishlist\n isCOD\n isFreeOngkir {\n isActive\n __typename\n }\n preorder {\n duration\n timeUnit\n isActive\n preorderInDays\n __typename\n }\n __typename\n}\n\nfragment ProductCustomInfo on pdpDataCustomInfo {\n icon\n title\n isApplink\n applink\n 
separator\n description\n __typename\n}\n\nfragment ProductInfo on pdpDataProductInfo {\n row\n content {\n title\n subtitle\n applink\n __typename\n }\n __typename\n}\n\nfragment ProductDetail on pdpDataProductDetail {\n content {\n title\n subtitle\n applink\n showAtFront\n isAnnotation\n __typename\n }\n __typename\n}\n\nfragment ProductDataInfo on pdpDataInfo {\n icon\n title\n isApplink\n applink\n content {\n icon\n text\n __typename\n }\n __typename\n}\n\nfragment ProductSocial on pdpDataSocialProof {\n row\n content {\n icon\n title\n subtitle\n applink\n type\n rating\n __typename\n }\n __typename\n}\n\nquery PDPGetLayoutQuery($shopDomain: String, $productKey: String, $layoutID: String, $apiVersion: Float, $userLocation: pdpUserLocation, $extParam: String) {\n pdpGetLayout(shopDomain: $shopDomain, productKey: $productKey, layoutID: $layoutID, apiVersion: $apiVersion, userLocation: $userLocation, extParam: $extParam) {\n requestID\n name\n pdpSession\n basicInfo {\n alias\n createdAt\n isQA\n id: productID\n shopID\n shopName\n minOrder\n maxOrder\n weight\n weightUnit\n condition\n status\n url\n needPrescription\n catalogID\n isLeasing\n isBlacklisted\n menu {\n id\n name\n url\n __typename\n }\n category {\n id\n name\n title\n breadcrumbURL\n isAdult\n isKyc\n minAge\n detail {\n id\n name\n breadcrumbURL\n isAdult\n __typename\n }\n __typename\n }\n txStats {\n transactionSuccess\n transactionReject\n countSold\n paymentVerified\n itemSoldFmt\n __typename\n }\n stats {\n countView\n countReview\n countTalk\n rating\n __typename\n }\n __typename\n }\n components {\n name\n type\n position\n data {\n ...ProductMedia\n ...ProductHighlight\n ...ProductInfo\n ...ProductDetail\n ...ProductSocial\n ...ProductDataInfo\n ...ProductCustomInfo\n ...ProductVariant\n __typename\n }\n __typename\n }\n __typename\n }\n}\n" + }]) + headers = { + "accept": "*/*", + "accept-language": "en-GB,en-US;q=0.9,en;q=0.8", + "cache-control": "no-cache", + "content-type": "application/json", + "pragma": "no-cache", + "sec-ch-ua": "\"Chromium\";v=\"112\", \"Google Chrome\";v=\"112\", \"Not:A-Brand\";v=\"99\"", + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "\"macOS\"", + "sec-fetch-dest": "empty", + "sec-fetch-mode": "cors", + "sec-fetch-site": "same-site", + "x-device": "desktop", + "x-source": "tokopedia-lite", + "x-tkpd-akamai": "pdpGetLayout", + "x-tkpd-lite-service": "zeus", + "x-version": "b9e88e5", + "cookie": "_gcl_au=1.1.348240926.1680364922; DID=530d72c6b246c3610123a5d94a19c6eab8c39c4a74b341887d97a75ef6c99bed79a52c4447535a1516b0b772f91cbb61; DID_JS=NTMwZDcyYzZiMjQ2YzM2MTAxMjNhNWQ5NGExOWM2ZWFiOGMzOWM0YTc0YjM0MTg4N2Q5N2E3NWVmNmM5OWJlZDc5YTUyYzQ0NDc1MzVhMTUxNmIwYjc3MmY5MWNiYjYx47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=; _UUID_NONLOGIN_=cb1f299a0551a2f26ff83afdaff5be20; _UUID_CAS_=5b1bdb20-7c89-43c1-a7b7-fd32568eac25; _CASE_=20793f123f796169696c6f77793a123f79616b7779373937796179113a303a292f3a7b0b2e283a2f79777938123f79616a6c6d77793734353c796179797779373a2f7961797977792b18347961797977792c123f79616a69696a6b686c6e777928123f79616a6a6e686b6e6c687779280f222b3e79617969337977792c3328796179002007792c3a293e33342e283e04323f0779616a69696a6b686c6e770779283e292d32383e042f222b3e07796107796933077977077904042f222b3e353a363e07796107790c3a293e33342e283e28077926772007792c3a293e33342e283e04323f0779616b770779283e292d32383e042f222b3e07796107796a6e36077977077904042f222b3e353a363e07796107790c3a293e33342e283e2807792606797779370e2b3f796179696b6968766b6f766b6a0f6968616b69616b6b706b6c616b6b7926; 
_fbp=fb.1.1680364924824.814673339; hfv_banner=true; _gid=GA1.2.1341633608.1681281758; _SID_Tokopedia_=W6f-l31DSFpXyunL9L-iVN6X8iXN825d6yCMD5mv6bhmJG2ysPwb1wE3HGLbuBWz7chHZHcmaBP_9btm4B2GTNdX-8B4YEwSfUfb630Hxx0dQPe5VBZY1-B9xRKNVSsN; bm_sz=BF390F0FA93A37D0D88ABA97459B29A9~YAAQnIwsMQ7MflyHAQAAao2BdRNgMozO5tpNuvhnUs5/mIBTb9QkbuyryfMBUEjaeyTN3k28k3MA47gfrXqJjgVkulvZvM/q3QZClZMXecBHu1dmD7OdQukZlxJqtzs9xaC3pFrcWxuRTldFL/1t6zj41K1xZRVB2Bgnae66VdNRkLJzbTQjRp793sIdoXUxeOqiycgaw+ubiUUZjp7/FA2OyLLSwFzM46CtIY/h7T8E+QIiJnGuYVKaB/Zzrpsi5DuNQVfCHFYZ3MACQKoYPEZwT+AbPy4t390A9msDqp+fti5r8Og=~3420229~3748421; ak_bmsc=023A7422C465CF2BE11A4528B4C1FE06~000000000000000000000000000000~YAAQnIwsMUTMflyHAQAA0pGBdRN953SFCO24Hf4AAB+uuV7P/LxJXfSF58YAffY7YzEWnDlGJZK45PmxnfcYr+ZucDZMb6GMgGqhQBo0tk8yAeWZBr0YYX1zitWS6iQRevL2h8GYkzKZ2Muuf/iZDmdsFMOlTfxEFTfJdSMgIzGxBfBJ6hlpGrLEwrSbNvaY0Bcm6tgbhJ7q9XTUOIyDBYbSRX61UUSLonQSMkH22R2MqUYNRMzGRziKU4p7n59L4YwX3TZOxZdWWFHDky4surxJQd+cX/+iXW0ViK6Ve39DJyokv5R2a7VOMOrzAhImNoqvMMdJzutyDpy7WCRQtg6/KNr0rDbSYQzvN5tzCdDCiPjZ+dYq59XEmbNVrZOJ0KmNMI2pT2C68sU3rMUEC51mT6d1Vqx3b+sWEwBSwMtyKDbdSYzaJxlr28E/be33pNZ0XFhq7/MXyKwpk4cPV0JemQuZwo83hyWU23sBuQjqkMDCAKwcpRyJzZ8=; _abck=A84A907111D5FF0F4588FFC869A1AD60~0~YAAQnIwsMSzOflyHAQAA7cSBdQmDT4vvtcIofh4BQLvlCnTMUWXuCLFJeG+dUQ5WhnTImgGgsXIfCZoZMTDKfXArGddL/RDvUCghBZRZrCS0fN/S6mYBTFS4ASoSjHCtPvk1C2GhSJ8dsocUzuxaCgjdFioEh5D9f37CHAWPjsRHIDRyDSsSQ71nWa1Wh9bQ33zeUauCRCtOOvSiH00ZFUcFlzmio918m/TJlERQhWEKtAqv5pt/5wdCnsZzLdSuhEcYwq9GAExINeWRj03Psg55WP+P8wcZxPoSzppWVZmi2tbS80ZdIAvLKXLsesUevAkQgp5VnJh0wtEMlmSWaILbI5gY7yPVegLUg13YYiJRULya5S6Pj6reHGJPkhd7eA/YfNhKR3EvvTNjh5fSSNwR5IYMtpDPlYGP~-1~-1~-1; bm_sv=25A151988F0B55251B24510C44906401~YAAQnIwsMduNf1yHAQAAZfSZdRN/f9u9QGJPRxoWKskWovCw8nVoxlStGOy/CfDmyIly/EjLyr7JADK2NozffxZ8FGMz6ig5QeHWwWePEId6ZKXjdS9dkIhQCv01Kmr5Nu9G1F6ZOao3JpOWzhuAOIi3caIZIMgU7w0nOZ9LsYVssjNCK+6gQgpK3EBZ62NXQ3Hw6q3vU7GD0xYdHIbj6WUdt/vlBLnVhc8H/f78uVfSXkxMtjDKOL2a6miLMGO/p5RD~1; _dc_gtm_UA-126956641-6=1; _dc_gtm_UA-9801603-1=1; _ga_70947XW48P=GS1.1.1681303639.13.1.1681305238.60.0.0; _ga=GA1.1.533113241.1680364919", + "Referer": "https://www.tokopedia.com/townshell/cloud-korean-headband-bando-tebal-korea-empuk-bandana-lebar-dusty-pink?src=topads", + "Referrer-Policy": "no-referrer-when-downgrade" + } + data = api().postProxy(url, payload, headers) + return data + + def updateProductAudTable(self, productId): + sourceId = self.sourceId + table = f'{self.config.get("crawler_schema")}.{self.config.get("product_tab")}' + aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("product_tab")}' + query = f''' + SELECT id, rce_source_id, rce_source_product_id, product_page_url, rce_category_id, rce_store_id, rce_source_product_name, product_description, product_sold_total, product_sold, product_price_min, product_price_min_before_discount, product_price_max, product_price_max_before_discount, ratings, createdat, updatedat + FROM {table} + WHERE rce_source_product_id={productId} + AND rce_source_id={sourceId} + ''' + + try: + data = DB().fetchone(query) + except: + return + + update_query = f''' + INSERT INTO {aud_table}( + id, rce_source_id, rce_source_product_id, product_page_url, rce_category_id, rce_store_id, rce_source_product_name, product_description, product_sold_total, product_sold, product_price_min, product_price_min_before_discount, product_price_max, product_price_max_before_discount, ratings, createdat, updatedat) + Values ({data[0]}, {data[1]}, {data[2]}, '{data[3]}', {data[4]}, {data[5]}, '{data[6]}', '{data[7]}', {data[8]}, 
{data[9]}, {data[10]}, {data[11]}, {data[12]}, {data[13]}, {data[14]}, '{data[15].strftime("%Y-%m-%d %H:%M:%S.%f")}', '{data[16].strftime("%Y-%m-%d %H:%M:%S.%f")}') + ''' + + try: + DB().execute_query(update_query) + except: + pass + + def updateStoreAudTable(self, storeId): + sourceId = self.sourceId + table = f'{self.config.get("crawler_schema")}.{self.config.get("reseller_store_tab")}' + aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("reseller_store_tab")}' + query = f''' + SELECT id, rce_source_store_id, store_page_url, createdat, updatedat + FROM {table} + WHERE rce_source_store_id={storeId} + AND rce_source_id={sourceId} + ''' + + try: + data = DB().fetchone(query) + except: + return + + update_query = f''' + INSERT INTO {aud_table}( + id, rce_source_id, rce_source_store_id, store_page_url, createdat, updatedat) + Values ({data[0]}, {sourceId}, {data[1]}, '{data[2]}', '{data[3].strftime("%Y-%m-%d %H:%M:%S.%f")}', '{data[4].strftime("%Y-%m-%d %H:%M:%S.%f")}') + ON CONFLICT (rce_source_id, rce_source_store_id) DO UPDATE SET updatedat = now(); + ''' + + try: + DB().execute_query(update_query) + except: + pass + + def processProductData(self, data, productUrl, rowId): + try: + rootData = data[0]['data']['pdpGetLayout'] + rootComponents = rootData['components'] + productContents = list(filter(lambda x: x['name'] == 'product_content', rootComponents)) + productDetails = list(filter(lambda x: x['name'] == 'product_detail', rootComponents)) + rce_source_product_id = rootData['basicInfo']['id'] + product_page_url = rootData['basicInfo']['url'] + rce_category_id = rootData['basicInfo']['category']['id'] + rce_store_id = rootData['basicInfo']['shopID'] + productName = productContents[0]['data'][0]['name'].replace("'", "") + productDescription = list(filter(lambda x: x['title'] == 'Deskripsi', productDetails[0]['data'][0]['content']))[0]['subtitle'].replace("'", "") + productSoldTotal = rootData['basicInfo']['txStats']['countSold'] + productPrice = productContents[0]['data'][0]['campaign']['discountedPrice'] + productPriceBeforeDiscount = productContents[0]['data'][0]['campaign']['originalPrice'] + ratings = rootData['basicInfo']['stats']['rating'] + rce_source_id = self.sourceId + + # print(productDescription) + + # store + shopInfo = self.getProductUrlInfo(productUrl) + rce_source_store_id = rootData['basicInfo']['shopID'] + store_page_url = shopInfo['shopUrl'] + + table = f'{self.config.get("crawler_schema")}.{self.config.get("product_tab")}' + query = f''' + INSERT INTO {table}( + rce_source_id, rce_source_product_id, product_page_url, rce_category_id, rce_store_id, rce_source_product_name, product_description, product_sold_total, product_sold, product_price_min, product_price_min_before_discount, product_price_max, product_price_max_before_discount, ratings) + VALUES ({rce_source_id}, {rce_source_product_id}, '{product_page_url}', {rce_category_id}, {rce_store_id}, '{productName}', '{productDescription}', '{productSoldTotal}', '{productSoldTotal}', '{productPrice}', '{productPriceBeforeDiscount}', '{productPrice}', '{productPriceBeforeDiscount}', {ratings}) + ON CONFLICT (rce_source_id, rce_source_product_id) + DO UPDATE SET updatedat = now(), product_page_url = EXCLUDED.product_page_url, rce_category_id=EXCLUDED.rce_category_id, rce_store_id=EXCLUDED.rce_store_id, rce_source_product_name=EXCLUDED.rce_source_product_name, product_description=EXCLUDED.product_description, product_sold_total=EXCLUDED.product_sold_total, + product_sold=EXCLUDED.product_sold, 
product_price_min=EXCLUDED.product_price_min, product_price_min_before_discount=EXCLUDED.product_price_min_before_discount, product_price_max=EXCLUDED.product_price_max, product_price_max_before_discount=EXCLUDED.product_price_max_before_discount, ratings=EXCLUDED.ratings;
+            '''
+
+            table2 = f'{self.config.get("crawler_schema")}.{self.config.get("reseller_store_tab")}'
+            query2 = f'''
+            INSERT INTO {table2}(
+            rce_source_id, rce_source_store_id, store_page_url)
+            VALUES ({rce_source_id}, {rce_source_store_id}, '{store_page_url}')
+            ON CONFLICT (rce_source_id, rce_source_store_id) DO UPDATE SET updatedat = now();
+            '''
+
+            DB().execute_query(query + query2)
+            self.removeProductRow(rowId)
+
+            self.updateProductAudTable(rce_source_product_id)
+            self.updateStoreAudTable(rce_source_store_id)
+            return True
+        except Exception as e:
+            print(str(e))
+            return False
+
+    def get(self):
+        rowId = self.fetchProductRowId()
+        if not rowId:
+            logger.info(f'No tasks pending for slave({self.slaveId})')
+            return
+        self.lockProductRow(rowId)
+        productUrl = self.getProductUrl(rowId)
+        apiAttempts = 3
+        data = []
+        while apiAttempts and not data:
+            logger.info(f'Slave({self.slaveId}) attempting api call - {productUrl}')
+            apiAttempts = apiAttempts-1
+            data = self.getProductData(productUrl)
+        if data:
+            attempt = 3
+            processProductResponse = False
+            while attempt and processProductResponse == False:
+                attempt = attempt - 1
+                processProductResponse = self.processProductData(data, productUrl, rowId)
+            if processProductResponse == False:
+                # Product was still not processed after three attempts, let's unlock
+                self.unlockProductRow(rowId)
+        else:
+            logger.info(f'Data could not be processed for - {productUrl}')
+            self.unlockProductRow(rowId)
diff --git a/tokopedia_crawler_engine/tokopedia_sub_categories.py b/tokopedia_crawler_engine/tokopedia_sub_categories.py
new file mode 100644
index 0000000..c03acfd
--- /dev/null
+++ b/tokopedia_crawler_engine/tokopedia_sub_categories.py
@@ -0,0 +1,142 @@
+import json
+from datetime import datetime
+from tokopedia_logger import logger
+from tokopedia_config import Config
+from tokopedia_api import api
+from tokopedia_db_writer import DB
+
+class TokopediaSubCategories:
+    config = Config().get()
+    sourceCategoryId = int(config.get("source_category"))
+    sourceId = None
+
+    def __init__(self):
+        self.sourceId = self.getSourceId()
+        self.populate()
+
+    def getSourceId(self):
+        table = f'{self.config.get("crawler_schema")}.{self.config.get("source_tab")}'
+        query = f'select id from {table} where source_name=\'Tokopedia\''
+        data = DB().fetchone(query)
+        return data[0]
+
+    def getSourceCategoryUpdatedTime(self):
+        table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
+        where = f'rce_source_category_id={self.sourceCategoryId} and rce_source_id={self.sourceId}'
+        query = f'select updatedat from {table} where {where}'
+        data = DB().fetchone(query)
+        return data[0] if data else None
+
+    def fetchCategories(self):
+        url = 'https://gql.tokopedia.com/graphql/categoryAllList'
+        payload = json.dumps([{
+            "operationName": "categoryAllList",
+            "variables": {
+                "categoryID": self.sourceCategoryId
+            },
+            "query": "query categoryAllList($categoryID: Int, $type: String) {\n CategoryAllList: categoryAllList(categoryID: $categoryID, type: $type) {\n categories {\n identifier\n url\n name\n id\n child {\n id\n identifier\n name\n url\n child {\n name\n identifier\n url\n id\n }\n }\n }\n }\n }\n"
+        }])
+        data = api().post(url, payload)
+        return data
+
+    def processData(self, rawData):
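+        # processData flattens the three-level category tree returned by
+        # categoryAllList into (rce_source_id, parent_category_id,
+        # rce_source_category_id, category_page_url, category_name,
+        # category_slug) tuples for the multi-row INSERT built in upsertData.
+        # With hypothetical ids and slug, a root row comes out as:
+        #   ('1', '0', 61, "'https://www.tokopedia.com/p/kecantikan'", "'Kecantikan'", "'kecantikan'")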
sourceId = self.sourceId + data = rawData[0]['data']['CategoryAllList']['categories'][0] + values = [(str(sourceId), str(0), data['id'], f"'{data['url']}'", f"'{data['name']}'", f"'{data['identifier']}'")] + for fc in data['child']: + values.insert(len(values), (str(sourceId), data['id'], fc['id'], f"'{fc['url']}'", f"'{fc['name']}'", f"'{fc['identifier']}'")) + for sc in fc['child']: + values.insert(len(values), (str(sourceId), fc['id'], sc['id'], f"'{sc['url']}'", f"'{sc['name']}'", f"'{sc['identifier']}'")) + return values + + @staticmethod + def convertToString(n, delimeter = ','): + return delimeter.join(n) + + def upsertData(self): + table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}' + categories = self.fetchCategories() + rawData = self.processData(categories) + data = f"({self.convertToString(map(self.convertToString, rawData), '),(')})" + query = f''' + INSERT INTO {table} (rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug) + VALUES {data} + ON CONFLICT (category_slug) DO UPDATE SET updatedat = now(); + ''' + + try: + DB().execute_query(query) + except: + logger.error('Issue while inserting categories') + exit(1) + + def deleteTokoCategories(self): + table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}' + query = f'Delete from {table} where rce_source_id={self.sourceId};' + try: + DB().execute_query(query) + except: + logger.error(f'Tokopedia categories were not deleted from {table}') + + def fetchCategoriesFromDB(self): + table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}' + query = f'Select id, rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug, createdat, updatedat from {table} where rce_source_id={self.sourceId};' + # query = f'Select (id, rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug, updatedat) from {table} where rce_source_id={self.sourceId};' + try: + return DB().fetchall(query) + except: + logger.error(f'Issue while fetching data from {table}') + exit(1) + + def processAudData(self, data): + processedData = [] + for x in data: + t = list(x) + t[0] = str(t[0]) + t[1] = str(t[1]) + t[2] = str(t[2]) + t[3] = str(t[3]) + t[4] = f"'{t[4]}'" + t[5] = f"'{t[5]}'" + t[6] = f"'{t[6]}'" + t[7] = f'\'{t[7].strftime("%Y-%m-%d %H:%M:%S.%f")}\'' + t[8] = f'\'{t[8].strftime("%Y-%m-%d %H:%M:%S.%f")}\'' + processedData.insert(len(processedData), tuple(t)) + return processedData + + def updateAudTable(self): + dbData = self.fetchCategoriesFromDB() + rawData = self.processAudData(dbData) + table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("category_tab")}' + data = f"({self.convertToString(map(self.convertToString, rawData), '),(')})" + query = f''' + Insert into {table} + (id, rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug, createdat, updatedat) + values {data} + ON CONFLICT (category_slug) DO UPDATE SET updatedat = now(), id=EXCLUDED.id; + ''' + + try: + return DB().execute_query(query) + except Exception as e: + logger.error(f'Issue while updating {table} {str(e)}') + exit(1) + + def populate(self): + sourceCategoryUpdatedTime = self.getSourceCategoryUpdatedTime() + if sourceCategoryUpdatedTime: + diffDays = (datetime.now() - sourceCategoryUpdatedTime).days + # Let's keep a frequency of 1 day to fetch/update categories + if diffDays < 1: + 
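+            # Worked example (hypothetical timestamps): if updatedat was written
+            # 20 hours ago, (datetime.now() - sourceCategoryUpdatedTime).days == 0,
+            # so this branch skips the refresh; once the row is a full day old,
+            # the delete-and-reinsert below runs instead.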
logger.info('Categories were populated recently, so skipping this step') + return + + # delete data from main table + logger.info('Deleting categories from main table') + self.deleteTokoCategories() + # insert fresh data + logger.info('Inserting categories in main table') + self.upsertData() + # update audit table, if required + logger.info('Inserting/Updating categories in audit table') + self.updateAudTable() diff --git a/tokopedia_crawler_engine/zyte-proxy-ca.crt b/tokopedia_crawler_engine/zyte-proxy-ca.crt new file mode 100644 index 0000000..a873127 --- /dev/null +++ b/tokopedia_crawler_engine/zyte-proxy-ca.crt @@ -0,0 +1,25 @@ +-----BEGIN CERTIFICATE----- +MIIERzCCAy+gAwIBAgIJAN/VCi6U4Y5SMA0GCSqGSIb3DQEBCwUAMIG5MQswCQYD +VQQGEwJJRTEQMA4GA1UECAwHTXVuc3RlcjENMAsGA1UEBwwEQ29yazEUMBIGA1UE +CgwLU2NyYXBpbmdIdWIxNTAzBgNVBAsMLExlYWRpbmcgVGVjaG5vbG9neSBhbmQg +UHJvZmVzc2lvbmFsIFNlcnZpY2VzMRQwEgYDVQQDDAtDcmF3bGVyYSBDQTEmMCQG +CSqGSIb3DQEJARYXc3VwcG9ydEBzY3JhcGluZ2h1Yi5jb20wHhcNMTUwNTE5MTQ1 +NjA3WhcNMjUwNTE2MTQ1NjA3WjCBuTELMAkGA1UEBhMCSUUxEDAOBgNVBAgMB011 +bnN0ZXIxDTALBgNVBAcMBENvcmsxFDASBgNVBAoMC1NjcmFwaW5nSHViMTUwMwYD +VQQLDCxMZWFkaW5nIFRlY2hub2xvZ3kgYW5kIFByb2Zlc3Npb25hbCBTZXJ2aWNl +czEUMBIGA1UEAwwLQ3Jhd2xlcmEgQ0ExJjAkBgkqhkiG9w0BCQEWF3N1cHBvcnRA +c2NyYXBpbmdodWIuY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA +3I3nDH62M7FHT6HG5ZNS9cBeXmMZaKaxYdr+7ioSiVXzruDkH3uX6CQZLkvR2KpG +icHOnd0FM4S4rHYQoWc82b/UGgwjQdi47ED8fqCPusEcgo/7eY3y2Y/JivEWKk6f +z+gBlvEHjKj2EyzZ7FaExTEMQTTe28EroXTNySUctY9jprtKrs8jjGXd2sR6AHF1 +M6O+5CT/5kXhuDO9/Q9Tfym7wxBsU/k+6hhNH+RkYlNEvkv0d8vdku/ZKTCBuL9D +NTqgXFvAmOj0MNEjf5kFrF95g+k5+PxPU04TPUtOwU30GYbCjE+ecYsoTODg6+ju +TQoNk3RFt0A0wZS3ly1rnQIDAQABo1AwTjAdBgNVHQ4EFgQUn6fXHOpDIsaswTMr +K2DwcOHLtZ0wHwYDVR0jBBgwFoAUn6fXHOpDIsaswTMrK2DwcOHLtZ0wDAYDVR0T +BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAQEAOLtBuyHixFblY2BieG3ZCs8D74Xc +Z1usYCUNuVxOzKhuLt/cv49r39SVienqvS2UTr3kmKdyaaRJnYQ06b5FmAP72vdI +4wUAU2F7bFErAVnH1rihB+YMRE/5/6VPLfwuK8yf3rkzdrKcV2DlRQwsnwroSIR8 +iON6JK2HOI0/LsKxPXUk9cHrli7e99yazS5+jBhRFGx8AVfoJg/6uLe6IKuw5xEZ +xAzDdjEIB/tf1cE0SQ+5sdmepO1cIjQYVSL7U+br+y9A1J9N+FYkBKVevM/W25tb +iGWBe46djkdm/6eyQ7gtuxhby5lwtRl5sIm9/ID/vWWDMf8O4GPPnW/Xug== +-----END CERTIFICATE-----
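A minimal sketch for sanity-checking the Zyte Smart Proxy setup that postProxy relies on, assuming it is run from tokopedia_crawler_engine with conf.json and the bundled zyte-proxy-ca.crt in place (the target URL is the crawler_target_url from conf.json):

import requests
from pathlib import Path
from tokopedia_config import Config

proxy_url = Config().get().get('proxy_url')
response = requests.get(
    "https://www.tokopedia.com/",
    proxies={"http": proxy_url, "https": proxy_url},
    # verify against the Zyte CA certificate shipped with the crawler
    verify=f"{Path.cwd()}/zyte-proxy-ca.crt",
)
# a 200 here means requests can reach the target through the proxy
print(response.status_code)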