From 45e6965679751403f0ec8fee466b828f52564b63 Mon Sep 17 00:00:00 2001 From: "shariar@raenabeauty.com" Date: Thu, 14 Mar 2024 09:16:59 +0400 Subject: [PATCH] added Hasaki crawler --- .idea/dataSources.xml | 12 + .idea/sqldialects.xml | 6 + hasaki_crawler_engine/Util.py | 24 + hasaki_crawler_engine/changes.sql | 78 ++ hasaki_crawler_engine/conf.json | 26 + hasaki_crawler_engine/hasaki_categories.py | 143 ++++ .../hasaki_category_products.py | 160 ++++ hasaki_crawler_engine/hasaki_crawler.py | 47 ++ hasaki_crawler_engine/hasaki_db_writer.py | 754 ++++++++++++++++++ hasaki_crawler_engine/hasaki_product_info.py | 454 +++++++++++ hasaki_crawler_engine/test.py | 63 ++ hasaki_crawler_engine/test2.py | 25 + 12 files changed, 1792 insertions(+) create mode 100644 .idea/dataSources.xml create mode 100644 .idea/sqldialects.xml create mode 100644 hasaki_crawler_engine/Util.py create mode 100644 hasaki_crawler_engine/changes.sql create mode 100755 hasaki_crawler_engine/conf.json create mode 100644 hasaki_crawler_engine/hasaki_categories.py create mode 100644 hasaki_crawler_engine/hasaki_category_products.py create mode 100644 hasaki_crawler_engine/hasaki_crawler.py create mode 100755 hasaki_crawler_engine/hasaki_db_writer.py create mode 100644 hasaki_crawler_engine/hasaki_product_info.py create mode 100644 hasaki_crawler_engine/test.py create mode 100644 hasaki_crawler_engine/test2.py diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml new file mode 100644 index 0000000..424cd96 --- /dev/null +++ b/.idea/dataSources.xml @@ -0,0 +1,12 @@ + + + + + redshift + true + com.amazon.redshift.jdbc.Driver + jdbc:redshift://redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com:5439/analytics + $ProjectFileDir$ + + + \ No newline at end of file diff --git a/.idea/sqldialects.xml b/.idea/sqldialects.xml new file mode 100644 index 0000000..972ddc3 --- /dev/null +++ b/.idea/sqldialects.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/hasaki_crawler_engine/Util.py b/hasaki_crawler_engine/Util.py new file mode 100644 index 0000000..73b80d6 --- /dev/null +++ b/hasaki_crawler_engine/Util.py @@ -0,0 +1,24 @@ +from deep_translator import GoogleTranslator + +# def translate_text_to_english(text): +# if text: +# translated = GoogleTranslator(source='auto', target='en').translate(text) +# return translated +# return text +# + +def translate_text_to_english(text): + if text: + chunk_size = 4800 + text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)] + + translated_chunks = [] + for chunk in text_chunks: + translated_chunk = GoogleTranslator(source='auto', target='en').translate(chunk) + translated_chunks.append(translated_chunk) + + translated_text = ' '.join(translated_chunks) + + return translated_text + + return text \ No newline at end of file diff --git a/hasaki_crawler_engine/changes.sql b/hasaki_crawler_engine/changes.sql new file mode 100644 index 0000000..65cfaec --- /dev/null +++ b/hasaki_crawler_engine/changes.sql @@ -0,0 +1,78 @@ +ALTER TABLE test_spider_management.rce_category ADD category_parent_name varchar(24000) NULL; +ALTER TABLE test_spider_management.aud_rce_category ADD category_parent_name varchar(24000) NULL; + +CREATE TABLE IF NOT EXISTS test_spider_management.crawler_tracker_hasaki +( + crawler_name VARCHAR(24000) ENCODE lzo + ,product_section VARCHAR(24000) ENCODE lzo + ,product_name VARCHAR(24000) ENCODE lzo + ,product_url VARCHAR(24000) ENCODE lzo + ,product_image VARCHAR(24000) ENCODE lzo + ,product_sold INTEGER NOT NULL 
ENCODE az64 + ,product_brand VARCHAR(24000) ENCODE lzo + ,gift VARCHAR(24000) ENCODE lzo + ,product_rank INTEGER NOT NULL ENCODE az64 + ,categoryid INTEGER NOT NULL ENCODE az64 + ,flag SMALLINT DEFAULT 0 ENCODE az64 +) +DISTSTYLE AUTO +; + +ALTER TABLE test_spider_management.rce_brand ADD brand_following int8 NULL; +ALTER TABLE test_spider_management.rce_brand ADD brand_rating int8 NULL; +ALTER TABLE test_spider_management.aud_rce_brand ADD brand_following int8 NULL; +ALTER TABLE test_spider_management.aud_rce_brand ADD brand_rating int8 NULL; + +ALTER TABLE test_spider_management.rce_product_variant ADD product_variant_sku varchar(1000) NULL; +ALTER TABLE test_spider_management.aud_rce_product_variant ADD product_variant_sku varchar(1000) NULL; + +CREATE TABLE IF NOT EXISTS test_spider_management.rce_seo +( + id INTEGER ENCODE az64 + ,rce_product_id INTEGER ENCODE az64 + ,rce_source_id INTEGER ENCODE az64 + ,seo_title VARCHAR(2000) ENCODE lzo + ,seo_description VARCHAR(10000) ENCODE lzo + ,seo_url VARCHAR(2000) ENCODE lzo + ,seo_url_hash VARCHAR(2000) ENCODE lzo + ,seo_image VARCHAR(2000) ENCODE lzo + ,seo_price_amount BIGINT ENCODE az64 + ,seo_price_currency VARCHAR(2000) ENCODE lzo + ,seo_product_band VARCHAR(2000) ENCODE lzo + ,seo_product_availability VARCHAR(2000) ENCODE lzo + ,seo_product_category VARCHAR(2000) ENCODE lzo + ,seo_product_condition VARCHAR(2000) ENCODE lzo + ,seo_product_retailer_item_id BIGINT ENCODE az64 + ,seo_product_robots VARCHAR(2000) ENCODE lzo + ,createdat TIMESTAMP WITHOUT TIME ZONE DEFAULT getdate() ENCODE az64 + ,updatedat TIMESTAMP WITHOUT TIME ZONE DEFAULT getdate() ENCODE az64 +) +DISTSTYLE AUTO +; + +CREATE TABLE IF NOT EXISTS test_spider_management.aud_rce_seo +( + auditid INTEGER ENCODE az64 + ,id INTEGER ENCODE az64 + ,rce_product_id INTEGER ENCODE az64 + ,rce_source_id INTEGER ENCODE az64 + ,seo_title VARCHAR(2000) ENCODE lzo + ,seo_description VARCHAR(10000) ENCODE lzo + ,seo_url VARCHAR(2000) ENCODE lzo + ,seo_url_hash VARCHAR(2000) ENCODE lzo + ,seo_image VARCHAR(2000) ENCODE lzo + ,seo_price_amount BIGINT ENCODE az64 + ,seo_price_currency VARCHAR(2000) ENCODE lzo + ,seo_product_band VARCHAR(2000) ENCODE lzo + ,seo_product_availability VARCHAR(2000) ENCODE lzo + ,seo_product_category VARCHAR(2000) ENCODE lzo + ,seo_product_condition VARCHAR(2000) ENCODE lzo + ,seo_product_retailer_item_id BIGINT ENCODE az64 + ,seo_product_robots VARCHAR(2000) ENCODE lzo + ,createdat TIMESTAMP WITHOUT TIME ZONE ENCODE az64 + ,updatedat TIMESTAMP WITHOUT TIME ZONE ENCODE az64 + ,audit_createdat TIMESTAMP WITHOUT TIME ZONE DEFAULT getdate() ENCODE az64 +) +DISTSTYLE AUTO +; + diff --git a/hasaki_crawler_engine/conf.json b/hasaki_crawler_engine/conf.json new file mode 100755 index 0000000..841f33f --- /dev/null +++ b/hasaki_crawler_engine/conf.json @@ -0,0 +1,26 @@ +{ + "crawler_name": "raena_crawler_engine_hasaki", + "crawler_schema": "test_spider_management", + "category_tab": "rce_category", + "tracker_tab": "crawler_tracker_hasaki", + "product_tab": "rce_product", + "variant_tab": "rce_product_variant", + "brand_tab": "rce_brand", + "reseller_tab": "rce_reseller", + "reseller_store_tab": "rce_reseller_store", + "review_tab": "rce_ratings_reviews", + "review_productmodels_tab": "rce_ratings_reviews_productmodels", + "review_producttags_tab": "rce_ratings_reviews_producttags", + "review_tags": "rce_tags", + "source_tab": "rce_source", + "seo_tab": "rce_seo", + "product_per_category": "1000", + "source_category": "11043145", + "db_user": "dbadmin", + "db_pass": 
"5qCif6eyY3Kmg4z", + "database": "analytics", + "db_host": "redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com", + "db_port": "5439", + "crawler_main": "1", + "crawler_slave_no": "" +} \ No newline at end of file diff --git a/hasaki_crawler_engine/hasaki_categories.py b/hasaki_crawler_engine/hasaki_categories.py new file mode 100644 index 0000000..d312bc9 --- /dev/null +++ b/hasaki_crawler_engine/hasaki_categories.py @@ -0,0 +1,143 @@ +import hashlib +import logging +import time +import psycopg2 +import pandas as pd + +from playwright.sync_api import sync_playwright +from hasaki_db_writer import hasaki_db_writer +from Util import translate_text_to_english + + + +class HasakiCategories: + def __init__(self, config): + logging.info("Initializing HasakiSubCategories") + self.master_category = [] + self.config = config + self.crawler_name = self.config.get("crawler_name") + self.product_limit = int(self.config.get("product_per_category")) + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), + password=self.config.get('db_pass'), host=self.config.get('db_host'), + port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + self.cur.execute(f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""") + try: + self.rce_source_id = self.cur.fetchone()[0] + except: + logging.info("Source tab is empty. Please check. Exiting.....") + exit(1) + + self.db_writer = hasaki_db_writer(config) + + def __del__(self): + print("Closing connection.....") + self.conn.close() + + def start_processing(self): + + self.crawl_and_track("HEALTH - BEAUTY", "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html") + + df = pd.DataFrame(self.master_category, columns=['Index', 'Parent', 'Name', 'Link']) + + df = df.sort_values('Index') + + df = df.drop_duplicates(subset='Name', keep='first') + + self.process_category(df) + + + def process_category(self, category): + + for index, row in category.iterrows(): + data = {} + + data['parent_category_id'] = 0 + data['rce_source_id'] = self.rce_source_id + data['rce_source_category_id'] = 0 + data['rce_source_status'] = 1 + data['category_name'] = str(row["Name"]).replace("'","") + data['category_page_url'] = row["Link"] + data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest() + data['category_parent_name'] = str(row["Parent"]).replace("'","") + + self.db_writer.rce_category(data) + + + def crawl_and_track(self, parent, url_to_visit): + self.master_category.append((0,"0", parent, url_to_visit)) + + print(self.master_category) + + cats = self.crawl_categories(parent, url_to_visit) + time.sleep(10) + if cats: + for cat in cats: + self.master_category.append((1,)+(cat)) + print((1,)+(cat)) + + sub_cats1 = self.crawl_categories(cat[1], cat[2]) + time.sleep(10) + if sub_cats1: + for sub_cat1 in sub_cats1: + self.master_category.append((2,) + (sub_cat1)) + print((2,) + (sub_cat1)) + + sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2]) + time.sleep(10) + if sub_cats2: + for sub_cat2 in sub_cats2: + self.master_category.append((3,) + (sub_cat2)) + print((3,) + (sub_cat2)) + + sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2]) + time.sleep(10) + if sub_cats3: + for sub_cat3 in sub_cats3: + self.master_category.append((4,) + (sub_cat3)) + print((4,) + (sub_cat3)) + + def crawl_categories(self, parent, url_to_visit): + + with sync_playwright() as p: + browser = 
p.chromium.launch(headless=True) + # context = browser.new_context( + # viewport={"width": 375, "height": 667, "isMobile": True} + # ) + page = browser.new_page() + + # Load the webpage + page.goto(url_to_visit) + # page.goto('https://hasaki.vn/danh-muc/my-pham-high-end-c1907.html') + + page.wait_for_load_state('load') + + container_element = page.query_selector('.block_colaps_sticky.width_common.collaps_sticky') + + if container_element: + item_elements = container_element.query_selector_all('.item_fillter') + content_elements = container_element.query_selector_all('.content_fillter') + + urls = [] + + for item_element in item_elements: + text = item_element.query_selector('a').inner_text() + text = translate_text_to_english(text) + href = item_element.query_selector('a').get_attribute('href') + urls.append((parent, text, href)) + + for content_element in content_elements: + text = content_element.query_selector('a').inner_text() + text = translate_text_to_english(text) + href = content_element.query_selector('a').get_attribute('href') + urls.append((parent, text, href)) + + # removing previously collected data + master_urls = [item[3] for item in self.master_category] + filtered_data = [(parent, name, url) for parent, name, url in urls if url not in master_urls] + + return filtered_data + + browser.close() + diff --git a/hasaki_crawler_engine/hasaki_category_products.py b/hasaki_crawler_engine/hasaki_category_products.py new file mode 100644 index 0000000..26e6e94 --- /dev/null +++ b/hasaki_crawler_engine/hasaki_category_products.py @@ -0,0 +1,160 @@ +import hashlib +import logging +import random +import time +import psycopg2 +from playwright.sync_api import sync_playwright +from deep_translator import GoogleTranslator +from hasaki_db_writer import hasaki_db_writer +import pandas as pd +from Util import translate_text_to_english +class HasakiCategoryProducts: + def __init__(self, config): + logging.info("Initializing HasakiCategoryProducts........") + self.config = config + self.crawler_name = self.config.get("crawler_name") + self.product_limit = int(self.config.get("product_per_category")) + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), + password=self.config.get('db_pass'), host=self.config.get('db_host'), + port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + self.cur.execute( + f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""") + try: + self.rce_source_id = self.cur.fetchone()[0] + except: + logging.info("Source tab is empty. Please check. 
Exiting.....") + exit(1) + + self.db_writer = hasaki_db_writer(config) + + def __del__(self): + print("Closing connection.....") + self.conn.close() + + def start_processing(self): + + logging.info("Starting crawler to collect category products.........") + + sql = f""" + select * from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} + where rce_source_id = {self.rce_source_id} order by id + """ + + self.cur.execute(sql) + + categories = self.cur.fetchall() + + for category in categories: + logging.info("================= Fetching Products for : {} ====================".format(str(category[7]))) + pages = self.get_pages(category[5]) + + time.sleep(random.randint(10,20)) + + self.get_product_list(urls = pages, categoryId = category[0]) + + + + + def get_pages(self, url): + + pages = [] + pages.append(url) + + try: + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + + page = browser.new_page() + page.goto(url) + + page.wait_for_load_state('load') + + pagination = page.query_selector(".pagination.ul-pagination").query_selector_all(".change-page") + + for pagination in pagination: + if str(pagination.get_attribute('data-page')).strip() != "1": + new_url = str(pagination.get_attribute('href')).strip() + new_url = "https://hasaki.vn" + new_url + pages.append(new_url) + browser.close() + except Exception as e: + pass + finally: + return pages + + def get_product_list(self,urls, categoryId): + + try: + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + + page = browser.new_page() + + page_count = 1 + + logging.info("Found {} pages. Looping through URLS to get all products.".format(str(len(urls)))) + for url in urls: + logging.info("+++++++++++++ Loading page : {} +++++++++++++++++".format(str(page_count))) + + page.goto(url) + + page.wait_for_load_state('load') + + container_element = page.query_selector('.ProductGrid__grid.width_common') + if container_element: + item_elements = container_element.query_selector_all('.ProductGridItem__itemOuter') + item_count = 1 + for item_element in item_elements: + try: + product_section = "Base Product Page " + str(page_count) + product_name = translate_text_to_english(str(item_element.query_selector('.width_common.name_sp.space_bottom_5').text_content()).strip().replace("'","")) + product_url = str(item_element.query_selector('.v3_thumb_common_sp.relative').get_attribute('href')).strip() + product_brand = translate_text_to_english(str(item_element.query_selector('.width_common.txt_color_1.space_bottom_3').text_content()).strip().replace("'","")) + product_rank = item_count + + product_image = "" + try: + product_image = str(item_element.query_selector('.v3_thumb_common_sp.relative').query_selector('.img_thumb.lazy.loaded').get_attribute('src')).strip().replace("'","") + except: + pass + + gift = "" + try: + gift = translate_text_to_english(str(item_element.query_selector('.block_gift_list_item').text_content()).strip().replace("'","")) + except: + pass + + product_sold = 0 + try: + product_sold = int(str(item_element.query_selector('.item_count_by').text_content()).strip().replace('.','')) + except: + pass + + + sql = f""" + insert into {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')}(crawler_name,product_section, product_name, product_url, product_image, product_sold, product_brand, gift, product_rank, categoryid) + 
                                values('{self.crawler_name}','{product_section}','{product_name.replace("'","")}','{product_url}','{product_image}',{product_sold},'{product_brand}','{gift}',{product_rank},{categoryId})
+                                """
+
+                                logging.info(sql)
+
+                                self.cur.execute(sql)
+
+                            except Exception as e:
+                                print(e)
+
+                            item_count += 1
+
+                    time.sleep(random.randint(10,30))
+
+                    page_count += 1
+
+                browser.close()
+        except Exception as e:
+            print(e)
+
+
+
+
diff --git a/hasaki_crawler_engine/hasaki_crawler.py b/hasaki_crawler_engine/hasaki_crawler.py
new file mode 100644
index 0000000..4f7bf56
--- /dev/null
+++ b/hasaki_crawler_engine/hasaki_crawler.py
@@ -0,0 +1,47 @@
+import logging
+import json
+import time
+
+from hasaki_categories import HasakiCategories
+from hasaki_category_products import HasakiCategoryProducts
+from hasaki_product_info import HasakiProductInfo
+
+##### Logger ######
+format = "%(asctime)s: %(message)s"
+logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
+
+config = {}
+
+
+def main():
+    # hasaki_categories = HasakiCategories(config)
+    # hasaki_categories.start_processing()
+    #
+    # time.sleep(60)
+    #
+    # hasaki_category_products = HasakiCategoryProducts(config)
+    # hasaki_category_products.start_processing()
+    #
+    # time.sleep(60)
+
+    hasaki_products = HasakiProductInfo(config)
+    hasaki_products.start_processing()
+
+
+
+if __name__ == "__main__":
+    logging.info("Starting Hasaki Crawler.......")
+    try:
+        logging.info("Loading config file.......")
+        with open("conf.json", "r") as jsonfile:
+            config = json.load(jsonfile)
+        logging.info("Config file loaded.......")
+        print(config)
+
+        main()
+
+    except Exception as e:
+        logging.info("Error: {}".format(e))
+        #logging.info("Cannot load config file. Please check. Exiting......")
+        #send_mail()
+        exit(1)
diff --git a/hasaki_crawler_engine/hasaki_db_writer.py b/hasaki_crawler_engine/hasaki_db_writer.py
new file mode 100755
index 0000000..bf3b999
--- /dev/null
+++ b/hasaki_crawler_engine/hasaki_db_writer.py
@@ -0,0 +1,754 @@
+import logging
+import psycopg2
+
+###### Logger ######
+format = "%(asctime)s: %(message)s"
+logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
+
+class hasaki_db_writer:
+    def __init__(self, config):
+        self.config = config
+        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
+        self.conn.autocommit = True
+        self.cur = self.conn.cursor()
+
+    def __del__(self):
+        logging.info("Closing connection.....")
+        self.conn.close()
+
+    def get_id(self, schema, table):
+        sql = f"""
+            select max(id) from {schema}.{table}
+        """
+        self.cur.execute(sql)
+        res = self.cur.fetchone()
+
+        if res[0] != None:
+            id = res[0] + 1
+        else:
+            id = 1
+
+        return id
+
+    def get_aud_id(self, schema, table):
+        sql = f"""
+            select max(auditid) from {schema}.{table}
+        """
+        self.cur.execute(sql)
+        res = self.cur.fetchone()
+
+        if res[0] != None:
+            id = res[0] + 1
+        else:
+            id = 1
+
+        return id
+
+    def rce_category(self, data):
+        sql = f"""
+            select * from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
+        """
+        self.cur.execute(sql)
+        res = self.cur.fetchone()
+
+        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('category_tab'))
+        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('category_tab'))
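+        # Upsert pattern used by every writer method in this class: look the row up by its natural
+        # key (here category_name + rce_source_id); if it is missing, insert it and copy the new row
+        # into the aud_* audit table; if it exists unchanged, only bump updatedat; otherwise rewrite
+        # the row and append a fresh audit record. Note that get_id()/get_aud_id() derive keys from
+        # max(id) + 1, which assumes only one crawler instance writes at a time.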
+ + + if not res: + sql = f""" + insert into {self.config.get('crawler_schema')}.{self.config.get('category_tab')}(id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,category_parent_name) + values({id_main},{data['parent_category_id']},{data['rce_source_id']},{data['rce_source_category_id']},{data['rce_source_status']},'{data['category_page_url']}','{data['category_page_url_hash']}','{data['category_name']}','{data['category_parent_name']}') + """ + logging.info(sql) + + self.cur.execute(sql) + + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('category_tab')}(auditid,id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name) + select {id_aud},id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} + where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']} + """ + + logging.info(sql) + + self.cur.execute(sql) + + else: + if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \ + str(data['category_page_url'])==str(res[5]) and str(data['category_parent_name'])==str(res[12]): + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set updatedat=GETDATE() + where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']} + """ + logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + logging.info(sql) + self.cur.execute(sql) + else: + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set parent_category_id={data['parent_category_id']}, rce_source_category_id = {data['rce_source_category_id']}, + category_name = '{data['category_name']}', category_page_url = '{data['category_page_url']}', category_page_url_hash = '{data['category_page_url_hash']}', category_parent_name = '{data['category_parent_name']}', + updatedat=GETDATE() where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']} + """ + logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(auditid,id,parent_category_id,rce_source_id," \ + "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name) " \ + "select "+str(id_aud)+", id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \ + "category_name,createdat,updatedat,category_parent_name from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \ + "where category_name = '"+ str(res[7])+"'" + logging.info(sql) + + self.cur.execute(sql) + + def rce_product(self, data): + sql = f""" + select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} + where rce_source_product_id = 
{data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']} + """ + self.cur.execute(sql) + res = self.cur.fetchone() + + id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('product_tab')) + id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('product_tab')) + + if not res: + sql = f""" + insert into {self.config.get('crawler_schema')}.{self.config.get('product_tab')}(id,rce_source_product_id,rce_source_product_status,product_page_url, + product_page_url_hash,rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold, + product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings,product_section, + rce_source_id,countryoforigin,rank,ships_from) values({id_main},{data['rce_source_product_id']},{data['rce_source_product_status']},'{data['product_page_url']}', + '{data['product_page_url_hash']}',{data['rce_category_id']},{data['rce_brand_id']},{data['rce_store_id']},'{data['rce_source_product_name']}','{data['product_images']}','{data['product_description']}',{data['product_sold_total']},{data['product_sold']}, + {data['product_price_min']},{data['product_price_min_before_discount']},{data['product_price_max']},{data['product_price_max_before_discount']},{data['ratings']},'{data['product_section']}', + {data['rce_source_id']},'{data['countryoforigin']}',{data['rank']},'{data['ships_from']}') + """ + logging.info(sql) + + self.cur.execute(sql) + + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash, + rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount, + product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank) + select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash, + rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount, + product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} + where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']} + """ + + logging.info(sql) + self.cur.execute(sql) + else: + + if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \ + str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \ + str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \ + str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \ + str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \ + str(data['product_price_max'])==str(res[15]) and 
str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) \ + and str(data['ships_from'])==str(res[18]) and str(data['rce_source_id'])==str(res[21]) \ + and str(data['product_section'])==str(res[22]) and str(data['countryoforigin'])==str(res[23])\ + and str(data['rank'])==str(res[24]): + + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set updatedat=GETDATE() + where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']} + """ + logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + logging.info(sql) + self.cur.execute(sql) + else: + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set rce_source_product_id = {data['rce_source_product_id']}, rce_source_product_status={data['rce_source_product_status']}, product_page_url='{data['product_page_url']}', + product_page_url_hash='{data['product_page_url_hash']}', rce_category_id={data['rce_category_id']}, rce_brand_id={data['rce_brand_id']}, rce_store_id={data['rce_store_id']}, + rce_source_product_name='{data['rce_source_product_name']}', product_images='{data['product_images']}', product_description='{data['product_description']}', product_sold_total={data['product_sold_total']}, + product_sold={data['product_sold']}, product_price_min='{data['product_price_min']}',product_price_min_before_discount='{data['product_price_min_before_discount']}', + product_price_max='{data['product_price_max']}', product_price_max_before_discount='{data['product_price_max_before_discount']}', ratings={data['ratings']}, + ships_from='{data['ships_from']}',product_section='{data['product_section']}',countryoforigin='{data['countryoforigin']}',rank={data['rank']}, updatedat=GETDATE() + where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']} + """ + + logging.info(sql) + self.cur.execute(sql) + + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash, + rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount, + product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank) + select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash, + rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount, + product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} + where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']} + """ + logging.info(sql) + self.cur.execute(sql) + + + def rce_product_variant(self, data): + sql = f""" + select * from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} 
where + rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']} + """ + self.cur.execute(sql) + res = self.cur.fetchone() + + id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('variant_tab')) + id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('variant_tab')) + + if not res: + + sql = f""" + insert into {self.config.get('crawler_schema')}.{self.config.get('variant_tab')}(id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku) + values({id_main},{data['rce_source_variant_id']},{data['rce_product_id']},'{data['product_variant_name']}',{data['product_variant_price']},{data['product_variant_price_before_discount']},{data['product_variant_stock']},'{data['product_variant_sku']}') + """ + + logging.info(sql) + + self.cur.execute(sql) + + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat) + select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat + from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']} + """ + + logging.info(sql) + self.cur.execute(sql) + + else: + if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \ + str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6])\ + and str(data['product_variant_sku'])==str(res[9]): + + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set updatedat=GETDATE() + where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']} + """ + logging.info(sql) + self.cur.execute(sql) + + + sql = f""" + update {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')} a set updatedat=b.updatedat + from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} b where a.id=b.id and b.id = {res[0]} + """ + + logging.info(sql) + self.cur.execute(sql) + else: + + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set rce_source_variant_id={data['rce_source_variant_id']}, + rce_product_id={data['rce_product_id']},product_variant_name='{data['product_variant_name']}',product_variant_price={data['product_variant_price']}, + product_variant_price_before_discount={data['product_variant_price_before_discount']},product_variant_stock={data['product_variant_stock']}, + product_variant_sku={data['product_variant_sku']}, updatedat=GETDATE() + where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']} + """ + + logging.info(sql) + self.cur.execute(sql) + + sql = f""" + insert into 
{self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat) + select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat + from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']} + """ + + logging.info(sql) + + self.cur.execute(sql) + + + def rce_brand(self, data): + sql = f""" + select * from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where rce_source_brand_id = {data['rce_source_brand_id']} + and rce_source_id = {data['rce_source_id']} + """ + self.cur.execute(sql) + res = self.cur.fetchone() + + id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('brand_tab')) + id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('brand_tab')) + + if not res: + sql = f""" + insert into {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}(id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating) + values({id_main},{data['rce_source_id']},{data['rce_source_brand_id']},{data['rce_source_brand_status']},'{data['brand_page_url']}','{data['brand_page_url_hash']}','{data['brand_name']}',{data['brand_following']},{data['brand_rating']}) + """ + + logging.info(sql) + + self.cur.execute(sql) + + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat) + select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} + where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']} + """ + logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \ + str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]) and str(data['rce_source_brand_id'])==str(res[2]): + + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']} + """ + logging.info(sql) + self.cur.execute(sql) + + sql = f""" + update {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')} a set updatedat=b.updatedat + from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} b where a.id=b.id and b.id = {res[0]} and + b.rce_source_id = {data['rce_source_id']} + """ + logging.info(sql) + self.cur.execute(sql) + else: + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set rce_source_id={data['rce_source_id']}, rce_source_brand_id={data['rce_source_brand_id']}, + rce_source_brand_status={data['rce_source_brand_status']}, 
brand_page_url='{data['brand_page_url']}', brand_page_url_hash='{data['brand_page_url_hash']}', + brand_name='{data['brand_name']}', brand_following={data['brand_following']}, brand_rating={data['brand_rating']}, updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']} + and rce_source_id = {data['rce_source_id']} + """ + + logging.info(sql) + self.cur.execute(sql) + + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat) + select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} + where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']} + """ + + logging.info(sql) + + self.cur.execute(sql) + + def rce_reseller(self, data): + data['reseller_name'] = data['reseller_name'] + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_tab')) + id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_tab')) + + + if not res: + + sql = f""" + insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')}(id,rce_source_id,rce_source_reseller_status,reseller_name) + values({id_main},'{data['rce_source_id']}','{data['rce_source_reseller_status']}','{data['reseller_name']}') + """ + #logging.info(sql) + + self.cur.execute(sql) + + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_tab')}(auditid,id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat) + select {id_aud}, id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')} + where reseller_name='{data['reseller_name']}' + """ + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=GETDATE() " \ + "where reseller_name = '"+ str(res[4])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \ + "rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \ + "'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=GETDATE() where reseller_name = '"+ str(res[4])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into 
"+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (auditid,id,rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select "+str(id_aud)+", id,rce_source_id,rce_source_reseller_status," \ + "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_reseller_store(self, data): + + data['store_page_url'] = data['store_page_url'].replace("'","''") + + sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'" + self.cur.execute(sql) + res = self.cur.fetchone() + + id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_store_tab')) + id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_store_tab')) + + if not res: + + sql = f""" + insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')}(id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,rce_source_id) + values({id_main},'{data['rce_source_store_status']}','{data['store_page_url']}','{data['store_page_url_hash']}',{data['rce_reseller_id']},{data['rce_source_id']}) + """ + #logging.info(sql) + + self.cur.execute(sql) + + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_store_tab')}(auditid,id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id) + select {id_aud}, id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id from {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')} + where store_page_url= '{data['store_page_url']}' + """ + #logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \ + str(data['store_page_url_hash'])==str(res[4]) and \ + str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]): + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=GETDATE() " \ + "where store_page_url = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + #logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \ + "rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \ + "'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \ + "updatedat=GETDATE(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'" + #logging.info(sql) + self.cur.execute(sql) + + sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (auditid,id,rce_source_store_status," \ + 
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select "+id_aud+", id,rce_source_store_status," \ + "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \ + ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'" + #logging.info(sql) + + self.cur.execute(sql) + + def rce_ratings_reviews(self, data): + + sql = f""" + select * from {self.config.get('crawler_schema')}.{self.config.get('review_tab')} + where rce_product_id = {data['rce_product_id']} and username = '{data['username']}' + """ + + self.cur.execute(sql) + res = self.cur.fetchone() + + data['username'] = data['username'].replace("'","") + + id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('review_tab')) + id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('review_tab')) + + if not res: + + sql = f""" + insert into {self.config.get('crawler_schema')}.{self.config.get('review_tab')}(id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating) + values({id_main},{data['rce_product_id']},'{data['username']}','{data['review']}','{data['img_url']}',{data['review_like_count']},'{data['user_tier']}',{data['shop_id']},'{data['video_url']}',{data['rating']}) + """ + + logging.info(sql) + + self.cur.execute(sql) + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) + select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')} + where rce_product_id = {data['rce_product_id']} and username = '{data['username']}' + """ + + logging.info(sql) + + self.cur.execute(sql) + + else: + + if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \ + str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \ + str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]): + + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=GETDATE() " \ + "where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'" + logging.info(sql) + self.cur.execute(sql) + + sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \ + "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + logging.info(sql) + self.cur.execute(sql) + else: + + sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \ + "username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \ + "'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \ + "shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=GETDATE() " \ + "where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'" + logging.info(sql) + 
self.cur.execute(sql) + + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) + select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')} + where rce_product_id = {data['rce_product_id']} and username = '{data['username']}' + """ + logging.info(sql) + + self.cur.execute(sql) + + def rce_seo(self, data): + sql = f""" + select * from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} + where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']} + """ + + self.cur.execute(sql) + res = self.cur.fetchone() + + id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('seo_tab')) + id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('seo_tab')) + + if not res: + + sql = f""" + insert into {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}(id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots) + values({id_main},{data['rce_product_id']},{data['rce_source_id']},'{data['seo_title']}','{data['seo_description']}','{data['seo_url']}','{data['seo_url_hash']}','{data['seo_image']}',{data['seo_price_amount']},'{data['seo_price_currency']}','{data['seo_product_band']}','{data['seo_product_availability']}','{data['seo_product_category']}', + '{data['seo_product_condition']}',{data['seo_product_retailer_item_id']},'{data['seo_product_robots']}') + """ + + logging.info(sql) + + self.cur.execute(sql) + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat) + select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} + where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']} + """ + + logging.info(sql) + + self.cur.execute(sql) + + else: + + if (str(data['rce_product_id']) == str(res[1]) and str(data['rce_source_id']) == str(res[2]) and str(data['seo_title']) == str(res[3]) and \ + str(data['seo_description']) == str(res[4]) and str(data['seo_url']) == str(res[5]) and str(data['seo_url_hash']) == str(res[6]) and \ + str(data['seo_image']) == str(res[7]) and str(data['seo_price_amount']) == str(res[8]) and str(data['seo_price_currency']) == str(res[9]) and \ + str(data['seo_product_band']) == str(res[10])) and str(data['seo_product_availability']) == str(res[11]) and str(data['seo_product_category']) == str(res[12]) and \ + str(data['seo_product_condition']) == str(res[13]) and str(data['seo_product_retailer_item_id']) == str(res[14]) and 
str(data['seo_product_robots']) == str(res[15]): + + sql = "update " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " set updatedat=GETDATE() " \ + "where rce_product_id = " + str(res[1]) + " and rce_source_id =" + str(data['rce_source_id']) + logging.info(sql) + self.cur.execute(sql) + + sql = "update " + self.config.get('crawler_schema') + ".aud_" + self.config.get('seo_tab') + " a set updatedat=b.updatedat " \ + "from " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " b where a.id=b.id and b.id = " + str(res[0]) + logging.info(sql) + self.cur.execute(sql) + else: + + sql = f""" + update {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} set rce_product_id={data['rce_product_id']}, rce_source_id={data['rce_source_id']}, seo_title='{data['seo_title']}', seo_description='{data['seo_description']}', + seo_url='{data['seo_url']}', seo_url_hash='{data['seo_url_hash']}', seo_image='{data['seo_image']}', seo_price_amount='{data['seo_price_amount']}', seo_price_currency='{data['seo_price_currency']}', seo_product_band='{data['seo_product_band']}', + seo_product_availability='{data['seo_product_availability']}', seo_product_category='{data['seo_product_category']}', seo_product_condition='{data['seo_product_condition']}', seo_product_retailer_item_id={data['seo_product_retailer_item_id']}, + seo_product_robots='{data['seo_product_robots']}' where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']} + """ + + logging.info(sql) + self.cur.execute(sql) + + sql = f""" + insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat) + select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} + where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']} + """ + logging.info(sql) + + self.cur.execute(sql) + + + + # def rce_ratings_reviews_productmodels(self,data): + # + # sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id']) + # self.cur.execute(sql) + # res = self.cur.fetchone() + # + # + # if not res: + # + # sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \ + # "values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')" + # #logging.info(sql) + # + # self.cur.execute(sql) + # + # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \ + # "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \ + # ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+"" + # #logging.info(sql) + # + # self.cur.execute(sql) + # + # else: + # + # if str(data['rce_rating_id'])==str(res[1]) and 
str(data['model_id'])==str(res[2]): + # + # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=GETDATE() " \ + # "where rce_rating_id = "+ str(res[1]) + # #logging.info(sql) + # self.cur.execute(sql) + # + # sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \ + # "from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + # #logging.info(sql) + # self.cur.execute(sql) + # else: + # + # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \ + # "updatedat=GETDATE() where rce_source_store_id = "+ str(res[1]) + # #logging.info(sql) + # self.cur.execute(sql) + # + # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \ + # "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \ + # ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+"" + # #logging.info(sql) + # + # self.cur.execute(sql) + # + # + # def rce_tags(self,data): + # + # sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'" + # self.cur.execute(sql) + # res = self.cur.fetchone() + # + # + # if not res: + # + # sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \ + # "values("+str(data['id'])+",'"+str(data['description'])+"')" + # #logging.info(sql) + # + # self.cur.execute(sql) + # + # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \ + # "createdat,updatedat) select id,description,createdat,updatedat from " \ + # ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'" + # #logging.info(sql) + # + # self.cur.execute(sql) + # + # else: + # + # if str(data['description'])==str(res[1]): + # + # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=GETDATE() " \ + # "where description = '"+ str(res[1])+"'" + # #logging.info(sql) + # self.cur.execute(sql) + # + # sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \ + # "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + # #logging.info(sql) + # self.cur.execute(sql) + # else: + # + # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \ + # "updatedat=GETDATE() where description = "+ str(res[1]) + # #logging.info(sql) + # self.cur.execute(sql) + # + # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \ + # "createdat,updatedat) select id,description,createdat,updatedat from " \ + # ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'" + # #logging.info(sql) + # + # self.cur.execute(sql) + # + # + # def rce_ratings_reviews_producttags(self,data): + # + # sql = "select * from 
"+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'" + # self.cur.execute(sql) + # res = self.cur.fetchone() + # + # + # if not res: + # + # sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \ + # "values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')" + # #logging.info(sql) + # + # self.cur.execute(sql) + # + # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \ + # "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \ + # ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'" + # #logging.info(sql) + # + # self.cur.execute(sql) + # + # else: + # + # if str(data['rce_rating_id'])==str(res[1]): + # + # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=GETDATE() " \ + # "where rce_rating_id = '"+ str(res[1])+"'" + # #logging.info(sql) + # self.cur.execute(sql) + # + # sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \ + # "from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0]) + # #logging.info(sql) + # self.cur.execute(sql) + # else: + # + # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \ + # "updatedat=GETDATE() where rce_rating_id = "+ str(res[1]) + # #logging.info(sql) + # self.cur.execute(sql) + # + # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \ + # "createdat,updatedat) select id,description,createdat,updatedat from " \ + # ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'" + # #logging.info(sql) + # + # self.cur.execute(sql) + # + # + diff --git a/hasaki_crawler_engine/hasaki_product_info.py b/hasaki_crawler_engine/hasaki_product_info.py new file mode 100644 index 0000000..7c5584d --- /dev/null +++ b/hasaki_crawler_engine/hasaki_product_info.py @@ -0,0 +1,454 @@ +import hashlib +import logging +import random +import string +import time +import re +import psycopg2 +from playwright.sync_api import sync_playwright +from deep_translator import GoogleTranslator +from hasaki_db_writer import hasaki_db_writer +import pandas as pd +from bs4 import BeautifulSoup +from Util import translate_text_to_english + +class HasakiProductInfo: + def __init__(self, config): + logging.info("Initializing HasakiProductInfo") + self.pattern = r'[' + string.punctuation + ']' + self.config = config + self.crawler_name = self.config.get("crawler_name") + self.product_limit = int(self.config.get("product_per_category")) + self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), + password=self.config.get('db_pass'), host=self.config.get('db_host'), + port=self.config.get('db_port')) + self.conn.autocommit = True + self.cur = self.conn.cursor() + self.cur.execute( + f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""") + try: + self.rce_source_id = 
self.cur.fetchone()[0]
+        except:
+            logging.info("Source tab is empty. Please check. Exiting.....")
+            exit(1)
+
+        self.db_writer = hasaki_db_writer(config)
+
+    def __del__(self):
+        print("Closing connection.....")
+        self.conn.close()
+
+    def start_processing(self):
+        logging.info("Starting to collect product info from Hasaki........")
+
+        logging.info("Fetching product list from DB......")
+
+        sql = f"""
+            select * from {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} where flag = 0
+            order by categoryid, product_section, product_rank
+            """
+
+        self.cur.execute(sql)
+        rows = self.cur.fetchall()
+        logging.info("Found {} products.......".format(str(len(rows))))
+        cnt = 1
+        for row in rows:
+            logging.info("========= Fetching product info {}/{}: {} =========".format(str(cnt),str(len(rows)),row[3]))
+
+            try:
+                self.get_product_info(row)
+            except Exception as e:
+                logging.error("Failed to process product {}: {}".format(row[3], e))
+
+            sql = f"""
+                update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1
+                where categoryid={row[9]} and product_section='{row[1]}' and product_rank={row[8]} and product_url='{row[3]}'
+                """
+            self.cur.execute(sql)
+
+            cnt += 1
+
+
+    def get_product_info(self, data):
+
+        raw_data = self.get_raw_product_data(data[3])
+
+        print(raw_data)
+
+        if raw_data:
+            self.product_info(data, raw_data)
+
+            self.rating_info(raw_data)
+
+            self.seo_info(raw_data)
+
+
+    def get_raw_product_data(self, url):
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=True)
+            context = browser.new_context(
+                user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
+            page = context.new_page()
+
+            # Start listening before navigation so the product-detail response
+            # fired during page load is not missed.
+            with page.expect_response("**/wap/v2/product/detail**") as response:
+                page.goto(url)
+
+            api_requests = response.value.json()
+
+            browser.close()
+
+            return api_requests
+
+    def product_info(self, data, raw_data):
+
+        #region rce_brand
+
+        data_brand = {}
+
+        data_brand['rce_source_id'] = self.rce_source_id
+        data_brand['rce_source_brand_status'] = 1
+        data_brand['rce_source_brand_id'] = 0
+        data_brand['brand_page_url'] = ""
+        data_brand['brand_page_url_hash'] = ""
+        data_brand['brand_name'] = ""
+        data_brand['brand_following'] = ""
+        data_brand['brand_rating'] = ""
+
+        try:
+
+            data_brand['rce_source_brand_id'] = raw_data['brand']['id']
+
+            try:
+                data_brand['brand_page_url'] = "https://hasaki.vn/" + raw_data['brand']['url'] + ".html"
+                data_brand['brand_page_url'] = str(data_brand['brand_page_url']).replace("'","")
+                data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest()
+            except:
+                pass
+
+            try:
+                data_brand['brand_name'] = translate_text_to_english(str(raw_data['brand']['name']).replace("'",""))
+            except:
+                pass
+
+            try:
+                data_brand['brand_following'] = raw_data['brand']['following']
+            except:
+                pass
+
+            try:
+                data_brand['brand_rating'] = raw_data['brand']['rating']
+            except:
+                pass
+
+            try:
+                self.db_writer.rce_brand(data_brand)
+            except Exception as e:
+                logging.info(e)
+        except:
+            pass
+
+        #endregion
+
+        #region rce_product
+
+        data_product = {}
+
+        try:
+
+            data_product['rce_source_product_id'] = raw_data['id']
+            data_product['rce_source_id'] = self.rce_source_id
+            data_product['rce_source_product_status'] = 1
+            data_product['product_page_url'] = str(raw_data['url']).replace("'","")
+            data_product['product_page_url_hash'] = hashlib.md5(data_product['product_page_url'].encode('utf-8')).hexdigest()
+            data_product['rce_category_id'] = data[9]
+            data_product['rce_store_id']
= 0 + + data_product['rce_source_product_name'] = str(raw_data['name']) + str(raw_data['alt_name']) + data_product['rce_source_product_name'] = translate_text_to_english(str(re.sub(self.pattern, '', data_product['rce_source_product_name']))) + data_product['rce_source_product_name'] = str(data_product['rce_source_product_name']).replace("'", "") + + data_product['product_images'] = data[4] + + data_product['product_description'] = "" + try: + + description_raw = raw_data['description'] + soup = BeautifulSoup(description_raw, 'html.parser') + data_product['product_description'] = translate_text_to_english(re.sub(self.pattern, '',soup.get_text()).replace("'","")) + data_product['product_description'] = str(data_product['product_description']).replace("'","") + except: + pass + + data_product['rce_brand_id'] = "" + try: + sql = f""" + select id from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where + rce_source_id = {self.rce_source_id} and rce_source_brand_id = {raw_data['brand']['id']} + """ + self.cur.execute(sql) + res = self.cur.fetchone() + data_product['rce_brand_id'] = res[0] + except: + pass + + + data_product['product_sold_total'] = 0 + + data_product['product_sold'] = 0 + try: + data_product['product_sold'] = raw_data['bought'] + except: + pass + + data_product['product_price_min'] = 0 + data_product['product_price_max'] = 0 + try: + data_product['product_price_min'] = raw_data['int_final_price'] + data_product['product_price_max'] = raw_data['int_final_price'] + except: + pass + + + data_product['product_price_min_before_discount'] = 0 + data_product['product_price_max_before_discount'] = 0 + try: + data_product['product_price_min_before_discount'] = raw_data['price'] + data_product['product_price_max_before_discount'] = raw_data['price'] + except: + pass + + data_product['ratings'] = 0.0 + try: + data_product['ratings'] = raw_data['rating']['avg_rate'] + except: + pass + + + data_product['ships_from'] = "" + data_product['product_section'] = data[1] + data_product['countryoforigin'] = "" + data_product['rank'] = data[8] + + try: + self.db_writer.rce_product(data_product) + except Exception as e: + logging.info(e) + + #region rce_product_variant + + variant_items = raw_data['attribute']['items'] + + df_variant = pd.DataFrame({}, columns=['product_variant_name', 'rce_source_variant_id', 'product_variant_price', + 'product_variant_stock', 'product_variant_sku']) + + data_variant = {} + for variant in variant_items: + for item in variant['options']: + data_variant['product_variant_name'] = item['long_label'] + for product in item['products']: + data_variant['rce_source_variant_id'] = product['id'] + data_variant['product_variant_price'] = product['price'] + data_variant['product_variant_stock'] = product['quantity'] + data_variant['product_variant_sku'] = product['sku'] + + # variants_arr.append(data_variant) + + tmp = pd.DataFrame([[data_variant['product_variant_name'], + data_variant['rce_source_variant_id'], + data_variant['product_variant_price'], + data_variant['product_variant_stock'], + data_variant['product_variant_sku']]], + columns=['product_variant_name', 'rce_source_variant_id', + 'product_variant_price', + 'product_variant_stock', 'product_variant_sku']) + df_variant = pd.concat([df_variant, tmp]) + + df_variant_merged = df_variant.groupby('product_variant_sku').agg({ + 'product_variant_name': ' '.join, + 'rce_source_variant_id': 'first', + 'product_variant_price': 'first', + 'product_variant_stock': 'first' + }).reset_index() + + 
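# --- Illustrative sketch (annotation, not part of the patch) -----------------------
# The groupby/agg above merges rows that share a SKU: the option labels are joined
# into one variant name while id, price and stock keep the first value seen.
# The sample values below are invented purely to show the shape of the result.
import pandas as pd

df_demo = pd.DataFrame([
    {"product_variant_sku": "SKU-1", "product_variant_name": "Rose",
     "rce_source_variant_id": 11, "product_variant_price": 150000, "product_variant_stock": 4},
    {"product_variant_sku": "SKU-1", "product_variant_name": "50ml",
     "rce_source_variant_id": 11, "product_variant_price": 150000, "product_variant_stock": 4},
    {"product_variant_sku": "SKU-2", "product_variant_name": "100ml",
     "rce_source_variant_id": 12, "product_variant_price": 180000, "product_variant_stock": 9},
])

merged_demo = df_demo.groupby("product_variant_sku").agg({
    "product_variant_name": " ".join,   # "Rose 50ml" for SKU-1
    "rce_source_variant_id": "first",
    "product_variant_price": "first",
    "product_variant_stock": "first",
}).reset_index()

print(merged_demo.to_string())
# ------------------------------------------------------------------------------------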
#print(df_variant_merged.to_string()) + + for index, row in df_variant_merged.iterrows(): + try: + data_variant = {} + + data_variant['rce_source_variant_id'] = row['rce_source_variant_id'] + data_variant['product_variant_name'] = translate_text_to_english(row['product_variant_name']) + data_variant['product_variant_name'] = re.sub(self.pattern, '', data_variant['product_variant_name']).replace("'","") + data_variant['product_variant_price'] = row['product_variant_price'] + data_variant['product_variant_price_before_discount'] = 0 + data_variant['product_variant_stock'] = row['product_variant_stock'] + data_variant['product_variant_sku'] = row['product_variant_sku'] + + data_variant['rce_product_id'] = "" + + sql = f""" + select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where + rce_source_product_id = {data_product['rce_source_product_id']} and rce_source_id = {data_product['rce_source_id']} + """ + self.cur.execute(sql) + data_variant['rce_product_id'] = self.cur.fetchone()[0] + + try: + self.db_writer.rce_product_variant(data_variant) + except Exception as e: + logging.info(e) + except: + pass + + + + #endregion + + except: + pass + + #endregion + + def rating_info(self, raw_data): + + try: + + reviews1 = [] + reviews2 = [] + + try: + reviews1 = raw_data['short_rating_data']['image_reviews'] + except: + pass + + try: + reviews2 = raw_data['short_rating_data']['reviews'] + except: + pass + + reviews = reviews1 + reviews2 + + + + for review in reviews: + data_review = {} + + data_review["rce_product_id"] = "" + data_review["username"] = "" + data_review["review"] = "" + data_review["img_url"] = "" + data_review["review_like_count"] = 0 + data_review["user_tier"] = "" + data_review["shop_id"] = 0 + data_review["video_url"] = "" + data_review["rating"] = "" + + sql = f""" + select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where + rce_source_product_id = {raw_data['id']} and rce_source_id = {self.rce_source_id} + """ + self.cur.execute(sql) + data_review["rce_product_id"] = self.cur.fetchone()[0] + + try: + data_review["username"] = str(review['user_fullname']).replace("'", "") + except: + pass + + try: + data_review["review"] = translate_text_to_english(review['content']).replace("'", "") + except: + pass + + try: + data_review["rating"] = review['rating']['star'] + except: + pass + + try: + self.db_writer.rce_ratings_reviews(data_review) + except Exception as e: + logging.info(e) + except Exception as e: + logging.info(e) + + + + def seo_info(self, raw_data): + + try: + data_seo = {} + + data_seo['rce_product_id'] = 0 + data_seo['rce_source_id'] = self.rce_source_id + data_seo['seo_title'] = "" + data_seo['seo_description'] = "" + data_seo['seo_url'] = "" + data_seo['seo_url_hash'] = "" + data_seo['seo_image'] = "" + data_seo['seo_price_amount'] = 0 + data_seo['seo_price_currency'] = "" + data_seo['seo_product_band'] = "" + data_seo['seo_product_availability'] = "" + data_seo['seo_product_category'] = "" + data_seo['seo_product_condition'] = "" + data_seo['seo_product_retailer_item_id'] = 0 + data_seo['seo_product_robots'] = "" + + sql = f""" + select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where + rce_source_product_id = {raw_data['id']} and rce_source_id = {self.rce_source_id} + """ + self.cur.execute(sql) + data_seo['rce_product_id'] = self.cur.fetchone()[0] + + try: data_seo['seo_title'] = translate_text_to_english(raw_data['seo']['og:title']).replace("'","") + except: pass + + 
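# --- Illustrative sketch (annotation, not part of the patch) -----------------------
# product_info(), rating_info() and seo_info() each repeat the same f-string lookup
# of the internal product id, which is why every interpolated value first has its
# single quotes stripped. A parameterized query sidesteps the quoting problem;
# `lookup_product_id` is a hypothetical helper name, not something defined in the patch.
def lookup_product_id(cur, config, rce_source_product_id, rce_source_id):
    sql = (
        f"select id from {config.get('crawler_schema')}.{config.get('product_tab')} "
        "where rce_source_product_id = %s and rce_source_id = %s"
    )
    cur.execute(sql, (rce_source_product_id, rce_source_id))  # psycopg2 binds the values safely
    row = cur.fetchone()
    return row[0] if row else None
# ------------------------------------------------------------------------------------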
try: data_seo['seo_description'] = translate_text_to_english(raw_data['seo']['og:description']).replace("'","") + except: pass + + try: data_seo['seo_url'] = str(raw_data['seo']['og:url']).replace("'","") + except: pass + + try: data_seo['seo_image'] = str(raw_data['seo']['og:image']).replace("'","") + except: pass + + try: data_seo['seo_price_amount'] = raw_data['seo']['price:amount'] + except: pass + + try: data_seo['seo_price_currency'] = str(raw_data['seo']['price:currency']).replace("'","") + except: pass + + try: data_seo['seo_product_band'] = translate_text_to_english(raw_data['seo']['product:band']).replace("'","") + except: pass + + try: data_seo['seo_product_availability'] = str(raw_data['seo']['product:availability']).replace("'","") + except: pass + + try: data_seo['seo_product_category'] = translate_text_to_english(raw_data['seo']['product:category']).replace("'","") + except: pass + + try: data_seo['seo_product_condition'] = translate_text_to_english(raw_data['seo']['product:condition']).replace("'","") + except: pass + + try: data_seo['seo_product_retailer_item_id'] = raw_data['seo']['product:retailer_item_id'] + except: pass + + try: data_seo['seo_product_robots'] = raw_data['seo']['product:robots'] + except: pass + + try: + self.db_writer.rce_seo(data_seo) + except Exception as e: + logging.info(e) + + except: + pass + + + + + + + diff --git a/hasaki_crawler_engine/test.py b/hasaki_crawler_engine/test.py new file mode 100644 index 0000000..728385b --- /dev/null +++ b/hasaki_crawler_engine/test.py @@ -0,0 +1,63 @@ +import time +from bs4 import BeautifulSoup +from playwright.sync_api import sync_playwright +import pandas as pd + +# Launch the Playwright browser in mobile mode +with sync_playwright() as p: + browser = p.chromium.launch(headless=False) + context = browser.new_context(user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1") + page = context.new_page() + + page.goto("https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi-mat-l-oreal-3-in-1-danh-cho-da-dau-da-hon-hop-400ml-19325.html") + page.wait_for_load_state('load') + #time.sleep(10) + + # Capture the underlying API request URL + #api_requests = page.evaluate('''() => window.fetch('https://hasaki.vn/wap/v2/product/detail').then(response => response.json())''') + #print(api_requests) + + with page.expect_response("**/wap/v2/product/detail**") as response: + data = response.value.json() + + variant_items = data['attribute']['items'] + + df = pd.DataFrame({}, columns=['product_variant_name','rce_source_variant_id','product_variant_price','product_variant_stock','product_variant_sku']) + + data_variant = {} + for variant in variant_items: + for item in variant['options']: + data_variant['product_variant_name'] = item['long_label'] + for product in item['products']: + data_variant['rce_source_variant_id'] = product['id'] + data_variant['rce_product_id'] = "" + data_variant['product_variant_price'] = product['price'] + data_variant['product_variant_price_before_discount'] = "" + data_variant['product_variant_stock'] = product['quantity'] + data_variant['product_variant_sku'] = product['sku'] + + #variants_arr.append(data_variant) + + tmp = pd.DataFrame([[data_variant['product_variant_name'],data_variant['rce_source_variant_id'],data_variant['product_variant_price'],data_variant['product_variant_stock'],data_variant['product_variant_sku']]], + columns=['product_variant_name', 'rce_source_variant_id', 'product_variant_price', + 
                                     'product_variant_stock', 'product_variant_sku'])
+                df = pd.concat([df, tmp])
+
+    print(data_variant)
+
+    df = df.sort_values(by=['product_variant_sku'])
+    print(df.to_string())
+
+    print("======================================")
+
+    merged_df = df.groupby('product_variant_sku').agg({
+        'product_variant_name': ' '.join,
+        'rce_source_variant_id': 'first',
+        'product_variant_price': 'first',
+        'product_variant_stock': 'first'
+    }).reset_index()
+
+    print(merged_df.to_string())
+
+    # Close the browser
+    browser.close()
diff --git a/hasaki_crawler_engine/test2.py b/hasaki_crawler_engine/test2.py
new file mode 100644
index 0000000..bdef230
--- /dev/null
+++ b/hasaki_crawler_engine/test2.py
@@ -0,0 +1,25 @@
+import asyncio
+from playwright.async_api import async_playwright
+
+async def main():
+    async with async_playwright() as p:
+        browser = await p.chromium.launch()
+        context = await browser.new_context()
+
+        page = await context.new_page()
+
+        # Enable request interception
+        await page.route('https://hasaki.vn/wap/v2/product/detail', lambda route: route.continue_())
+
+        # Navigate to the website URL
+        await page.goto('https://hasaki.vn/san-pham/nuoc-hoa-hong-khong-mui-klairs-danh-cho-da-nhay-cam-180ml-65994.html')
+
+        # Wait for the product-detail request, then read its JSON response
+        request = await page.wait_for_event('request', predicate=lambda req: 'v2/product/detail' in req.url)
+        json_response = await (await request.response()).json()
+
+        print(json_response)
+
+        await browser.close()
+
+asyncio.run(main())
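# --- Illustrative sketch (annotation, not part of the patch) -----------------------
# test.py and test2.py both navigate first and only then wait for the
# /wap/v2/product/detail call. Registering the expectation before page.goto() is the
# safer pattern, since the XHR typically fires while the page is still loading.
# fetch_detail() is a hypothetical name used only for this sketch.
import asyncio
from playwright.async_api import async_playwright

async def fetch_detail(url: str) -> dict:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        # Start listening before navigation so the response cannot be missed.
        async with page.expect_response("**/wap/v2/product/detail**") as response_info:
            await page.goto(url)
        response = await response_info.value
        data = await response.json()
        await browser.close()
        return data

if __name__ == "__main__":
    detail = asyncio.run(fetch_detail(
        "https://hasaki.vn/san-pham/nuoc-hoa-hong-khong-mui-klairs-danh-cho-da-nhay-cam-180ml-65994.html"))
    print(detail.get("name"))
# ------------------------------------------------------------------------------------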