diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml
new file mode 100644
index 0000000..424cd96
--- /dev/null
+++ b/.idea/dataSources.xml
@@ -0,0 +1,12 @@
+
+
+
+
+ redshift
+ true
+ com.amazon.redshift.jdbc.Driver
+ jdbc:redshift://redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com:5439/analytics
+ $ProjectFileDir$
+
+
+
\ No newline at end of file
diff --git a/.idea/sqldialects.xml b/.idea/sqldialects.xml
new file mode 100644
index 0000000..972ddc3
--- /dev/null
+++ b/.idea/sqldialects.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/hasaki_crawler_engine/Util.py b/hasaki_crawler_engine/Util.py
new file mode 100644
index 0000000..73b80d6
--- /dev/null
+++ b/hasaki_crawler_engine/Util.py
@@ -0,0 +1,24 @@
+from deep_translator import GoogleTranslator
+
+# def translate_text_to_english(text):
+# if text:
+# translated = GoogleTranslator(source='auto', target='en').translate(text)
+# return translated
+# return text
+#
+
def translate_text_to_english(text):
    """Translate *text* to English via Google Translate.

    Google Translate rejects very long inputs, so the text is split into
    4800-character chunks, each translated separately, and the results are
    re-joined with a single space.

    NOTE(review): chunk boundaries can fall mid-word/mid-sentence and the
    space joiner may add a spurious space there — confirm acceptable.

    :param text: source text in any language; falsy values are returned
        unchanged (so ``None`` and ``""`` pass through untranslated).
    :return: translated text, or *text* itself when it is falsy.
    """
    if text:
        chunk_size = 4800
        # Hoisted out of the loop: one translator instance serves all chunks
        # (the original rebuilt GoogleTranslator for every chunk).
        translator = GoogleTranslator(source='auto', target='en')

        text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        translated_chunks = [translator.translate(chunk) for chunk in text_chunks]

        return ' '.join(translated_chunks)

    return text
\ No newline at end of file
diff --git a/hasaki_crawler_engine/changes.sql b/hasaki_crawler_engine/changes.sql
new file mode 100644
index 0000000..65cfaec
--- /dev/null
+++ b/hasaki_crawler_engine/changes.sql
@@ -0,0 +1,78 @@
-- Schema changes for the Hasaki crawler.

-- Categories gain a denormalized parent-name column (main + audit table).
ALTER TABLE test_spider_management.rce_category ADD category_parent_name varchar(24000) NULL;
ALTER TABLE test_spider_management.aud_rce_category ADD category_parent_name varchar(24000) NULL;

-- Work queue populated by HasakiCategoryProducts; `flag` marks processed rows.
CREATE TABLE IF NOT EXISTS test_spider_management.crawler_tracker_hasaki
(
	crawler_name VARCHAR(24000)   ENCODE lzo
	,product_section VARCHAR(24000)   ENCODE lzo
	,product_name VARCHAR(24000)   ENCODE lzo
	,product_url VARCHAR(24000)   ENCODE lzo
	,product_image VARCHAR(24000)   ENCODE lzo
	,product_sold INTEGER NOT NULL  ENCODE az64
	,product_brand VARCHAR(24000)   ENCODE lzo
	,gift VARCHAR(24000)   ENCODE lzo
	,product_rank INTEGER NOT NULL  ENCODE az64
	,categoryid INTEGER NOT NULL  ENCODE az64
	,flag SMALLINT DEFAULT 0 ENCODE az64
)
DISTSTYLE AUTO
;

-- Brand metrics scraped from brand pages (main + audit table).
ALTER TABLE test_spider_management.rce_brand ADD brand_following int8 NULL;
ALTER TABLE test_spider_management.rce_brand ADD brand_rating int8 NULL;
ALTER TABLE test_spider_management.aud_rce_brand ADD brand_following int8 NULL;
ALTER TABLE test_spider_management.aud_rce_brand ADD brand_rating int8 NULL;

-- Variants gain the source SKU (main + audit table).
ALTER TABLE test_spider_management.rce_product_variant ADD product_variant_sku varchar(1000) NULL;
ALTER TABLE test_spider_management.aud_rce_product_variant ADD product_variant_sku varchar(1000) NULL;

-- SEO/OpenGraph metadata per product.
CREATE TABLE IF NOT EXISTS test_spider_management.rce_seo
(
	id INTEGER   ENCODE az64
	,rce_product_id INTEGER   ENCODE az64
	,rce_source_id INTEGER   ENCODE az64
	,seo_title VARCHAR(2000)   ENCODE lzo
	,seo_description VARCHAR(10000)   ENCODE lzo
	,seo_url VARCHAR(2000)   ENCODE lzo
	,seo_url_hash VARCHAR(2000)   ENCODE lzo
	,seo_image VARCHAR(2000)   ENCODE lzo
	,seo_price_amount BIGINT   ENCODE az64
	,seo_price_currency VARCHAR(2000)   ENCODE lzo
	,seo_product_band VARCHAR(2000)   ENCODE lzo
	,seo_product_availability VARCHAR(2000)   ENCODE lzo
	,seo_product_category VARCHAR(2000)   ENCODE lzo
	,seo_product_condition VARCHAR(2000)   ENCODE lzo
	,seo_product_retailer_item_id BIGINT   ENCODE az64
	,seo_product_robots VARCHAR(2000)   ENCODE lzo
	,createdat TIMESTAMP WITHOUT TIME ZONE  DEFAULT getdate() ENCODE az64
	,updatedat TIMESTAMP WITHOUT TIME ZONE  DEFAULT getdate() ENCODE az64
)
DISTSTYLE AUTO
;

-- Audit trail for rce_seo: one snapshot row per change, keyed by auditid.
CREATE TABLE IF NOT EXISTS test_spider_management.aud_rce_seo
(
	auditid INTEGER   ENCODE az64
	,id INTEGER   ENCODE az64
	,rce_product_id INTEGER   ENCODE az64
	,rce_source_id INTEGER   ENCODE az64
	,seo_title VARCHAR(2000)   ENCODE lzo
	,seo_description VARCHAR(10000)   ENCODE lzo
	,seo_url VARCHAR(2000)   ENCODE lzo
	,seo_url_hash VARCHAR(2000)   ENCODE lzo
	,seo_image VARCHAR(2000)   ENCODE lzo
	,seo_price_amount BIGINT   ENCODE az64
	,seo_price_currency VARCHAR(2000)   ENCODE lzo
	,seo_product_band VARCHAR(2000)   ENCODE lzo
	,seo_product_availability VARCHAR(2000)   ENCODE lzo
	,seo_product_category VARCHAR(2000)   ENCODE lzo
	,seo_product_condition VARCHAR(2000)   ENCODE lzo
	,seo_product_retailer_item_id BIGINT   ENCODE az64
	,seo_product_robots VARCHAR(2000)   ENCODE lzo
	,createdat TIMESTAMP WITHOUT TIME ZONE   ENCODE az64
	,updatedat TIMESTAMP WITHOUT TIME ZONE   ENCODE az64
	,audit_createdat TIMESTAMP WITHOUT TIME ZONE  DEFAULT getdate() ENCODE az64
)
DISTSTYLE AUTO
;
diff --git a/hasaki_crawler_engine/conf.json b/hasaki_crawler_engine/conf.json
new file mode 100755
index 0000000..841f33f
--- /dev/null
+++ b/hasaki_crawler_engine/conf.json
@@ -0,0 +1,26 @@
+{
+ "crawler_name": "raena_crawler_engine_hasaki",
+ "crawler_schema": "test_spider_management",
+ "category_tab": "rce_category",
+ "tracker_tab": "crawler_tracker_hasaki",
+ "product_tab": "rce_product",
+ "variant_tab": "rce_product_variant",
+ "brand_tab": "rce_brand",
+ "reseller_tab": "rce_reseller",
+ "reseller_store_tab": "rce_reseller_store",
+ "review_tab": "rce_ratings_reviews",
+ "review_productmodels_tab": "rce_ratings_reviews_productmodels",
+ "review_producttags_tab": "rce_ratings_reviews_producttags",
+ "review_tags": "rce_tags",
+ "source_tab": "rce_source",
+ "seo_tab": "rce_seo",
+ "product_per_category": "1000",
+ "source_category": "11043145",
+ "db_user": "dbadmin",
+ "db_pass": "5qCif6eyY3Kmg4z",
+ "database": "analytics",
+ "db_host": "redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com",
+ "db_port": "5439",
+ "crawler_main": "1",
+ "crawler_slave_no": ""
+}
\ No newline at end of file
diff --git a/hasaki_crawler_engine/hasaki_categories.py b/hasaki_crawler_engine/hasaki_categories.py
new file mode 100644
index 0000000..d312bc9
--- /dev/null
+++ b/hasaki_crawler_engine/hasaki_categories.py
@@ -0,0 +1,143 @@
+import hashlib
+import logging
+import time
+import psycopg2
+import pandas as pd
+
+from playwright.sync_api import sync_playwright
+from hasaki_db_writer import hasaki_db_writer
+from Util import translate_text_to_english
+
+
+
class HasakiCategories:
    """Crawls the Hasaki category tree (root plus up to four sub-levels)
    and persists every category through hasaki_db_writer.rce_category.
    """

    # Maximum nesting depth below the root to crawl.
    MAX_DEPTH = 4

    def __init__(self, config):
        logging.info("Initializing HasakiSubCategories")
        # Collected tuples: (level, parent_name, name, url).
        self.master_category = []
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                     password=self.config.get('db_pass'), host=self.config.get('db_host'),
                                     port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.cur.execute(f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
        try:
            self.rce_source_id = self.cur.fetchone()[0]
        except (TypeError, IndexError):
            # fetchone() returns None when no 'Hasaki' row exists; was a bare except.
            logging.info("Source tab is empty. Please check. Exiting.....")
            exit(1)

        self.db_writer = hasaki_db_writer(config)

    def __del__(self):
        print("Closing connection.....")
        self.conn.close()

    def start_processing(self):
        """Entry point: crawl the tree, de-duplicate by name, persist."""
        self.crawl_and_track("HEALTH - BEAUTY", "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html")

        df = pd.DataFrame(self.master_category, columns=['Index', 'Parent', 'Name', 'Link'])

        # Keep the shallowest occurrence of each category name.
        df = df.sort_values('Index')
        df = df.drop_duplicates(subset='Name', keep='first')

        self.process_category(df)

    def process_category(self, category):
        """Write one rce_category row per crawled category.

        :param category: DataFrame with Index/Parent/Name/Link columns.
        """
        for index, row in category.iterrows():
            data = {}

            data['parent_category_id'] = 0
            data['rce_source_id'] = self.rce_source_id
            data['rce_source_category_id'] = 0
            data['rce_source_status'] = 1
            # Single quotes are stripped because the writer splices values
            # into SQL text.
            data['category_name'] = str(row["Name"]).replace("'", "")
            data['category_page_url'] = row["Link"]
            data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
            data['category_parent_name'] = str(row["Parent"]).replace("'", "")

            self.db_writer.rce_category(data)

    def crawl_and_track(self, parent, url_to_visit):
        """Seed the tree with the root node, then crawl levels 1..MAX_DEPTH.

        Replaces the original four copy-pasted nested loops with a bounded
        depth-first recursion producing the identical append order.
        """
        self.master_category.append((0, "0", parent, url_to_visit))
        print(self.master_category)
        self._crawl_level(parent, url_to_visit, 1)

    def _crawl_level(self, parent, url_to_visit, depth):
        # Depth-first: append each category, then immediately descend into it.
        if depth > self.MAX_DEPTH:
            return
        cats = self.crawl_categories(parent, url_to_visit)
        time.sleep(10)  # throttle between page loads
        if cats:
            for cat in cats:
                self.master_category.append((depth,) + cat)
                print((depth,) + cat)
                self._crawl_level(cat[1], cat[2], depth + 1)

    def crawl_categories(self, parent, url_to_visit):
        """Scrape sub-category links from one category page.

        :return: list of (parent, translated_name, url) tuples for links not
            already present in master_category; empty list when the sidebar
            container is missing.
        """
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()

            page.goto(url_to_visit)
            page.wait_for_load_state('load')

            container_element = page.query_selector('.block_colaps_sticky.width_common.collaps_sticky')

            filtered_data = []
            if container_element:
                item_elements = container_element.query_selector_all('.item_fillter')
                content_elements = container_element.query_selector_all('.content_fillter')

                urls = []
                # Both selector groups are handled identically (the original
                # duplicated this loop).
                for element in item_elements + content_elements:
                    anchor = element.query_selector('a')
                    text = translate_text_to_english(anchor.inner_text())
                    href = anchor.get_attribute('href')
                    urls.append((parent, text, href))

                # Drop links already collected in earlier passes.
                master_urls = [item[3] for item in self.master_category]
                filtered_data = [(par, name, url) for par, name, url in urls if url not in master_urls]

            # Close before returning; the original had browser.close() after
            # the return statement, making it unreachable.
            browser.close()
            return filtered_data
+
diff --git a/hasaki_crawler_engine/hasaki_category_products.py b/hasaki_crawler_engine/hasaki_category_products.py
new file mode 100644
index 0000000..26e6e94
--- /dev/null
+++ b/hasaki_crawler_engine/hasaki_category_products.py
@@ -0,0 +1,160 @@
+import hashlib
+import logging
+import random
+import time
+import psycopg2
+from playwright.sync_api import sync_playwright
+from deep_translator import GoogleTranslator
+from hasaki_db_writer import hasaki_db_writer
+import pandas as pd
+from Util import translate_text_to_english
class HasakiCategoryProducts:
    """Walks every stored Hasaki category, pages through its product grid
    and inserts one crawler_tracker row per product found.
    """

    def __init__(self, config):
        logging.info("Initializing HasakiCategoryProducts........")
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                     password=self.config.get('db_pass'), host=self.config.get('db_host'),
                                     port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.cur.execute(
            f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
        try:
            self.rce_source_id = self.cur.fetchone()[0]
        except (TypeError, IndexError):
            # fetchone() returns None when no 'Hasaki' row exists; was a bare except.
            logging.info("Source tab is empty. Please check. Exiting.....")
            exit(1)

        self.db_writer = hasaki_db_writer(config)

    def __del__(self):
        print("Closing connection.....")
        self.conn.close()

    def start_processing(self):
        """Entry point: for each category of this source, resolve its
        pagination and collect every product page."""
        logging.info("Starting crawler to collect category products.........")

        sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} 
            where rce_source_id = {self.rce_source_id} order by id
        """

        self.cur.execute(sql)

        categories = self.cur.fetchall()

        for category in categories:
            # category[7] = category_name, category[5] = category_page_url,
            # category[0] = id (see rce_category column order).
            logging.info("================= Fetching Products for : {} ====================".format(str(category[7])))
            pages = self.get_pages(category[5])

            time.sleep(random.randint(10, 20))  # polite crawl delay

            self.get_product_list(urls=pages, categoryId=category[0])

    def get_pages(self, url):
        """Return *url* plus the absolute URLs of every other pagination page.

        Best-effort: on any scraping failure the pages collected so far are
        returned (at minimum the first page).
        """
        pages = [url]

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)

                page = browser.new_page()
                page.goto(url)

                page.wait_for_load_state('load')

                # The original iterated `for pagination in pagination:`,
                # shadowing the iterable with its own loop variable.
                page_links = page.query_selector(".pagination.ul-pagination").query_selector_all(".change-page")

                for link in page_links:
                    # Page 1 is the URL we already have.
                    if str(link.get_attribute('data-page')).strip() != "1":
                        new_url = str(link.get_attribute('href')).strip()
                        new_url = "https://hasaki.vn" + new_url
                        pages.append(new_url)
                browser.close()
        except Exception as e:
            # Previously swallowed silently; log so missing pagination is visible.
            logging.info("Could not read pagination for %s: %s", url, e)
        finally:
            # NOTE: return-in-finally deliberately suppresses any exception.
            return pages

    def get_product_list(self, urls, categoryId):
        """Scrape product cards from each page URL and insert tracker rows.

        :param urls: pagination URLs of one category.
        :param categoryId: rce_category.id the products belong to.
        """
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)

                page = browser.new_page()

                page_count = 1

                logging.info("Found {} pages. Looping through URLS to get all products.".format(str(len(urls))))
                for url in urls:
                    logging.info("+++++++++++++ Loading page : {} +++++++++++++++++".format(str(page_count)))

                    page.goto(url)

                    page.wait_for_load_state('load')

                    container_element = page.query_selector('.ProductGrid__grid.width_common')
                    if container_element:
                        item_elements = container_element.query_selector_all('.ProductGridItem__itemOuter')
                        item_count = 1
                        for item_element in item_elements:
                            try:
                                product_section = "Base Product Page " + str(page_count)
                                product_name = translate_text_to_english(str(item_element.query_selector('.width_common.name_sp.space_bottom_5').text_content()).strip().replace("'", ""))
                                product_url = str(item_element.query_selector('.v3_thumb_common_sp.relative').get_attribute('href')).strip()
                                product_brand = translate_text_to_english(str(item_element.query_selector('.width_common.txt_color_1.space_bottom_3').text_content()).strip().replace("'", ""))
                                product_rank = item_count

                                # Optional fields: a missing selector keeps the default.
                                product_image = ""
                                try:
                                    product_image = str(item_element.query_selector('.v3_thumb_common_sp.relative').query_selector('.img_thumb.lazy.loaded').get_attribute('src')).strip().replace("'", "")
                                except Exception:
                                    pass

                                gift = ""
                                try:
                                    gift = translate_text_to_english(str(item_element.query_selector('.block_gift_list_item').text_content()).strip().replace("'", ""))
                                except Exception:
                                    pass

                                product_sold = 0
                                try:
                                    # e.g. "1.234" -> 1234 (dots are thousands separators).
                                    product_sold = int(str(item_element.query_selector('.item_count_by').text_content()).strip().replace('.', ''))
                                except Exception:
                                    pass

                                # Parameterized insert: scraped (untrusted) text is
                                # bound via psycopg2 placeholders instead of being
                                # spliced into the SQL string.
                                sql = f"""
                                    insert into {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')}(crawler_name,product_section, product_name, product_url, product_image, product_sold, product_brand, gift, product_rank, categoryid)
                                    values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                                """
                                params = (self.crawler_name, product_section, product_name.replace("'", ""),
                                          product_url, product_image, product_sold, product_brand, gift,
                                          product_rank, categoryId)

                                logging.info("%s -- %s", sql, params)

                                self.cur.execute(sql, params)

                            except Exception as e:
                                print(e)

                            item_count += 1

                    time.sleep(random.randint(10, 30))  # polite crawl delay

                    page_count += 1

                browser.close()
        except Exception as e:
            print(e)
+
+
+
+
diff --git a/hasaki_crawler_engine/hasaki_crawler.py b/hasaki_crawler_engine/hasaki_crawler.py
new file mode 100644
index 0000000..4f7bf56
--- /dev/null
+++ b/hasaki_crawler_engine/hasaki_crawler.py
@@ -0,0 +1,47 @@
+import logging
+import json
+import time
+
+from hasaki_categories import HasakiCategories
+from hasaki_category_products import HasakiCategoryProducts
+from hasaki_product_info import HasakiProductInfo
+
##### Logger ######
# Renamed from `format` so the builtin of the same name is not shadowed.
LOG_FORMAT = "%(asctime)s: %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")

# Populated from conf.json by the __main__ guard before main() runs.
config = {}
+
+
def main():
    """Run the crawler pipeline.

    Only the product-info stage is active; the category and
    category-product stages are currently disabled.
    """
    product_stage = HasakiProductInfo(config)
    product_stage.start_processing()
+
+
+
+if __name__ == "__main__":
+ logging.info("Starting Hasaki Crawler.......")
+ try:
+ logging.info("Loading config file.......")
+ with open("conf.json", "r") as jsonfile:
+ config = json.load(jsonfile)
+ logging.info("Config file loaded.......")
+ print(config)
+
+ main()
+
+ except Exception as e:
+ logging.info("Error: ".format(e))
+ #logging.info("Cannot load config file. Please check. Exiting......")
+ #send_mail()
+ exit(1)
diff --git a/hasaki_crawler_engine/hasaki_db_writer.py b/hasaki_crawler_engine/hasaki_db_writer.py
new file mode 100755
index 0000000..bf3b999
--- /dev/null
+++ b/hasaki_crawler_engine/hasaki_db_writer.py
@@ -0,0 +1,754 @@
+import logging
+import psycopg2
+
###### Logger ######
# `LOG_FORMAT` instead of `format` so the builtin is not shadowed.
LOG_FORMAT = "%(asctime)s: %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
+
+class hasaki_db_writer:
def __init__(self, config):
    """Open an autocommit connection and a cursor from the crawler config."""
    self.config = config
    self.conn = psycopg2.connect(
        database=self.config.get('database'),
        user=self.config.get('db_user'),
        password=self.config.get('db_pass'),
        host=self.config.get('db_host'),
        port=self.config.get('db_port'),
    )
    self.conn.autocommit = True
    self.cur = self.conn.cursor()
+
def __del__(self):
    """Close the database connection when the writer is garbage-collected."""
    logging.info("Closing connection.....")
    self.conn.close()
+
def get_id(self, schema, table):
    """Return the next primary-key value for ``schema.table``.

    Computed as max(id)+1, or 1 for an empty table (max() yields NULL/None).

    NOTE(review): max()+1 is not concurrency-safe; acceptable only while a
    single crawler instance writes at a time — confirm.
    """
    sql = f"""
        select max(id) from {schema}.{table}
    """
    self.cur.execute(sql)
    res = self.cur.fetchone()

    # `is not None` instead of `!= None`; condensed from the if/else.
    return res[0] + 1 if res[0] is not None else 1
+
def get_aud_id(self, schema, table):
    """Return the next audit-key value (max(auditid)+1) for ``schema.table``,
    or 1 for an empty audit table.

    NOTE(review): same max()+1 race caveat as get_id — single-writer only.
    """
    sql = f"""
        select max(auditid) from {schema}.{table}
    """
    self.cur.execute(sql)
    res = self.cur.fetchone()

    # `is not None` instead of `!= None`; condensed from the if/else.
    return res[0] + 1 if res[0] is not None else 1
+
def rce_category(self, data):
    """Upsert one category row and mirror the change into the audit table.

    Three paths:
      * no existing row          -> insert main row + audit snapshot
      * existing row, unchanged  -> refresh updatedat on main + audit
      * existing row, changed    -> update main row + append audit snapshot

    NOTE(review): values are spliced directly into the SQL text; callers
    strip single quotes beforehand, but parameterized queries would be safer.
    """
    # Look up an existing row by (category_name, rce_source_id).
    sql = f"""
        select * from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
    """
    self.cur.execute(sql)
    res = self.cur.fetchone()

    # Next surrogate keys (max+1) for the main and audit tables.
    id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('category_tab'))
    id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('category_tab'))

    if not res:
        sql = f"""
            insert into {self.config.get('crawler_schema')}.{self.config.get('category_tab')}(id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,category_parent_name)
            values({id_main},{data['parent_category_id']},{data['rce_source_id']},{data['rce_source_category_id']},{data['rce_source_status']},'{data['category_page_url']}','{data['category_page_url_hash']}','{data['category_name']}','{data['category_parent_name']}')
        """
        logging.info(sql)

        self.cur.execute(sql)

        # Audit snapshot copied straight from the freshly inserted main row.
        sql = f"""
            insert into {self.config.get('crawler_schema')}.aud_{self.config.get('category_tab')}(auditid,id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name)
            select {id_aud},id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name from {self.config.get('crawler_schema')}.{self.config.get('category_tab')}
            where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
        """

        logging.info(sql)

        self.cur.execute(sql)

    else:
        # Row exists: compare the tracked columns against the stored values.
        if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \
                str(data['category_page_url'])==str(res[5]) and str(data['category_parent_name'])==str(res[12]):
            # Unchanged: only refresh updatedat timestamps.
            sql = f"""
                update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set updatedat=GETDATE() 
                where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)

            sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \
                  "from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0])
            logging.info(sql)
            self.cur.execute(sql)
        else:
            # Changed: update the main row in place...
            sql = f"""
                update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set parent_category_id={data['parent_category_id']}, rce_source_category_id = {data['rce_source_category_id']},
                category_name = '{data['category_name']}', category_page_url = '{data['category_page_url']}', category_page_url_hash = '{data['category_page_url_hash']}', category_parent_name = '{data['category_parent_name']}',
                updatedat=GETDATE() where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)

            # ...and append a new audit snapshot of the updated row.
            sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(auditid,id,parent_category_id,rce_source_id," \
                  "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name) " \
                  "select "+str(id_aud)+", id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
                  "category_name,createdat,updatedat,category_parent_name from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
                  "where category_name = '"+ str(res[7])+"'"
            logging.info(sql)

            self.cur.execute(sql)
+
def rce_product(self, data):
    """Upsert one product row keyed by (rce_source_product_id, rce_source_id)
    and mirror the change into the aud_ audit table.

    Same three-path shape as rce_category: insert / timestamp-touch / update.

    NOTE(review): product_description (res[10]) is NOT part of the change
    comparison below, so a description-only change merely refreshes
    timestamps — confirm that this is intended.
    NOTE(review): values are spliced directly into the SQL text; the update
    branch quotes some numeric price columns — relies on implicit casts.
    """
    # Look up an existing row by source product id and source id.
    sql = f"""
        select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} 
        where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
    """
    self.cur.execute(sql)
    res = self.cur.fetchone()

    # Next surrogate keys (max+1) for the main and audit tables.
    id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('product_tab'))
    id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('product_tab'))

    if not res:
        sql = f"""
            insert into {self.config.get('crawler_schema')}.{self.config.get('product_tab')}(id,rce_source_product_id,rce_source_product_status,product_page_url,
            product_page_url_hash,rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,
            product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings,product_section,
            rce_source_id,countryoforigin,rank,ships_from) values({id_main},{data['rce_source_product_id']},{data['rce_source_product_status']},'{data['product_page_url']}',
            '{data['product_page_url_hash']}',{data['rce_category_id']},{data['rce_brand_id']},{data['rce_store_id']},'{data['rce_source_product_name']}','{data['product_images']}','{data['product_description']}',{data['product_sold_total']},{data['product_sold']},
            {data['product_price_min']},{data['product_price_min_before_discount']},{data['product_price_max']},{data['product_price_max_before_discount']},{data['ratings']},'{data['product_section']}',
            {data['rce_source_id']},'{data['countryoforigin']}',{data['rank']},'{data['ships_from']}')
        """
        logging.info(sql)

        self.cur.execute(sql)

        # Audit snapshot copied from the freshly inserted main row.
        sql = f"""
            insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
            rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
            product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank)
            select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
            rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
            product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
            where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
        """

        logging.info(sql)
        self.cur.execute(sql)
    else:

        # Row exists: compare the tracked columns (description excluded).
        if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \
                str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \
                str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \
                str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \
                str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \
                str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) \
                and str(data['ships_from'])==str(res[18]) and str(data['rce_source_id'])==str(res[21]) \
                and str(data['product_section'])==str(res[22]) and str(data['countryoforigin'])==str(res[23])\
                and str(data['rank'])==str(res[24]):

            # Unchanged: only refresh updatedat timestamps.
            sql = f"""
                update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set updatedat=GETDATE() 
                where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)

            sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \
                  "from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0])
            logging.info(sql)
            self.cur.execute(sql)
        else:
            # Changed: update the main row in place...
            sql = f"""
                update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set rce_source_product_id = {data['rce_source_product_id']}, rce_source_product_status={data['rce_source_product_status']}, product_page_url='{data['product_page_url']}',
                product_page_url_hash='{data['product_page_url_hash']}', rce_category_id={data['rce_category_id']}, rce_brand_id={data['rce_brand_id']}, rce_store_id={data['rce_store_id']},
                rce_source_product_name='{data['rce_source_product_name']}', product_images='{data['product_images']}', product_description='{data['product_description']}', product_sold_total={data['product_sold_total']},
                product_sold={data['product_sold']}, product_price_min='{data['product_price_min']}',product_price_min_before_discount='{data['product_price_min_before_discount']}',
                product_price_max='{data['product_price_max']}', product_price_max_before_discount='{data['product_price_max_before_discount']}', ratings={data['ratings']},
                ships_from='{data['ships_from']}',product_section='{data['product_section']}',countryoforigin='{data['countryoforigin']}',rank={data['rank']}, updatedat=GETDATE()
                where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
            """

            logging.info(sql)
            self.cur.execute(sql)

            # ...and append a new audit snapshot of the updated row.
            sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
                rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
                product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank)
                select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
                rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
                product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
                where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)
+
+
def rce_product_variant(self, data):
    """Upsert one variant row keyed by (rce_source_variant_id, rce_product_id)
    and mirror the change into the aud_ audit table.

    Same three-path shape as rce_category: insert / timestamp-touch / update.

    Bug fix: in the changed-row UPDATE branch the varchar column
    product_variant_sku was interpolated WITHOUT surrounding quotes
    (``product_variant_sku={...}``), producing invalid SQL for any
    non-numeric SKU; it is now quoted like in the INSERT branch.
    """
    # Look up an existing row by source variant id and product id.
    sql = f"""
        select * from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where 
        rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
    """
    self.cur.execute(sql)
    res = self.cur.fetchone()

    # Next surrogate keys (max+1) for the main and audit tables.
    id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('variant_tab'))
    id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('variant_tab'))

    if not res:

        sql = f"""
            insert into {self.config.get('crawler_schema')}.{self.config.get('variant_tab')}(id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku)
            values({id_main},{data['rce_source_variant_id']},{data['rce_product_id']},'{data['product_variant_name']}',{data['product_variant_price']},{data['product_variant_price_before_discount']},{data['product_variant_stock']},'{data['product_variant_sku']}')
        """

        logging.info(sql)

        self.cur.execute(sql)

        # Audit snapshot copied from the freshly inserted main row.
        sql = f"""
            insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat)
            select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat
            from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
        """

        logging.info(sql)
        self.cur.execute(sql)

    else:
        # Row exists: compare the tracked columns against the stored values.
        if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \
                str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6])\
                and str(data['product_variant_sku'])==str(res[9]):

            # Unchanged: only refresh updatedat timestamps.
            sql = f"""
                update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set updatedat=GETDATE() 
                where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)


            sql = f"""
                update {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')} a set updatedat=b.updatedat
                from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} b where a.id=b.id and b.id = {res[0]}
            """

            logging.info(sql)
            self.cur.execute(sql)
        else:

            # Changed: update the main row in place (SKU now quoted)...
            sql = f"""
                update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set rce_source_variant_id={data['rce_source_variant_id']},
                rce_product_id={data['rce_product_id']},product_variant_name='{data['product_variant_name']}',product_variant_price={data['product_variant_price']},
                product_variant_price_before_discount={data['product_variant_price_before_discount']},product_variant_stock={data['product_variant_stock']},
                product_variant_sku='{data['product_variant_sku']}', updatedat=GETDATE()
                where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
            """

            logging.info(sql)
            self.cur.execute(sql)

            # ...and append a new audit snapshot of the updated row.
            sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat)
                select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat
                from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
            """

            logging.info(sql)

            self.cur.execute(sql)
+
+
+    def rce_brand(self, data):
+        """Upsert a brand row and mirror every change into the audit table.
+
+        Keyed on (rce_source_brand_id, rce_source_id):
+          * no existing row  -> insert into brand_tab + audit snapshot
+          * identical row    -> touch updatedat on both tables
+          * changed row      -> update brand_tab + append new audit snapshot
+
+        NOTE(review): values are interpolated directly into SQL text (no
+        parameterization), so quotes in crawled fields can break statements.
+        """
+        sql = f"""
+            select * from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where rce_source_brand_id = {data['rce_source_brand_id']}
+            and rce_source_id = {data['rce_source_id']}
+        """
+        self.cur.execute(sql)
+        res = self.cur.fetchone()
+
+        # Pre-allocated surrogate keys for the main and audit tables.
+        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('brand_tab'))
+        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('brand_tab'))
+
+        if not res:
+            # New brand: insert main row, then copy it into the audit table.
+            sql = f"""
+                insert into {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}(id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating)
+                values({id_main},{data['rce_source_id']},{data['rce_source_brand_id']},{data['rce_source_brand_status']},'{data['brand_page_url']}','{data['brand_page_url_hash']}','{data['brand_name']}',{data['brand_following']},{data['brand_rating']})
+            """
+
+            logging.info(sql)
+
+            self.cur.execute(sql)
+
+            sql = f"""
+                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat)
+                select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}
+                where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
+            """
+            logging.info(sql)
+
+            self.cur.execute(sql)
+
+        else:
+
+            # Field-by-field comparison (as strings) against the stored row to
+            # decide whether anything actually changed.
+            if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \
+                    str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]) and str(data['rce_source_brand_id'])==str(res[2]):
+
+                # Unchanged: only refresh the freshness timestamp.
+                sql = f"""
+                    update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
+                """
+                logging.info(sql)
+                self.cur.execute(sql)
+
+                # Propagate the same timestamp to the audit copy.
+                sql = f"""
+                    update {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')} a set updatedat=b.updatedat
+                    from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} b where a.id=b.id and b.id = {res[0]} and
+                    b.rce_source_id = {data['rce_source_id']}
+                """
+                logging.info(sql)
+                self.cur.execute(sql)
+            else:
+                # Changed: overwrite the main row, then append a fresh audit
+                # snapshot carrying the new audit id.
+                sql = f"""
+                    update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set rce_source_id={data['rce_source_id']}, rce_source_brand_id={data['rce_source_brand_id']},
+                    rce_source_brand_status={data['rce_source_brand_status']}, brand_page_url='{data['brand_page_url']}', brand_page_url_hash='{data['brand_page_url_hash']}',
+                    brand_name='{data['brand_name']}', brand_following={data['brand_following']}, brand_rating={data['brand_rating']}, updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']}
+                    and rce_source_id = {data['rce_source_id']}
+                """
+
+                logging.info(sql)
+                self.cur.execute(sql)
+
+                sql = f"""
+                    insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat)
+                    select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}
+                    where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
+                """
+
+                logging.info(sql)
+
+                self.cur.execute(sql)
+
+ def rce_reseller(self, data):
+ data['reseller_name'] = data['reseller_name']
+
+ sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'"
+ self.cur.execute(sql)
+ res = self.cur.fetchone()
+
+ id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_tab'))
+ id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_tab'))
+
+
+ if not res:
+
+ sql = f"""
+ insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')}(id,rce_source_id,rce_source_reseller_status,reseller_name)
+ values({id_main},'{data['rce_source_id']}','{data['rce_source_reseller_status']}','{data['reseller_name']}')
+ """
+ #logging.info(sql)
+
+ self.cur.execute(sql)
+
+ sql = f"""
+ insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_tab')}(auditid,id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat)
+ select {id_aud}, id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')}
+ where reseller_name='{data['reseller_name']}'
+ """
+ #logging.info(sql)
+
+ self.cur.execute(sql)
+
+ else:
+
+ if str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]):
+
+ sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=GETDATE() " \
+ "where reseller_name = '"+ str(res[4])+"'"
+ #logging.info(sql)
+ self.cur.execute(sql)
+
+ sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \
+ "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0])
+ #logging.info(sql)
+ self.cur.execute(sql)
+ else:
+
+ sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
+ "rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \
+ "'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=GETDATE() where reseller_name = '"+ str(res[4])+"'"
+ #logging.info(sql)
+ self.cur.execute(sql)
+
+ sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (auditid,id,rce_source_id,rce_source_reseller_status," \
+ "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select "+str(id_aud)+", id,rce_source_id,rce_source_reseller_status," \
+ "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
+ ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'"
+ #logging.info(sql)
+
+ self.cur.execute(sql)
+
+ def rce_reseller_store(self, data):
+
+ data['store_page_url'] = data['store_page_url'].replace("'","''")
+
+ sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'"
+ self.cur.execute(sql)
+ res = self.cur.fetchone()
+
+ id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_store_tab'))
+ id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_store_tab'))
+
+ if not res:
+
+ sql = f"""
+ insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')}(id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,rce_source_id)
+ values({id_main},'{data['rce_source_store_status']}','{data['store_page_url']}','{data['store_page_url_hash']}',{data['rce_reseller_id']},{data['rce_source_id']})
+ """
+ #logging.info(sql)
+
+ self.cur.execute(sql)
+
+ sql = f"""
+ insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_store_tab')}(auditid,id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id)
+ select {id_aud}, id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id from {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')}
+ where store_page_url= '{data['store_page_url']}'
+ """
+ #logging.info(sql)
+
+ self.cur.execute(sql)
+
+ else:
+
+ if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \
+ str(data['store_page_url_hash'])==str(res[4]) and \
+ str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]):
+
+ sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=GETDATE() " \
+ "where store_page_url = '"+ str(res[3])+"'"
+ #logging.info(sql)
+ self.cur.execute(sql)
+
+ sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \
+ "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0])
+ #logging.info(sql)
+ self.cur.execute(sql)
+ else:
+
+ sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \
+ "rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \
+ "'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \
+ "updatedat=GETDATE(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'"
+ #logging.info(sql)
+ self.cur.execute(sql)
+
+ sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (auditid,id,rce_source_store_status," \
+ "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select "+id_aud+", id,rce_source_store_status," \
+ "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
+ ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'"
+ #logging.info(sql)
+
+ self.cur.execute(sql)
+
+ def rce_ratings_reviews(self, data):
+
+ sql = f"""
+ select * from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
+ where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
+ """
+
+ self.cur.execute(sql)
+ res = self.cur.fetchone()
+
+ data['username'] = data['username'].replace("'","")
+
+ id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('review_tab'))
+ id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('review_tab'))
+
+ if not res:
+
+ sql = f"""
+ insert into {self.config.get('crawler_schema')}.{self.config.get('review_tab')}(id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating)
+ values({id_main},{data['rce_product_id']},'{data['username']}','{data['review']}','{data['img_url']}',{data['review_like_count']},'{data['user_tier']}',{data['shop_id']},'{data['video_url']}',{data['rating']})
+ """
+
+ logging.info(sql)
+
+ self.cur.execute(sql)
+ sql = f"""
+ insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat)
+ select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
+ where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
+ """
+
+ logging.info(sql)
+
+ self.cur.execute(sql)
+
+ else:
+
+ if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \
+ str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \
+ str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]):
+
+
+ sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=GETDATE() " \
+ "where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'"
+ logging.info(sql)
+ self.cur.execute(sql)
+
+ sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \
+ "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0])
+ logging.info(sql)
+ self.cur.execute(sql)
+ else:
+
+ sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \
+ "username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \
+ "'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \
+ "shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=GETDATE() " \
+ "where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'"
+ logging.info(sql)
+ self.cur.execute(sql)
+
+ sql = f"""
+ insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat)
+ select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
+ where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
+ """
+ logging.info(sql)
+
+ self.cur.execute(sql)
+
+    def rce_seo(self, data):
+        """Upsert a product-SEO row and mirror every change into the audit table.
+
+        Keyed on (rce_product_id, rce_source_id):
+          * no existing row  -> insert into seo_tab + audit snapshot
+          * identical row    -> touch updatedat on both tables
+          * changed row      -> update seo_tab + append new audit snapshot
+
+        NOTE(review): values are interpolated directly into SQL text (no
+        parameterization), so quotes in crawled SEO fields can break statements.
+        """
+        sql = f"""
+            select * from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
+            where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
+        """
+
+        self.cur.execute(sql)
+        res = self.cur.fetchone()
+
+        # Pre-allocated surrogate keys for the main and audit tables.
+        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('seo_tab'))
+        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('seo_tab'))
+
+        if not res:
+            # New SEO record: insert main row, then copy it into the audit table.
+            sql = f"""
+                insert into {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}(id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots)
+                values({id_main},{data['rce_product_id']},{data['rce_source_id']},'{data['seo_title']}','{data['seo_description']}','{data['seo_url']}','{data['seo_url_hash']}','{data['seo_image']}',{data['seo_price_amount']},'{data['seo_price_currency']}','{data['seo_product_band']}','{data['seo_product_availability']}','{data['seo_product_category']}',
+                '{data['seo_product_condition']}',{data['seo_product_retailer_item_id']},'{data['seo_product_robots']}')
+            """
+
+            logging.info(sql)
+
+            self.cur.execute(sql)
+            sql = f"""
+                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat)
+                select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
+                where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
+            """
+
+            logging.info(sql)
+
+            self.cur.execute(sql)
+
+        else:
+
+            # Field-by-field comparison (as strings) against the stored row to
+            # decide whether anything actually changed.
+            if (str(data['rce_product_id']) == str(res[1]) and str(data['rce_source_id']) == str(res[2]) and str(data['seo_title']) == str(res[3]) and \
+                str(data['seo_description']) == str(res[4]) and str(data['seo_url']) == str(res[5]) and str(data['seo_url_hash']) == str(res[6]) and \
+                str(data['seo_image']) == str(res[7]) and str(data['seo_price_amount']) == str(res[8]) and str(data['seo_price_currency']) == str(res[9]) and \
+                str(data['seo_product_band']) == str(res[10])) and str(data['seo_product_availability']) == str(res[11]) and str(data['seo_product_category']) == str(res[12]) and \
+                str(data['seo_product_condition']) == str(res[13]) and str(data['seo_product_retailer_item_id']) == str(res[14]) and str(data['seo_product_robots']) == str(res[15]):
+
+                # Unchanged: only refresh the freshness timestamp on both tables.
+                sql = "update " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " set updatedat=GETDATE() " \
+                      "where rce_product_id = " + str(res[1]) + " and rce_source_id =" + str(data['rce_source_id'])
+                logging.info(sql)
+                self.cur.execute(sql)
+
+                sql = "update " + self.config.get('crawler_schema') + ".aud_" + self.config.get('seo_tab') + " a set updatedat=b.updatedat " \
+                      "from " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " b where a.id=b.id and b.id = " + str(res[0])
+                logging.info(sql)
+                self.cur.execute(sql)
+            else:
+                # Changed: overwrite the main row, then append a new audit snapshot.
+                sql = f"""
+                    update {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} set rce_product_id={data['rce_product_id']}, rce_source_id={data['rce_source_id']}, seo_title='{data['seo_title']}', seo_description='{data['seo_description']}',
+                    seo_url='{data['seo_url']}', seo_url_hash='{data['seo_url_hash']}', seo_image='{data['seo_image']}', seo_price_amount='{data['seo_price_amount']}', seo_price_currency='{data['seo_price_currency']}', seo_product_band='{data['seo_product_band']}',
+                    seo_product_availability='{data['seo_product_availability']}', seo_product_category='{data['seo_product_category']}', seo_product_condition='{data['seo_product_condition']}', seo_product_retailer_item_id={data['seo_product_retailer_item_id']},
+                    seo_product_robots='{data['seo_product_robots']}' where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
+                """
+
+                logging.info(sql)
+                self.cur.execute(sql)
+
+                sql = f"""
+                    insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat)
+                    select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
+                    where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
+                """
+                logging.info(sql)
+
+                self.cur.execute(sql)
+
+
+
+ # def rce_ratings_reviews_productmodels(self,data):
+ #
+ # sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id'])
+ # self.cur.execute(sql)
+ # res = self.cur.fetchone()
+ #
+ #
+ # if not res:
+ #
+ # sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \
+ # "values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')"
+ # #logging.info(sql)
+ #
+ # self.cur.execute(sql)
+ #
+ # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
+ # "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
+ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+""
+ # #logging.info(sql)
+ #
+ # self.cur.execute(sql)
+ #
+ # else:
+ #
+ # if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]):
+ #
+ # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=GETDATE() " \
+ # "where rce_rating_id = "+ str(res[1])
+ # #logging.info(sql)
+ # self.cur.execute(sql)
+ #
+ # sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \
+ # "from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0])
+ # #logging.info(sql)
+ # self.cur.execute(sql)
+ # else:
+ #
+ # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \
+ # "updatedat=GETDATE() where rce_source_store_id = "+ str(res[1])
+ # #logging.info(sql)
+ # self.cur.execute(sql)
+ #
+ # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
+ # "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
+ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+""
+ # #logging.info(sql)
+ #
+ # self.cur.execute(sql)
+ #
+ #
+ # def rce_tags(self,data):
+ #
+ # sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'"
+ # self.cur.execute(sql)
+ # res = self.cur.fetchone()
+ #
+ #
+ # if not res:
+ #
+ # sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \
+ # "values("+str(data['id'])+",'"+str(data['description'])+"')"
+ # #logging.info(sql)
+ #
+ # self.cur.execute(sql)
+ #
+ # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
+ # "createdat,updatedat) select id,description,createdat,updatedat from " \
+ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
+ # #logging.info(sql)
+ #
+ # self.cur.execute(sql)
+ #
+ # else:
+ #
+ # if str(data['description'])==str(res[1]):
+ #
+ # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=GETDATE() " \
+ # "where description = '"+ str(res[1])+"'"
+ # #logging.info(sql)
+ # self.cur.execute(sql)
+ #
+ # sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \
+ # "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
+ # #logging.info(sql)
+ # self.cur.execute(sql)
+ # else:
+ #
+ # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \
+ # "updatedat=GETDATE() where description = "+ str(res[1])
+ # #logging.info(sql)
+ # self.cur.execute(sql)
+ #
+ # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
+ # "createdat,updatedat) select id,description,createdat,updatedat from " \
+ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'"
+ # #logging.info(sql)
+ #
+ # self.cur.execute(sql)
+ #
+ #
+ # def rce_ratings_reviews_producttags(self,data):
+ #
+ # sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'"
+ # self.cur.execute(sql)
+ # res = self.cur.fetchone()
+ #
+ #
+ # if not res:
+ #
+ # sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \
+ # "values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')"
+ # #logging.info(sql)
+ #
+ # self.cur.execute(sql)
+ #
+ # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
+ # "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
+ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
+ # #logging.info(sql)
+ #
+ # self.cur.execute(sql)
+ #
+ # else:
+ #
+ # if str(data['rce_rating_id'])==str(res[1]):
+ #
+ # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=GETDATE() " \
+ # "where rce_rating_id = '"+ str(res[1])+"'"
+ # #logging.info(sql)
+ # self.cur.execute(sql)
+ #
+ # sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \
+ # "from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
+ # #logging.info(sql)
+ # self.cur.execute(sql)
+ # else:
+ #
+ # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \
+ # "updatedat=GETDATE() where rce_rating_id = "+ str(res[1])
+ # #logging.info(sql)
+ # self.cur.execute(sql)
+ #
+ # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
+ # "createdat,updatedat) select id,description,createdat,updatedat from " \
+ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'"
+ # #logging.info(sql)
+ #
+ # self.cur.execute(sql)
+ #
+ #
+
diff --git a/hasaki_crawler_engine/hasaki_product_info.py b/hasaki_crawler_engine/hasaki_product_info.py
new file mode 100644
index 0000000..7c5584d
--- /dev/null
+++ b/hasaki_crawler_engine/hasaki_product_info.py
@@ -0,0 +1,454 @@
+import hashlib
+import logging
+import random
+import string
+import time
+import re
+import psycopg2
+from playwright.sync_api import sync_playwright
+from deep_translator import GoogleTranslator
+from hasaki_db_writer import hasaki_db_writer
+import pandas as pd
+from bs4 import BeautifulSoup
+from Util import translate_text_to_english
+
class HasakiProductInfo:
    """Collects Hasaki product-detail data and writes it to the crawler DB.

    For every pending row in the tracker table the class loads the product
    page with a mobile-emulating Playwright browser, captures the JSON body
    of the site's ``/wap/v2/product/detail`` API call, and persists brand,
    product, variant, review and SEO records via ``hasaki_db_writer``.
    """

    def __init__(self, config):
        """Open the DB connection and resolve the Hasaki source id.

        :param config: mapping with DB credentials, schema/table names,
            ``crawler_name`` and ``product_per_category``.
        """
        logging.info("Initializing HasakiProductInfo")
        # Character class matching any ASCII punctuation; used to scrub
        # free-text fields before translation/storage.
        self.pattern = r'[' + string.punctuation + ']'
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'),
                                     user=self.config.get('db_user'),
                                     password=self.config.get('db_pass'),
                                     host=self.config.get('db_host'),
                                     port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.cur.execute(
            f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
        source_row = self.cur.fetchone()
        if source_row is None:
            # Without a source id nothing can be written; abort early.
            logging.info("Source tab is empty. Please check. Exiting.....")
            exit(1)
        self.rce_source_id = source_row[0]

        self.db_writer = hasaki_db_writer(config)

    def __del__(self):
        # Best-effort cleanup; the connection may not exist if __init__ failed.
        print("Closing connection.....")
        try:
            self.conn.close()
        except Exception:
            pass

    def start_processing(self):
        """Process every unprocessed tracker row (flag = 0), oldest order first."""
        logging.info("Starting to collect product info from Hasaki........")

        logging.info("Fetching product list from DB......")

        sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} where flag = 0
            order by categoryid, product_section, product_rank
        """

        self.cur.execute(sql)
        rows = self.cur.fetchall()
        logging.info("Found {} products.......".format(len(rows)))
        for cnt, row in enumerate(rows, start=1):
            logging.info("========= Fetching product info {}/{}: {} =========".format(cnt, len(rows), row[3]))

            try:
                self.get_product_info(row)
            except Exception as e:
                # Best-effort: one broken product must not stop the run,
                # but the failure is logged instead of silently swallowed.
                logging.info(e)

            # BUG FIX: the original WHERE clause separated its predicates with
            # commas, which is invalid SQL, so the flag update always failed
            # and rows were re-crawled forever. Predicates are now AND-ed and
            # values are passed as bind parameters (URLs may contain quotes).
            sql = f"""
                update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1
                where categoryid = %s and product_section = %s and product_rank = %s and product_url = %s
            """
            self.cur.execute(sql, (row[9], row[1], row[8], row[3]))

    def get_product_info(self, data):
        """Fetch the API payload for one tracker row and persist everything.

        :param data: tracker row tuple (index 3 is the product URL).
        """
        raw_data = self.get_raw_product_data(data[3])

        logging.debug(raw_data)

        if raw_data:
            self.product_info(data, raw_data)

            self.rating_info(raw_data)

            self.seo_info(raw_data)

    def get_raw_product_data(self, url):
        """Open *url* with a mobile UA and return the parsed JSON body of the
        ``/wap/v2/product/detail`` request the page issues while loading."""
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(
                user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
            page = context.new_page()

            try:
                # BUG FIX: the watcher must be armed BEFORE navigation starts;
                # the detail XHR fires during page load, so waiting for it only
                # after goto() can miss the response and time out.
                with page.expect_response("**/wap/v2/product/detail**") as response_info:
                    page.goto(url)
                return response_info.value.json()
            finally:
                browser.close()

    def product_info(self, data, raw_data):
        """Persist brand, product and variant records for one product.

        :param data: tracker row (indexes used: 1=product_section,
            4=product_images, 8=product_rank, 9=categoryid).
        :param raw_data: parsed JSON payload of the product-detail API.
        """
        # region rce_brand
        data_brand = {
            'rce_source_id': self.rce_source_id,
            'rce_source_brand_status': 1,
            'rce_source_brand_id': 0,
            'brand_page_url': "",
            'brand_page_url_hash': "",
            'brand_name': "",
            'brand_following': "",
            'brand_rating': "",
        }

        try:
            data_brand['rce_source_brand_id'] = raw_data['brand']['id']

            try:
                url = "https://hasaki.vn/" + raw_data['brand']['url'] + ".html"
                # Single quotes are stripped because values are later embedded
                # in quoted SQL strings by the writer.
                data_brand['brand_page_url'] = url.replace("'", "")
                data_brand['brand_page_url_hash'] = hashlib.md5(
                    data_brand['brand_page_url'].encode('utf-8')).hexdigest()
            except Exception:
                pass

            try:
                data_brand['brand_name'] = translate_text_to_english(
                    str(raw_data['brand']['name']).replace("'", ""))
            except Exception:
                pass

            try:
                data_brand['brand_following'] = raw_data['brand']['following']
            except Exception:
                pass

            try:
                data_brand['brand_rating'] = raw_data['brand']['rating']
            except Exception:
                pass

            try:
                self.db_writer.rce_brand(data_brand)
            except Exception as e:
                logging.info(e)
        except Exception:
            # Products without a brand section are stored without a brand.
            pass
        # endregion

        # region rce_product
        data_product = {}
        try:
            data_product['rce_source_product_id'] = raw_data['id']
            data_product['rce_source_id'] = self.rce_source_id
            data_product['rce_source_product_status'] = 1
            data_product['product_page_url'] = str(raw_data['url']).replace("'", "")
            data_product['product_page_url_hash'] = hashlib.md5(
                data_product['product_page_url'].encode('utf-8')).hexdigest()
            data_product['rce_category_id'] = data[9]
            data_product['rce_store_id'] = 0

            # Combined Vietnamese + alternate name, punctuation-stripped and
            # machine-translated to English.
            name = str(raw_data['name']) + str(raw_data['alt_name'])
            name = translate_text_to_english(str(re.sub(self.pattern, '', name)))
            data_product['rce_source_product_name'] = str(name).replace("'", "")

            data_product['product_images'] = data[4]

            data_product['product_description'] = ""
            try:
                # Description arrives as HTML; keep only its text content.
                soup = BeautifulSoup(raw_data['description'], 'html.parser')
                desc = translate_text_to_english(
                    re.sub(self.pattern, '', soup.get_text()).replace("'", ""))
                data_product['product_description'] = str(desc).replace("'", "")
            except Exception:
                pass

            data_product['rce_brand_id'] = ""
            try:
                sql = f"""
                    select id from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where
                    rce_source_id = {self.rce_source_id} and rce_source_brand_id = {raw_data['brand']['id']}
                """
                self.cur.execute(sql)
                data_product['rce_brand_id'] = self.cur.fetchone()[0]
            except Exception:
                pass

            data_product['product_sold_total'] = 0
            data_product['product_sold'] = raw_data.get('bought', 0)

            # The API exposes a single final price; min and max are the same.
            price = raw_data.get('int_final_price', 0)
            data_product['product_price_min'] = price
            data_product['product_price_max'] = price

            old_price = raw_data.get('price', 0)
            data_product['product_price_min_before_discount'] = old_price
            data_product['product_price_max_before_discount'] = old_price

            data_product['ratings'] = 0.0
            try:
                data_product['ratings'] = raw_data['rating']['avg_rate']
            except Exception:
                pass

            data_product['ships_from'] = ""
            data_product['product_section'] = data[1]
            data_product['countryoforigin'] = ""
            data_product['rank'] = data[8]

            try:
                self.db_writer.rce_product(data_product)
            except Exception as e:
                logging.info(e)

            self._variant_info(data_product, raw_data)
        except Exception as e:
            logging.info(e)
        # endregion

    def _variant_info(self, data_product, raw_data):
        """Persist one rce_product_variant row per SKU of the product.

        :param data_product: the product record already written (source ids
            are reused for the product-id lookup).
        :param raw_data: parsed JSON payload of the product-detail API.
        """
        # Flatten attribute -> option -> product into plain rows, then build
        # the DataFrame once (cheaper than pd.concat per row).
        variant_rows = []
        for attribute in raw_data['attribute']['items']:
            for option in attribute['options']:
                for product in option['products']:
                    variant_rows.append({
                        'product_variant_name': option['long_label'],
                        'rce_source_variant_id': product['id'],
                        'product_variant_price': product['price'],
                        'product_variant_stock': product['quantity'],
                        'product_variant_sku': product['sku'],
                    })

        df_variant = pd.DataFrame(variant_rows,
                                  columns=['product_variant_name', 'rce_source_variant_id',
                                           'product_variant_price', 'product_variant_stock',
                                           'product_variant_sku'])

        # One SKU can appear under several attribute options (e.g. colour and
        # size); merge those rows into one record whose name is the
        # space-joined option labels.
        df_variant_merged = df_variant.groupby('product_variant_sku').agg({
            'product_variant_name': ' '.join,
            'rce_source_variant_id': 'first',
            'product_variant_price': 'first',
            'product_variant_stock': 'first'
        }).reset_index()

        # The product id is loop-invariant, so look it up once.
        sql = f"""
            select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where
            rce_source_product_id = {data_product['rce_source_product_id']} and rce_source_id = {data_product['rce_source_id']}
        """
        self.cur.execute(sql)
        rce_product_id = self.cur.fetchone()[0]

        for _, row in df_variant_merged.iterrows():
            try:
                name = translate_text_to_english(row['product_variant_name'])
                data_variant = {
                    'rce_source_variant_id': row['rce_source_variant_id'],
                    'product_variant_name': re.sub(self.pattern, '', name).replace("'", ""),
                    'product_variant_price': row['product_variant_price'],
                    'product_variant_price_before_discount': 0,
                    'product_variant_stock': row['product_variant_stock'],
                    'product_variant_sku': row['product_variant_sku'],
                    'rce_product_id': rce_product_id,
                }

                try:
                    self.db_writer.rce_product_variant(data_variant)
                except Exception as e:
                    logging.info(e)
            except Exception as e:
                logging.info(e)

    def rating_info(self, raw_data):
        """Persist the short review list (image reviews + text reviews)."""
        try:
            short_rating = raw_data.get('short_rating_data') or {}
            reviews = (short_rating.get('image_reviews') or []) + \
                      (short_rating.get('reviews') or [])

            # The product id is the same for every review; fetch it once.
            sql = f"""
                select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where
                rce_source_product_id = {raw_data['id']} and rce_source_id = {self.rce_source_id}
            """
            self.cur.execute(sql)
            rce_product_id = self.cur.fetchone()[0]

            for review in reviews:
                data_review = {
                    'rce_product_id': rce_product_id,
                    'username': "",
                    'review': "",
                    'img_url': "",
                    'review_like_count': 0,
                    'user_tier': "",
                    'shop_id': 0,
                    'video_url': "",
                    'rating': "",
                }

                try:
                    data_review['username'] = str(review['user_fullname']).replace("'", "")
                except Exception:
                    pass

                try:
                    data_review['review'] = translate_text_to_english(review['content']).replace("'", "")
                except Exception:
                    pass

                try:
                    data_review['rating'] = review['rating']['star']
                except Exception:
                    pass

                try:
                    self.db_writer.rce_ratings_reviews(data_review)
                except Exception as e:
                    logging.info(e)
        except Exception as e:
            logging.info(e)

    def seo_info(self, raw_data):
        """Persist the rce_seo record built from the payload's ``seo`` map."""
        try:
            seo = raw_data.get('seo') or {}

            def translated(key):
                # Translated, quote-stripped text field; '' when missing/broken.
                try:
                    return translate_text_to_english(seo[key]).replace("'", "")
                except Exception:
                    return ""

            def plain(key):
                # Quote-stripped text field stored without translation.
                try:
                    return str(seo[key]).replace("'", "")
                except Exception:
                    return ""

            sql = f"""
                select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where
                rce_source_product_id = {raw_data['id']} and rce_source_id = {self.rce_source_id}
            """
            self.cur.execute(sql)
            rce_product_id = self.cur.fetchone()[0]

            data_seo = {
                'rce_product_id': rce_product_id,
                'rce_source_id': self.rce_source_id,
                'seo_title': translated('og:title'),
                'seo_description': translated('og:description'),
                'seo_url': plain('og:url'),
                'seo_url_hash': "",
                'seo_image': plain('og:image'),
                'seo_price_amount': seo.get('price:amount', 0),
                'seo_price_currency': plain('price:currency'),
                'seo_product_band': translated('product:band'),
                'seo_product_availability': plain('product:availability'),
                'seo_product_category': translated('product:category'),
                'seo_product_condition': translated('product:condition'),
                'seo_product_retailer_item_id': seo.get('product:retailer_item_id', 0),
                'seo_product_robots': seo.get('product:robots', ""),
            }

            try:
                self.db_writer.rce_seo(data_seo)
            except Exception as e:
                logging.info(e)
        except Exception as e:
            logging.info(e)
+
+
+
+
+
+
+
diff --git a/hasaki_crawler_engine/test.py b/hasaki_crawler_engine/test.py
new file mode 100644
index 0000000..728385b
--- /dev/null
+++ b/hasaki_crawler_engine/test.py
@@ -0,0 +1,63 @@
import time
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
import pandas as pd

# Scratch script: capture the product-detail API payload of one Hasaki
# product and merge its variant options into one row per SKU.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    # iPhone user agent so the site serves the mobile (wap) experience,
    # which is what issues the /wap/v2/product/detail request.
    context = browser.new_context(user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
    page = context.new_page()

    # BUG FIX: arm the response watcher BEFORE navigating. The detail XHR
    # fires during page load, so starting to wait only after the page has
    # finished loading can miss it and time out.
    with page.expect_response("**/wap/v2/product/detail**") as response_info:
        page.goto("https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi-mat-l-oreal-3-in-1-danh-cho-da-dau-da-hon-hop-400ml-19325.html")
    data = response_info.value.json()

    # Flatten attribute -> option -> product into plain rows, then build the
    # DataFrame once instead of pd.concat per row.
    rows = []
    for variant in data['attribute']['items']:
        for item in variant['options']:
            for product in item['products']:
                row = {
                    'product_variant_name': item['long_label'],
                    'rce_source_variant_id': product['id'],
                    'product_variant_price': product['price'],
                    'product_variant_stock': product['quantity'],
                    'product_variant_sku': product['sku'],
                }
                rows.append(row)
                print(row)

    df = pd.DataFrame(rows, columns=['product_variant_name', 'rce_source_variant_id',
                                     'product_variant_price', 'product_variant_stock',
                                     'product_variant_sku'])

    df = df.sort_values(by=['product_variant_sku'])
    print(df.to_string())

    print("======================================")

    # A SKU can appear under several options; join their labels into one name.
    merged_df = df.groupby('product_variant_sku').agg({
        'product_variant_name': ' '.join,
        'rce_source_variant_id': 'first',
        'product_variant_price': 'first',
        'product_variant_stock': 'first'
    }).reset_index()

    print(merged_df.to_string())

    # Close the browser
    browser.close()
diff --git a/hasaki_crawler_engine/test2.py b/hasaki_crawler_engine/test2.py
new file mode 100644
index 0000000..bdef230
--- /dev/null
+++ b/hasaki_crawler_engine/test2.py
@@ -0,0 +1,25 @@
import asyncio
from playwright.async_api import async_playwright

async def main():
    """Fetch and print the Hasaki product-detail API payload for one URL."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context()

        page = await context.new_page()

        # Let the detail API request pass through untouched.
        await page.route('https://hasaki.vn/wap/v2/product/detail', lambda route: route.continue_())

        # BUG FIX: the original waited for the *request* event after goto()
        # and then did `await response.response.json()` — but `Request.response`
        # is an async method, not an object with `.json()`, and the watcher
        # was armed too late to catch a request fired during load. Arm
        # expect_response() around the navigation and read the JSON body from
        # the Response object instead.
        async with page.expect_response(lambda resp: 'v2/product/detail' in resp.url) as response_info:
            await page.goto('https://hasaki.vn/san-pham/nuoc-hoa-hong-khong-mui-klairs-danh-cho-da-nhay-cam-180ml-65994.html')

        response = await response_info.value
        json_response = await response.json()

        print(json_response)

        await browser.close()

asyncio.run(main())