added Hasaki crawler

This commit is contained in:
Shariar Imtiaz 2024-03-14 09:16:59 +04:00
parent e2568e7979
commit 45e6965679
12 changed files with 1792 additions and 0 deletions

12
.idea/dataSources.xml Normal file
View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
<data-source source="LOCAL" name="ProdRedshift" uuid="e343ac8d-80ff-44dc-8d8d-5d18fc755a70">
<driver-ref>redshift</driver-ref>
<synchronize>true</synchronize>
<jdbc-driver>com.amazon.redshift.jdbc.Driver</jdbc-driver>
<jdbc-url>jdbc:redshift://redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com:5439/analytics</jdbc-url>
<working-dir>$ProjectFileDir$</working-dir>
</data-source>
</component>
</project>

6
.idea/sqldialects.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="SqlDialectMappings">
<file url="file://$PROJECT_DIR$/hasaki_crawler_engine/changes.sql" dialect="Redshift" />
</component>
</project>

View File

@ -0,0 +1,24 @@
from deep_translator import GoogleTranslator
# def translate_text_to_english(text):
# if text:
# translated = GoogleTranslator(source='auto', target='en').translate(text)
# return translated
# return text
#
def translate_text_to_english(text):
    """Translate *text* into English via Google Translate.

    The translator rejects very long inputs, so the text is cut into
    4800-character slices that are translated one at a time and re-joined
    with single spaces.

    NOTE(review): slicing is position-based, so a chunk boundary can split a
    word, and the space used to re-join is not part of the original text —
    acceptable for fuzzy product names, but confirm before reuse elsewhere.

    Falsy input (None, "", 0) is returned unchanged without any network call.
    """
    if not text:
        return text
    limit = 4800
    pieces = []
    for start in range(0, len(text), limit):
        segment = text[start:start + limit]
        pieces.append(GoogleTranslator(source='auto', target='en').translate(segment))
    return ' '.join(pieces)

View File

@ -0,0 +1,78 @@
-- Schema changes for the Hasaki crawler (Amazon Redshift dialect).

-- Categories now also carry their parent's name (denormalized alongside parent_category_id).
ALTER TABLE test_spider_management.rce_category ADD category_parent_name varchar(24000) NULL;
ALTER TABLE test_spider_management.aud_rce_category ADD category_parent_name varchar(24000) NULL;

-- Work queue for the Hasaki crawler: one row per product tile scraped from a
-- category listing page. `flag` defaults to 0 — presumably marks whether the
-- row has been processed by a later stage; confirm against the crawler code.
CREATE TABLE IF NOT EXISTS test_spider_management.crawler_tracker_hasaki
(
crawler_name VARCHAR(24000) ENCODE lzo
,product_section VARCHAR(24000) ENCODE lzo
,product_name VARCHAR(24000) ENCODE lzo
,product_url VARCHAR(24000) ENCODE lzo
,product_image VARCHAR(24000) ENCODE lzo
,product_sold INTEGER NOT NULL ENCODE az64
,product_brand VARCHAR(24000) ENCODE lzo
,gift VARCHAR(24000) ENCODE lzo
,product_rank INTEGER NOT NULL ENCODE az64
,categoryid INTEGER NOT NULL ENCODE az64
,flag SMALLINT DEFAULT 0 ENCODE az64
)
DISTSTYLE AUTO
;

-- Hasaki exposes follower counts and ratings per brand.
ALTER TABLE test_spider_management.rce_brand ADD brand_following int8 NULL;
ALTER TABLE test_spider_management.rce_brand ADD brand_rating int8 NULL;
ALTER TABLE test_spider_management.aud_rce_brand ADD brand_following int8 NULL;
ALTER TABLE test_spider_management.aud_rce_brand ADD brand_rating int8 NULL;

-- Source-site SKU string per product variant.
ALTER TABLE test_spider_management.rce_product_variant ADD product_variant_sku varchar(1000) NULL;
ALTER TABLE test_spider_management.aud_rce_product_variant ADD product_variant_sku varchar(1000) NULL;

-- SEO metadata scraped from product pages (one row per product/source).
CREATE TABLE IF NOT EXISTS test_spider_management.rce_seo
(
id INTEGER ENCODE az64
,rce_product_id INTEGER ENCODE az64
,rce_source_id INTEGER ENCODE az64
,seo_title VARCHAR(2000) ENCODE lzo
,seo_description VARCHAR(10000) ENCODE lzo
,seo_url VARCHAR(2000) ENCODE lzo
,seo_url_hash VARCHAR(2000) ENCODE lzo
,seo_image VARCHAR(2000) ENCODE lzo
,seo_price_amount BIGINT ENCODE az64
,seo_price_currency VARCHAR(2000) ENCODE lzo
,seo_product_band VARCHAR(2000) ENCODE lzo
,seo_product_availability VARCHAR(2000) ENCODE lzo
,seo_product_category VARCHAR(2000) ENCODE lzo
,seo_product_condition VARCHAR(2000) ENCODE lzo
,seo_product_retailer_item_id BIGINT ENCODE az64
,seo_product_robots VARCHAR(2000) ENCODE lzo
,createdat TIMESTAMP WITHOUT TIME ZONE DEFAULT getdate() ENCODE az64
,updatedat TIMESTAMP WITHOUT TIME ZONE DEFAULT getdate() ENCODE az64
)
DISTSTYLE AUTO
;

-- Audit mirror of rce_seo: same columns plus auditid / audit_createdat,
-- matching the aud_* convention used by the other crawler tables.
CREATE TABLE IF NOT EXISTS test_spider_management.aud_rce_seo
(
auditid INTEGER ENCODE az64
,id INTEGER ENCODE az64
,rce_product_id INTEGER ENCODE az64
,rce_source_id INTEGER ENCODE az64
,seo_title VARCHAR(2000) ENCODE lzo
,seo_description VARCHAR(10000) ENCODE lzo
,seo_url VARCHAR(2000) ENCODE lzo
,seo_url_hash VARCHAR(2000) ENCODE lzo
,seo_image VARCHAR(2000) ENCODE lzo
,seo_price_amount BIGINT ENCODE az64
,seo_price_currency VARCHAR(2000) ENCODE lzo
,seo_product_band VARCHAR(2000) ENCODE lzo
,seo_product_availability VARCHAR(2000) ENCODE lzo
,seo_product_category VARCHAR(2000) ENCODE lzo
,seo_product_condition VARCHAR(2000) ENCODE lzo
,seo_product_retailer_item_id BIGINT ENCODE az64
,seo_product_robots VARCHAR(2000) ENCODE lzo
,createdat TIMESTAMP WITHOUT TIME ZONE ENCODE az64
,updatedat TIMESTAMP WITHOUT TIME ZONE ENCODE az64
,audit_createdat TIMESTAMP WITHOUT TIME ZONE DEFAULT getdate() ENCODE az64
)
DISTSTYLE AUTO
;

26
hasaki_crawler_engine/conf.json Executable file
View File

@ -0,0 +1,26 @@
{
"crawler_name": "raena_crawler_engine_hasaki",
"crawler_schema": "test_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker_hasaki",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
"review_producttags_tab": "rce_ratings_reviews_producttags",
"review_tags": "rce_tags",
"source_tab": "rce_source",
"seo_tab": "rce_seo",
"product_per_category": "1000",
"source_category": "11043145",
"db_user": "dbadmin",
"db_pass": "5qCif6eyY3Kmg4z",
"database": "analytics",
"db_host": "redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com",
"db_port": "5439",
"crawler_main": "1",
"crawler_slave_no": ""
}

View File

@ -0,0 +1,143 @@
import hashlib
import logging
import time
import psycopg2
import pandas as pd
from playwright.sync_api import sync_playwright
from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english
class HasakiCategories:
    """Crawls the Hasaki.vn category tree and persists it via hasaki_db_writer.

    Starting from the hard-coded "HEALTH - BEAUTY" root page, categories are
    collected up to four levels deep, de-duplicated by name (shallowest
    occurrence wins), and written to the configured category table.
    """

    def __init__(self, config):
        # NOTE(review): message says "HasakiSubCategories" but this class is
        # HasakiCategories — looks like a copy/paste slip; confirm before
        # relying on these log lines.
        logging.info("Initializing HasakiSubCategories")
        # Accumulates (depth, parent_name, name, url) tuples during the crawl.
        self.master_category = []
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                     password=self.config.get('db_pass'), host=self.config.get('db_host'),
                                     port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        # Resolve the numeric source id for 'Hasaki'; abort if the source table has no row.
        self.cur.execute(f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
        try:
            self.rce_source_id = self.cur.fetchone()[0]
        except:
            # NOTE(review): bare except — the intended case is "no row found"
            # (fetchone() returned None), but this also hides real errors.
            logging.info("Source tab is empty. Please check. Exiting.....")
            exit(1)
        self.db_writer = hasaki_db_writer(config)

    def __del__(self):
        # Best-effort finalizer; __del__ is not guaranteed to run at interpreter exit.
        print("Closing connection.....")
        self.conn.close()

    def start_processing(self):
        """Crawl the whole tree, de-duplicate by name, then write every category."""
        self.crawl_and_track("HEALTH - BEAUTY", "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html")
        df = pd.DataFrame(self.master_category, columns=['Index', 'Parent', 'Name', 'Link'])
        # Sort by depth so drop_duplicates keeps the shallowest occurrence of each name.
        df = df.sort_values('Index')
        df = df.drop_duplicates(subset='Name', keep='first')
        self.process_category(df)

    def process_category(self, category):
        """Write each crawled category row (a DataFrame) via the shared db writer."""
        for index, row in category.iterrows():
            data = {}
            data['parent_category_id'] = 0
            data['rce_source_id'] = self.rce_source_id
            data['rce_source_category_id'] = 0
            data['rce_source_status'] = 1
            # Single quotes are stripped because these values are interpolated
            # into SQL strings downstream in hasaki_db_writer.
            data['category_name'] = str(row["Name"]).replace("'","")
            data['category_page_url'] = row["Link"]
            data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
            data['category_parent_name'] = str(row["Parent"]).replace("'","")
            self.db_writer.rce_category(data)

    def crawl_and_track(self, parent, url_to_visit):
        """Walk the category tree, hard-coded to at most four nesting levels.

        Every discovered category is appended to self.master_category as
        (depth, parent_name, name, url). A 10-second sleep after each page
        throttles requests to the site.
        """
        self.master_category.append((0,"0", parent, url_to_visit))
        print(self.master_category)
        cats = self.crawl_categories(parent, url_to_visit)
        time.sleep(10)
        if cats:
            for cat in cats:
                # cat is (parent, name, url); prepend the depth level.
                self.master_category.append((1,)+(cat))
                print((1,)+(cat))
                sub_cats1 = self.crawl_categories(cat[1], cat[2])
                time.sleep(10)
                if sub_cats1:
                    for sub_cat1 in sub_cats1:
                        self.master_category.append((2,) + (sub_cat1))
                        print((2,) + (sub_cat1))
                        sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
                        time.sleep(10)
                        if sub_cats2:
                            for sub_cat2 in sub_cats2:
                                self.master_category.append((3,) + (sub_cat2))
                                print((3,) + (sub_cat2))
                                sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
                                time.sleep(10)
                                if sub_cats3:
                                    for sub_cat3 in sub_cats3:
                                        self.master_category.append((4,) + (sub_cat3))
                                        print((4,) + (sub_cat3))

    def crawl_categories(self, parent, url_to_visit):
        """Scrape one category page's sidebar; return new (parent, english_name, url) triples.

        Returns None when the expected sidebar container is not on the page
        (callers treat None and [] the same via `if cats:`).
        """
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            # context = browser.new_context(
            #     viewport={"width": 375, "height": 667, "isMobile": True}
            # )
            page = browser.new_page()
            # Load the webpage
            page.goto(url_to_visit)
            # page.goto('https://hasaki.vn/danh-muc/my-pham-high-end-c1907.html')
            page.wait_for_load_state('load')
            container_element = page.query_selector('.block_colaps_sticky.width_common.collaps_sticky')
            if container_element:
                item_elements = container_element.query_selector_all('.item_fillter')
                content_elements = container_element.query_selector_all('.content_fillter')
                urls = []
                for item_element in item_elements:
                    text = item_element.query_selector('a').inner_text()
                    text = translate_text_to_english(text)
                    href = item_element.query_selector('a').get_attribute('href')
                    urls.append((parent, text, href))
                for content_element in content_elements:
                    text = content_element.query_selector('a').inner_text()
                    text = translate_text_to_english(text)
                    href = content_element.query_selector('a').get_attribute('href')
                    urls.append((parent, text, href))
                # removing previously collected data (master_category items store the url at index 3)
                master_urls = [item[3] for item in self.master_category]
                filtered_data = [(parent, name, url) for parent, name, url in urls if url not in master_urls]
                return filtered_data
            # NOTE(review): this close() only runs when no container was found —
            # the return above exits first; the sync_playwright context manager
            # tears the browser down in the successful case.
            browser.close()

View File

@ -0,0 +1,160 @@
import hashlib
import logging
import random
import time
import psycopg2
from playwright.sync_api import sync_playwright
from deep_translator import GoogleTranslator
from hasaki_db_writer import hasaki_db_writer
import pandas as pd
from Util import translate_text_to_english
class HasakiCategoryProducts:
    """For every stored Hasaki category, scrapes the product grid into the tracker table."""

    def __init__(self, config):
        logging.info("Initializing HasakiCategoryProducts........")
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                     password=self.config.get('db_pass'), host=self.config.get('db_host'),
                                     port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        # Resolve the numeric source id for 'Hasaki'; abort if the source table has no row.
        self.cur.execute(
            f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
        try:
            self.rce_source_id = self.cur.fetchone()[0]
        except:
            # NOTE(review): bare except — intended for "no row found", but it
            # also hides genuine database errors.
            logging.info("Source tab is empty. Please check. Exiting.....")
            exit(1)
        self.db_writer = hasaki_db_writer(config)

    def __del__(self):
        # Best-effort finalizer; __del__ is not guaranteed to run at interpreter exit.
        print("Closing connection.....")
        self.conn.close()

    def start_processing(self):
        """Iterate every category of this source and harvest its product listings."""
        logging.info("Starting crawler to collect category products.........")
        sql = f"""
        select * from {self.config.get('crawler_schema')}.{self.config.get('category_tab')}
        where rce_source_id = {self.rce_source_id} order by id
        """
        self.cur.execute(sql)
        categories = self.cur.fetchall()
        for category in categories:
            # Positional access assumes the category table's column order:
            # category[0]=id, category[5]=category_page_url, category[7]=category_name
            # — TODO confirm against the DDL.
            logging.info("================= Fetching Products for : {} ====================".format(str(category[7])))
            pages = self.get_pages(category[5])
            # Random delay to avoid hammering the site.
            time.sleep(random.randint(10,20))
            self.get_product_list(urls = pages, categoryId = category[0])

    def get_pages(self, url):
        """Return [url] plus the absolute URL of every extra pagination page.

        Best-effort: any scraping failure (e.g. no pagination bar) is swallowed
        and whatever was collected so far is returned.
        """
        pages = []
        pages.append(url)
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page()
                page.goto(url)
                page.wait_for_load_state('load')
                pagination = page.query_selector(".pagination.ul-pagination").query_selector_all(".change-page")
                # NOTE(review): the loop variable shadows the list it iterates
                # ("for pagination in pagination") — works, but rename on next touch.
                for pagination in pagination:
                    # Page 1 is the url we already have; collect the rest as absolute URLs.
                    if str(pagination.get_attribute('data-page')).strip() != "1":
                        new_url = str(pagination.get_attribute('href')).strip()
                        new_url = "https://hasaki.vn" + new_url
                        pages.append(new_url)
                browser.close()
        except Exception as e:
            pass
        finally:
            # Always hand back at least the original url.
            return pages

    def get_product_list(self,urls, categoryId):
        """Scrape each listing page and insert one tracker row per product tile.

        NOTE(review): values are interpolated straight into the INSERT below;
        safety relies on the .replace("'","") stripping — parameterized queries
        would be the robust fix.
        """
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page()
                page_count = 1
                logging.info("Found {} pages. Looping through URLS to get all products.".format(str(len(urls))))
                for url in urls:
                    logging.info("+++++++++++++ Loading page : {} +++++++++++++++++".format(str(page_count)))
                    page.goto(url)
                    page.wait_for_load_state('load')
                    container_element = page.query_selector('.ProductGrid__grid.width_common')
                    if container_element:
                        item_elements = container_element.query_selector_all('.ProductGridItem__itemOuter')
                        # item_count doubles as the product's rank within the page sequence.
                        item_count = 1
                        for item_element in item_elements:
                            try:
                                product_section = "Base Product Page " + str(page_count)
                                product_name = translate_text_to_english(str(item_element.query_selector('.width_common.name_sp.space_bottom_5').text_content()).strip().replace("'",""))
                                product_url = str(item_element.query_selector('.v3_thumb_common_sp.relative').get_attribute('href')).strip()
                                product_brand = translate_text_to_english(str(item_element.query_selector('.width_common.txt_color_1.space_bottom_3').text_content()).strip().replace("'",""))
                                product_rank = item_count
                                # Optional fields: image, gift text and sold count may be absent on a tile.
                                product_image = ""
                                try:
                                    product_image = str(item_element.query_selector('.v3_thumb_common_sp.relative').query_selector('.img_thumb.lazy.loaded').get_attribute('src')).strip().replace("'","")
                                except:
                                    pass
                                gift = ""
                                try:
                                    gift = translate_text_to_english(str(item_element.query_selector('.block_gift_list_item').text_content()).strip().replace("'",""))
                                except:
                                    pass
                                product_sold = 0
                                try:
                                    # Sold counts use '.' as thousands separator on the site.
                                    product_sold = int(str(item_element.query_selector('.item_count_by').text_content()).strip().replace('.',''))
                                except:
                                    pass
                                sql = f"""
                                insert into {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')}(crawler_name,product_section, product_name, product_url, product_image, product_sold, product_brand, gift, product_rank, categoryid)
                                values('{self.crawler_name}','{product_section}','{product_name.replace("'","")}','{product_url}','{product_image}',{product_sold},'{product_brand}','{gift}',{product_rank},{categoryId})
                                """
                                logging.info(sql)
                                self.cur.execute(sql)
                            except Exception as e:
                                # One bad tile should not abort the page.
                                print(e)
                            item_count += 1
                    # Random delay between listing pages.
                    time.sleep(random.randint(10,30))
                    page_count += 1
                browser.close()
        except Exception as e:
            print(e)

View File

@ -0,0 +1,47 @@
import logging
import json
import time
from hasaki_categories import HasakiCategories
from hasaki_category_products import HasakiCategoryProducts
from hasaki_product_info import HasakiProductInfo
##### Logger ######
# NOTE(review): `format` shadows the builtin of the same name; harmless at
# module scope here, but worth renaming if this grows.
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
# Populated from conf.json by the __main__ block before main() runs.
config = {}
def main():
    """Run the Hasaki crawl pipeline.

    The earlier stages — category discovery and category-product collection —
    are currently disabled (kept below for reference); only the product-info
    stage runs, using the module-level `config` loaded from conf.json.
    """
    # Disabled stage 1: crawl the category tree.
    # hasaki_categories = HasakiCategories(config)
    # hasaki_categories.start_processing()
    #
    # time.sleep(60)
    #
    # Disabled stage 2: collect products per category.
    # hasaki_category_products = HasakiCategoryProducts(config)
    # hasaki_category_products.start_processing()
    #
    # time.sleep(60)
    product_stage = HasakiProductInfo(config)
    product_stage.start_processing()
if __name__ == "__main__":
    logging.info("Starting Hasaki Crawler.......")
    try:
        logging.info("Loading config file.......")
        # Rebinds the module-level `config` dict consumed by main().
        with open("conf.json", "r") as jsonfile:
            config = json.load(jsonfile)
        logging.info("Config file loaded.......")
        print(config)
        main()
    except Exception as e:
        # Bug fix: the original logged "Error: ".format(e) — a format string
        # with no placeholder, so the exception text was silently dropped.
        logging.info("Error: {}".format(e))
        #logging.info("Cannot load config file. Please check. Exiting......")
        #send_mail()
        exit(1)

View File

@ -0,0 +1,754 @@
import logging
import psycopg2
###### Logger ######
# Configures the root logger at import time. NOTE: logging.basicConfig is a
# no-op if the root logger was already configured by an earlier import.
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
class hasaki_db_writer:
def __init__(self, config):
    """Open an autocommit psycopg2 connection from the crawler config dict.

    Expects config keys: database, db_user, db_pass, db_host, db_port.
    """
    self.config = config
    self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
    # autocommit: every execute() is committed immediately — no transactions
    # span the paired main-table/audit-table writes below.
    self.conn.autocommit = True
    self.cur = self.conn.cursor()
def __del__(self):
    """Finalizer: log and release the database connection opened in __init__."""
    connection = self.conn
    logging.info("Closing connection.....")
    connection.close()
def get_id(self, schema, table):
    """Return the next surrogate id for ``schema.table``.

    Computes max(id) + 1, or 1 when the table is empty.

    NOTE(review): read-then-insert is not concurrency-safe — two writers can
    observe the same max; fine for a single-process crawler.
    """
    sql = f"""
    select max(id) from {schema}.{table}
    """
    self.cur.execute(sql)
    current_max = self.cur.fetchone()[0]
    # `is not None` is the idiomatic null check (the original used `!= None`).
    return current_max + 1 if current_max is not None else 1
def get_aud_id(self, schema, table):
    """Return the next audit id for ``schema.table``.

    Computes max(auditid) + 1, or 1 when the audit table is empty.

    NOTE(review): same read-then-insert race as get_id — acceptable for a
    single-process crawler.
    """
    sql = f"""
    select max(auditid) from {schema}.{table}
    """
    self.cur.execute(sql)
    current_max = self.cur.fetchone()[0]
    # `is not None` is the idiomatic null check (the original used `!= None`).
    return current_max + 1 if current_max is not None else 1
def rce_category(self, data):
    """Upsert one category row and mirror every change into the aud_ table.

    Expected keys in ``data``: parent_category_id, rce_source_id,
    rce_source_category_id, rce_source_status, category_name,
    category_page_url, category_page_url_hash, category_parent_name.

    NOTE(review): values are interpolated directly into SQL strings; callers
    strip single quotes beforehand, but parameterized queries would be safer.
    """
    # Does a row already exist for this (category_name, rce_source_id)?
    sql = f"""
    select * from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
    """
    self.cur.execute(sql)
    res = self.cur.fetchone()
    # Candidate surrogate ids for the main and audit tables (max+1 scheme).
    id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('category_tab'))
    id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('category_tab'))
    if not res:
        # New category: insert the main row, then copy it into the audit table.
        sql = f"""
        insert into {self.config.get('crawler_schema')}.{self.config.get('category_tab')}(id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,category_parent_name)
        values({id_main},{data['parent_category_id']},{data['rce_source_id']},{data['rce_source_category_id']},{data['rce_source_status']},'{data['category_page_url']}','{data['category_page_url_hash']}','{data['category_name']}','{data['category_parent_name']}')
        """
        logging.info(sql)
        self.cur.execute(sql)
        sql = f"""
        insert into {self.config.get('crawler_schema')}.aud_{self.config.get('category_tab')}(auditid,id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name)
        select {id_aud},id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name from {self.config.get('crawler_schema')}.{self.config.get('category_tab')}
        where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
        """
        logging.info(sql)
        self.cur.execute(sql)
    else:
        # Row exists. Positional comparisons assume the category table column
        # order: res[0]=id, res[1]=parent_category_id, res[3]=rce_source_category_id,
        # res[5]=category_page_url, res[7]=category_name,
        # res[12]=category_parent_name — TODO confirm against the DDL.
        if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \
                str(data['category_page_url'])==str(res[5]) and str(data['category_parent_name'])==str(res[12]):
            # Nothing changed: only bump updatedat on the main and audit rows.
            sql = f"""
            update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set updatedat=GETDATE()
            where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)
            sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \
                  "from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0])
            logging.info(sql)
            self.cur.execute(sql)
        else:
            # Something changed: update the main row, then append a fresh audit snapshot.
            sql = f"""
            update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set parent_category_id={data['parent_category_id']}, rce_source_category_id = {data['rce_source_category_id']},
            category_name = '{data['category_name']}', category_page_url = '{data['category_page_url']}', category_page_url_hash = '{data['category_page_url_hash']}', category_parent_name = '{data['category_parent_name']}',
            updatedat=GETDATE() where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)
            sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(auditid,id,parent_category_id,rce_source_id," \
                  "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name) " \
                  "select "+str(id_aud)+", id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
                  "category_name,createdat,updatedat,category_parent_name from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
                  "where category_name = '"+ str(res[7])+"'"
            logging.info(sql)
            self.cur.execute(sql)
def rce_product(self, data):
    """Upsert one product row and mirror every change into the aud_ table.

    Keyed by (rce_source_product_id, rce_source_id). ``data`` must contain
    every column interpolated below (ids, urls, prices, ratings, etc.).

    NOTE(review): values are interpolated directly into SQL strings; safety
    relies on callers stripping single quotes — parameterized queries would
    be the robust fix.
    """
    # Does a row already exist for this (rce_source_product_id, rce_source_id)?
    sql = f"""
    select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
    where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
    """
    self.cur.execute(sql)
    res = self.cur.fetchone()
    # Candidate surrogate ids for the main and audit tables (max+1 scheme).
    id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('product_tab'))
    id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('product_tab'))
    if not res:
        # New product: insert the main row, then copy it into the audit table.
        sql = f"""
        insert into {self.config.get('crawler_schema')}.{self.config.get('product_tab')}(id,rce_source_product_id,rce_source_product_status,product_page_url,
        product_page_url_hash,rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,
        product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings,product_section,
        rce_source_id,countryoforigin,rank,ships_from) values({id_main},{data['rce_source_product_id']},{data['rce_source_product_status']},'{data['product_page_url']}',
        '{data['product_page_url_hash']}',{data['rce_category_id']},{data['rce_brand_id']},{data['rce_store_id']},'{data['rce_source_product_name']}','{data['product_images']}','{data['product_description']}',{data['product_sold_total']},{data['product_sold']},
        {data['product_price_min']},{data['product_price_min_before_discount']},{data['product_price_max']},{data['product_price_max_before_discount']},{data['ratings']},'{data['product_section']}',
        {data['rce_source_id']},'{data['countryoforigin']}',{data['rank']},'{data['ships_from']}')
        """
        logging.info(sql)
        self.cur.execute(sql)
        sql = f"""
        insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
        rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
        product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank)
        select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
        rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
        product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
        where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
        """
        logging.info(sql)
        self.cur.execute(sql)
    else:
        # Row exists. Positional comparisons assume the product table column
        # order (res[1]=rce_source_product_id ... res[24]=rank); note
        # product_description (res[10]) is deliberately NOT compared — TODO
        # confirm the index mapping against the DDL.
        if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \
                str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \
                str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \
                str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \
                str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \
                str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) \
                and str(data['ships_from'])==str(res[18]) and str(data['rce_source_id'])==str(res[21]) \
                and str(data['product_section'])==str(res[22]) and str(data['countryoforigin'])==str(res[23])\
                and str(data['rank'])==str(res[24]):
            # Nothing changed: only bump updatedat on the main and audit rows.
            sql = f"""
            update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set updatedat=GETDATE()
            where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)
            sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \
                  "from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0])
            logging.info(sql)
            self.cur.execute(sql)
        else:
            # Something changed: update the main row, then append a fresh audit snapshot.
            sql = f"""
            update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set rce_source_product_id = {data['rce_source_product_id']}, rce_source_product_status={data['rce_source_product_status']}, product_page_url='{data['product_page_url']}',
            product_page_url_hash='{data['product_page_url_hash']}', rce_category_id={data['rce_category_id']}, rce_brand_id={data['rce_brand_id']}, rce_store_id={data['rce_store_id']},
            rce_source_product_name='{data['rce_source_product_name']}', product_images='{data['product_images']}', product_description='{data['product_description']}', product_sold_total={data['product_sold_total']},
            product_sold={data['product_sold']}, product_price_min='{data['product_price_min']}',product_price_min_before_discount='{data['product_price_min_before_discount']}',
            product_price_max='{data['product_price_max']}', product_price_max_before_discount='{data['product_price_max_before_discount']}', ratings={data['ratings']},
            ships_from='{data['ships_from']}',product_section='{data['product_section']}',countryoforigin='{data['countryoforigin']}',rank={data['rank']}, updatedat=GETDATE()
            where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)
            sql = f"""
            insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
            rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
            product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank)
            select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
            rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
            product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
            where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)
def rce_product_variant(self, data):
    """Upsert one product-variant row and mirror it into the aud_ table.

    Keyed by (rce_source_variant_id, rce_product_id). Expected keys in
    ``data``: rce_source_variant_id, rce_product_id, product_variant_name,
    product_variant_price, product_variant_price_before_discount,
    product_variant_stock, product_variant_sku.

    Bug fix: the UPDATE branch previously interpolated product_variant_sku
    without quotes (the INSERT branch quotes it), which produced invalid SQL
    for any non-numeric SKU; it is now quoted consistently.

    NOTE(review): values are interpolated directly into SQL strings —
    parameterized queries would be the robust fix.
    """
    # Does a row already exist for this (rce_source_variant_id, rce_product_id)?
    sql = f"""
    select * from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where
    rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
    """
    self.cur.execute(sql)
    res = self.cur.fetchone()
    # Candidate surrogate ids for the main and audit tables (max+1 scheme).
    id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('variant_tab'))
    id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('variant_tab'))
    if not res:
        # New variant: insert the main row, then copy it into the audit table.
        sql = f"""
        insert into {self.config.get('crawler_schema')}.{self.config.get('variant_tab')}(id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku)
        values({id_main},{data['rce_source_variant_id']},{data['rce_product_id']},'{data['product_variant_name']}',{data['product_variant_price']},{data['product_variant_price_before_discount']},{data['product_variant_stock']},'{data['product_variant_sku']}')
        """
        logging.info(sql)
        self.cur.execute(sql)
        sql = f"""
        insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat)
        select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat
        from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
        """
        logging.info(sql)
        self.cur.execute(sql)
    else:
        # Row exists. Positional comparisons assume the variant table column
        # order: res[1]=rce_source_variant_id, res[2]=rce_product_id,
        # res[3]=name, res[4]=price, res[5]=price_before_discount,
        # res[6]=stock, res[9]=sku — TODO confirm against the DDL.
        if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \
                str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6])\
                and str(data['product_variant_sku'])==str(res[9]):
            # Nothing changed: only bump updatedat on the main and audit rows.
            sql = f"""
            update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set updatedat=GETDATE()
            where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)
            sql = f"""
            update {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')} a set updatedat=b.updatedat
            from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} b where a.id=b.id and b.id = {res[0]}
            """
            logging.info(sql)
            self.cur.execute(sql)
        else:
            # Something changed: update the main row, then append a fresh audit snapshot.
            sql = f"""
            update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set rce_source_variant_id={data['rce_source_variant_id']},
            rce_product_id={data['rce_product_id']},product_variant_name='{data['product_variant_name']}',product_variant_price={data['product_variant_price']},
            product_variant_price_before_discount={data['product_variant_price_before_discount']},product_variant_stock={data['product_variant_stock']},
            product_variant_sku='{data['product_variant_sku']}', updatedat=GETDATE()
            where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)
            sql = f"""
            insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat)
            select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat
            from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
            """
            logging.info(sql)
            self.cur.execute(sql)
def rce_brand(self, data):
    """Upsert one brand row and keep the aud_ (audit) table in sync.

    Branches:
      * no existing row -> insert the main row, then snapshot it into aud_.
      * row exists and all compared fields match -> only bump updatedat
        (mirrored onto the audit row).
      * row exists and something changed -> rewrite the main row, then
        append a fresh audit snapshot.

    data keys used: rce_source_id, rce_source_brand_id,
    rce_source_brand_status, brand_page_url, brand_page_url_hash,
    brand_name, brand_following, brand_rating.

    NOTE(review): values are interpolated directly into the SQL text, so
    callers are expected to have stripped single quotes beforehand (the
    Hasaki crawler does) — verify for any new call site.
    """
    # Brand identity is the pair (rce_source_brand_id, rce_source_id).
    sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where rce_source_brand_id = {data['rce_source_brand_id']}
            and rce_source_id = {data['rce_source_id']}
            """
    self.cur.execute(sql)
    res = self.cur.fetchone()
    # Pre-allocate surrogate ids for a potential insert into the main and
    # audit tables.
    id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('brand_tab'))
    id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('brand_tab'))
    if not res:
        sql = f"""
                insert into {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}(id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating)
                values({id_main},{data['rce_source_id']},{data['rce_source_brand_id']},{data['rce_source_brand_status']},'{data['brand_page_url']}','{data['brand_page_url_hash']}','{data['brand_name']}',{data['brand_following']},{data['brand_rating']})
                """
        logging.info(sql)
        self.cur.execute(sql)
        # Audit snapshot of the freshly inserted row.
        sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat)
                select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}
                where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
                """
        logging.info(sql)
        self.cur.execute(sql)
    else:
        # Stringified field-by-field comparison against the stored row.
        # brand_following and brand_rating are excluded from the
        # "unchanged" check — presumably deliberate (they churn often);
        # TODO confirm they should not trigger the update branch.
        if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \
                str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]) and str(data['rce_source_brand_id'])==str(res[2]):
            # Unchanged: touch updatedat on the main row, then copy that
            # timestamp onto the matching audit row.
            sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
                    """
            logging.info(sql)
            self.cur.execute(sql)
            sql = f"""
                    update {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')} a set updatedat=b.updatedat
                    from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} b where a.id=b.id and b.id = {res[0]} and
                    b.rce_source_id = {data['rce_source_id']}
                    """
            logging.info(sql)
            self.cur.execute(sql)
        else:
            # Changed: rewrite the main row in place ...
            sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set rce_source_id={data['rce_source_id']}, rce_source_brand_id={data['rce_source_brand_id']},
                    rce_source_brand_status={data['rce_source_brand_status']}, brand_page_url='{data['brand_page_url']}', brand_page_url_hash='{data['brand_page_url_hash']}',
                    brand_name='{data['brand_name']}', brand_following={data['brand_following']}, brand_rating={data['brand_rating']}, updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']}
                    and rce_source_id = {data['rce_source_id']}
                    """
            logging.info(sql)
            self.cur.execute(sql)
            # ... and append a new audit snapshot reflecting the new values.
            sql = f"""
                    insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat)
                    select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}
                    where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
                    """
            logging.info(sql)
            self.cur.execute(sql)
def rce_reseller(self, data):
data['reseller_name'] = data['reseller_name']
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')}(id,rce_source_id,rce_source_reseller_status,reseller_name)
values({id_main},'{data['rce_source_id']}','{data['rce_source_reseller_status']}','{data['reseller_name']}')
"""
#logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_tab')}(auditid,id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat)
select {id_aud}, id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')}
where reseller_name='{data['reseller_name']}'
"""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=GETDATE() " \
"where reseller_name = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
"rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \
"'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=GETDATE() where reseller_name = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (auditid,id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select "+str(id_aud)+", id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_reseller_store(self, data):
data['store_page_url'] = data['store_page_url'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_store_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_store_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')}(id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,rce_source_id)
values({id_main},'{data['rce_source_store_status']}','{data['store_page_url']}','{data['store_page_url_hash']}',{data['rce_reseller_id']},{data['rce_source_id']})
"""
#logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_store_tab')}(auditid,id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id)
select {id_aud}, id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id from {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')}
where store_page_url= '{data['store_page_url']}'
"""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \
str(data['store_page_url_hash'])==str(res[4]) and \
str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=GETDATE() " \
"where store_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \
"rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \
"'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \
"updatedat=GETDATE(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (auditid,id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select "+id_aud+", id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews(self, data):
sql = f"""
select * from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
"""
self.cur.execute(sql)
res = self.cur.fetchone()
data['username'] = data['username'].replace("'","")
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('review_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('review_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('review_tab')}(id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating)
values({id_main},{data['rce_product_id']},'{data['username']}','{data['review']}','{data['img_url']}',{data['review_like_count']},'{data['user_tier']}',{data['shop_id']},'{data['video_url']}',{data['rating']})
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat)
select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
"""
logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \
str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \
str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=GETDATE() " \
"where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'"
logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0])
logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \
"username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \
"'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \
"shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=GETDATE() " \
"where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'"
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat)
select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
"""
logging.info(sql)
self.cur.execute(sql)
def rce_seo(self, data):
    """Upsert one SEO-metadata row and keep the aud_ (audit) table in sync.

    Matching key is (rce_product_id, rce_source_id). Branches: insert when
    missing, touch updatedat when every compared field matches, rewrite +
    audit snapshot when changed.

    data keys used: rce_product_id, rce_source_id, seo_title,
    seo_description, seo_url, seo_url_hash, seo_image, seo_price_amount,
    seo_price_currency, seo_product_band, seo_product_availability,
    seo_product_category, seo_product_condition,
    seo_product_retailer_item_id, seo_product_robots.

    NOTE(review): values are interpolated directly into the SQL text, so
    callers must pre-strip single quotes — verify for new call sites.
    """
    sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
            where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
            """
    self.cur.execute(sql)
    res = self.cur.fetchone()
    # Pre-allocate surrogate ids for a potential insert into the main and
    # audit tables.
    id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('seo_tab'))
    id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('seo_tab'))
    if not res:
        # New row: insert it, then snapshot it into aud_.
        sql = f"""
                insert into {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}(id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots)
                values({id_main},{data['rce_product_id']},{data['rce_source_id']},'{data['seo_title']}','{data['seo_description']}','{data['seo_url']}','{data['seo_url_hash']}','{data['seo_image']}',{data['seo_price_amount']},'{data['seo_price_currency']}','{data['seo_product_band']}','{data['seo_product_availability']}','{data['seo_product_category']}',
                '{data['seo_product_condition']}',{data['seo_product_retailer_item_id']},'{data['seo_product_robots']}')
                """
        logging.info(sql)
        self.cur.execute(sql)
        sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat)
                select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
                where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
                """
        logging.info(sql)
        self.cur.execute(sql)
    else:
        # Stringified field-by-field comparison against the stored row.
        if (str(data['rce_product_id']) == str(res[1]) and str(data['rce_source_id']) == str(res[2]) and str(data['seo_title']) == str(res[3]) and \
                str(data['seo_description']) == str(res[4]) and str(data['seo_url']) == str(res[5]) and str(data['seo_url_hash']) == str(res[6]) and \
                str(data['seo_image']) == str(res[7]) and str(data['seo_price_amount']) == str(res[8]) and str(data['seo_price_currency']) == str(res[9]) and \
                str(data['seo_product_band']) == str(res[10])) and str(data['seo_product_availability']) == str(res[11]) and str(data['seo_product_category']) == str(res[12]) and \
                str(data['seo_product_condition']) == str(res[13]) and str(data['seo_product_retailer_item_id']) == str(res[14]) and str(data['seo_product_robots']) == str(res[15]):
            # Unchanged: bump updatedat, mirror the timestamp onto aud_.
            sql = "update " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " set updatedat=GETDATE() " \
                  "where rce_product_id = " + str(res[1]) + " and rce_source_id =" + str(data['rce_source_id'])
            logging.info(sql)
            self.cur.execute(sql)
            sql = "update " + self.config.get('crawler_schema') + ".aud_" + self.config.get('seo_tab') + " a set updatedat=b.updatedat " \
                  "from " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " b where a.id=b.id and b.id = " + str(res[0])
            logging.info(sql)
            self.cur.execute(sql)
        else:
            # Changed: rewrite the main row, then add a new audit snapshot.
            sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} set rce_product_id={data['rce_product_id']}, rce_source_id={data['rce_source_id']}, seo_title='{data['seo_title']}', seo_description='{data['seo_description']}',
                    seo_url='{data['seo_url']}', seo_url_hash='{data['seo_url_hash']}', seo_image='{data['seo_image']}', seo_price_amount='{data['seo_price_amount']}', seo_price_currency='{data['seo_price_currency']}', seo_product_band='{data['seo_product_band']}',
                    seo_product_availability='{data['seo_product_availability']}', seo_product_category='{data['seo_product_category']}', seo_product_condition='{data['seo_product_condition']}', seo_product_retailer_item_id={data['seo_product_retailer_item_id']},
                    seo_product_robots='{data['seo_product_robots']}' where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
                    """
            logging.info(sql)
            self.cur.execute(sql)
            sql = f"""
                    insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat)
                    select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
                    where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
                    """
            logging.info(sql)
            self.cur.execute(sql)
# def rce_ratings_reviews_productmodels(self,data):
#
# sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id'])
# self.cur.execute(sql)
# res = self.cur.fetchone()
#
#
# if not res:
#
# sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \
# "values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
# "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+""
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# else:
#
# if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]):
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=GETDATE() " \
# "where rce_rating_id = "+ str(res[1])
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \
# "from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0])
# #logging.info(sql)
# self.cur.execute(sql)
# else:
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \
# "updatedat=GETDATE() where rce_source_store_id = "+ str(res[1])
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
# "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+""
# #logging.info(sql)
#
# self.cur.execute(sql)
#
#
# def rce_tags(self,data):
#
# sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'"
# self.cur.execute(sql)
# res = self.cur.fetchone()
#
#
# if not res:
#
# sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \
# "values("+str(data['id'])+",'"+str(data['description'])+"')"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
# "createdat,updatedat) select id,description,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# else:
#
# if str(data['description'])==str(res[1]):
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=GETDATE() " \
# "where description = '"+ str(res[1])+"'"
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \
# "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
# #logging.info(sql)
# self.cur.execute(sql)
# else:
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \
# "updatedat=GETDATE() where description = "+ str(res[1])
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
# "createdat,updatedat) select id,description,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
#
# def rce_ratings_reviews_producttags(self,data):
#
# sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'"
# self.cur.execute(sql)
# res = self.cur.fetchone()
#
#
# if not res:
#
# sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \
# "values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
# "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# else:
#
# if str(data['rce_rating_id'])==str(res[1]):
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=GETDATE() " \
# "where rce_rating_id = '"+ str(res[1])+"'"
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \
# "from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
# #logging.info(sql)
# self.cur.execute(sql)
# else:
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \
# "updatedat=GETDATE() where rce_rating_id = "+ str(res[1])
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
# "createdat,updatedat) select id,description,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
#

# ======================================================================
# New file below: hasaki_product_info.py (454 lines) — Hasaki product
# detail crawler. ("View File" / "@ -0,0 +1,454 @@" were git-diff
# artifacts from the commit view, not source code.)
# ======================================================================
import hashlib
import logging
import random
import string
import time
import re
import psycopg2
from playwright.sync_api import sync_playwright
from deep_translator import GoogleTranslator
from hasaki_db_writer import hasaki_db_writer
import pandas as pd
from bs4 import BeautifulSoup
from Util import translate_text_to_english
class HasakiProductInfo:
def __init__(self, config):
    """Initialize the Hasaki product-info crawler.

    Opens a PostgreSQL connection from the supplied config, resolves the
    numeric source id for 'Hasaki' from the source table, and prepares
    the db writer used to persist scraped records.

    :param config: mapping providing crawler_name, product_per_category,
        database, db_user, db_pass, db_host, db_port, crawler_schema,
        source_tab (plus the keys consumed by hasaki_db_writer).
    """
    logging.info("Initializing HasakiProductInfo")
    # Regex character class matching any ASCII punctuation; used later to
    # strip punctuation from names/descriptions before translation.
    self.pattern = r'[' + string.punctuation + ']'
    self.config = config
    self.crawler_name = self.config.get("crawler_name")
    self.product_limit = int(self.config.get("product_per_category"))
    self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                 password=self.config.get('db_pass'), host=self.config.get('db_host'),
                                 port=self.config.get('db_port'))
    # autocommit so every upsert is persisted immediately (no explicit
    # transaction management anywhere in this crawler).
    self.conn.autocommit = True
    self.cur = self.conn.cursor()
    self.cur.execute(
        f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
    try:
        self.rce_source_id = self.cur.fetchone()[0]
    except:
        # fetchone() returns None when the 'Hasaki' row is missing, so the
        # [0] subscript raises TypeError; treat that as a fatal misconfig.
        logging.info("Source tab is empty. Please check. Exiting.....")
        exit(1)
    self.db_writer = hasaki_db_writer(config)
def __del__(self):
    """Close the database connection when the crawler is garbage-collected.

    Must never raise: __init__ can fail before self.conn is assigned, and
    __del__ may also run during interpreter shutdown, so any exception
    here would surface as an ignored-error warning at best.
    """
    print("Closing connection.....")
    try:
        self.conn.close()
    except Exception:
        # Missing attribute (partial __init__) or already-closed/broken
        # connection — nothing useful to do in a destructor.
        pass
def start_processing(self):
logging.info("Starting to collect product info from Hasaki........")
logging.info("Fetching product list from DB......")
sql = f"""
select * from {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} where flag = 0
order by categoryid, product_section, product_rank
"""
self.cur.execute(sql)
rows = self.cur.fetchall()
logging.info("Found {} products.......".format(str(len(rows))))
cnt = 1
for row in rows:
logging.info("========= Fetching product info {}/{}: {} =========".format(str(cnt),str(len(rows)),row[3]))
try:
self.get_product_info(row)
except:
pass
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1
where categoryid={row[9]}, product_section='{row[1]}', product_rank={row[8]}, product_url='{row[3]}'
"""
self.cur.execute(sql)
cnt += 1
def get_product_info(self, data):
    """Scrape one product and persist its info, ratings, and SEO data.

    :param data: tracker row tuple; data[3] is the product page URL.
    """
    raw_data = self.get_raw_product_data(data[3])
    # Was a stray debug print(); keep the payload visible at debug level
    # without spamming stdout.
    logging.debug(raw_data)
    if raw_data:
        self.product_info(data, raw_data)
        self.rating_info(raw_data)
        self.seo_info(raw_data)
def get_raw_product_data(self, url):
    """Load a Hasaki product page and return the detail-API JSON payload.

    Drives a headless Chromium with a mobile (iPhone) user agent so the
    site serves its wap/v2 product-detail XHR, then captures and parses
    that response.

    :param url: product page URL to navigate to.
    :return: parsed JSON body of the **/wap/v2/product/detail** response.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
            page = context.new_page()
            # BUG FIX: expect_response() must be entered BEFORE the
            # navigation that triggers the request; the original called
            # page.goto(url) first, so a detail response arriving during
            # navigation was missed and the wait could time out.
            with page.expect_response("**/wap/v2/product/detail**") as response:
                page.goto(url)
            api_requests = response.value.json()
        finally:
            # Always release the browser, even if capture/parsing fails.
            browser.close()
    return api_requests
def product_info(self, data, raw_data):
#region rce_brand
data_brand = {}
data_brand['rce_source_id'] = self.rce_source_id
data_brand['rce_source_brand_status'] = 1
data_brand['rce_source_brand_id'] = 0
data_brand['brand_page_url'] = ""
data_brand['brand_page_url_hash'] = ""
data_brand['brand_name'] = ""
data_brand['brand_following'] = ""
data_brand['brand_rating'] = ""
try:
data_brand['rce_source_brand_id'] = raw_data['brand']['id']
try:
data_brand['brand_page_url'] = "https://hasaki.vn/" + raw_data['brand']['url'] + ".html"
data_brand['brand_page_url'] = str(data_brand['brand_page_url']).replace("'","")
data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest()
except:
pass
try:
data_brand['brand_name'] = translate_text_to_english(str(raw_data['brand']['name']).replace("'",""))
except:
pass
try:
data_brand['brand_following'] = raw_data['brand']['following']
except:
pass
try:
data_brand['brand_rating'] = raw_data['brand']['rating']
except:
pass
try:
self.db_writer.rce_brand(data_brand)
except Exception as e:
logging.info(e)
except:
pass
#endregion
#region rce_product
data_product = {}
try:
data_product['rce_source_product_id'] = raw_data['id']
data_product['rce_source_id'] = self.rce_source_id
data_product['rce_source_product_status'] = 1
data_product['product_page_url'] = str(raw_data['url']).replace("'","")
data_product['product_page_url_hash'] = hashlib.md5(data_product['product_page_url'].encode('utf-8')).hexdigest()
data_product['rce_category_id'] = data[9]
data_product['rce_store_id'] = 0
data_product['rce_source_product_name'] = str(raw_data['name']) + str(raw_data['alt_name'])
data_product['rce_source_product_name'] = translate_text_to_english(str(re.sub(self.pattern, '', data_product['rce_source_product_name'])))
data_product['rce_source_product_name'] = str(data_product['rce_source_product_name']).replace("'", "")
data_product['product_images'] = data[4]
data_product['product_description'] = ""
try:
description_raw = raw_data['description']
soup = BeautifulSoup(description_raw, 'html.parser')
data_product['product_description'] = translate_text_to_english(re.sub(self.pattern, '',soup.get_text()).replace("'",""))
data_product['product_description'] = str(data_product['product_description']).replace("'","")
except:
pass
data_product['rce_brand_id'] = ""
try:
sql = f"""
select id from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where
rce_source_id = {self.rce_source_id} and rce_source_brand_id = {raw_data['brand']['id']}
"""
self.cur.execute(sql)
res = self.cur.fetchone()
data_product['rce_brand_id'] = res[0]
except:
pass
data_product['product_sold_total'] = 0
data_product['product_sold'] = 0
try:
data_product['product_sold'] = raw_data['bought']
except:
pass
data_product['product_price_min'] = 0
data_product['product_price_max'] = 0
try:
data_product['product_price_min'] = raw_data['int_final_price']
data_product['product_price_max'] = raw_data['int_final_price']
except:
pass
data_product['product_price_min_before_discount'] = 0
data_product['product_price_max_before_discount'] = 0
try:
data_product['product_price_min_before_discount'] = raw_data['price']
data_product['product_price_max_before_discount'] = raw_data['price']
except:
pass
data_product['ratings'] = 0.0
try:
data_product['ratings'] = raw_data['rating']['avg_rate']
except:
pass
data_product['ships_from'] = ""
data_product['product_section'] = data[1]
data_product['countryoforigin'] = ""
data_product['rank'] = data[8]
try:
self.db_writer.rce_product(data_product)
except Exception as e:
logging.info(e)
#region rce_product_variant
variant_items = raw_data['attribute']['items']
df_variant = pd.DataFrame({}, columns=['product_variant_name', 'rce_source_variant_id', 'product_variant_price',
'product_variant_stock', 'product_variant_sku'])
data_variant = {}
for variant in variant_items:
for item in variant['options']:
data_variant['product_variant_name'] = item['long_label']
for product in item['products']:
data_variant['rce_source_variant_id'] = product['id']
data_variant['product_variant_price'] = product['price']
data_variant['product_variant_stock'] = product['quantity']
data_variant['product_variant_sku'] = product['sku']
# variants_arr.append(data_variant)
tmp = pd.DataFrame([[data_variant['product_variant_name'],
data_variant['rce_source_variant_id'],
data_variant['product_variant_price'],
data_variant['product_variant_stock'],
data_variant['product_variant_sku']]],
columns=['product_variant_name', 'rce_source_variant_id',
'product_variant_price',
'product_variant_stock', 'product_variant_sku'])
df_variant = pd.concat([df_variant, tmp])
df_variant_merged = df_variant.groupby('product_variant_sku').agg({
'product_variant_name': ' '.join,
'rce_source_variant_id': 'first',
'product_variant_price': 'first',
'product_variant_stock': 'first'
}).reset_index()
#print(df_variant_merged.to_string())
for index, row in df_variant_merged.iterrows():
try:
data_variant = {}
data_variant['rce_source_variant_id'] = row['rce_source_variant_id']
data_variant['product_variant_name'] = translate_text_to_english(row['product_variant_name'])
data_variant['product_variant_name'] = re.sub(self.pattern, '', data_variant['product_variant_name']).replace("'","")
data_variant['product_variant_price'] = row['product_variant_price']
data_variant['product_variant_price_before_discount'] = 0
data_variant['product_variant_stock'] = row['product_variant_stock']
data_variant['product_variant_sku'] = row['product_variant_sku']
data_variant['rce_product_id'] = ""
sql = f"""
select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where
rce_source_product_id = {data_product['rce_source_product_id']} and rce_source_id = {data_product['rce_source_id']}
"""
self.cur.execute(sql)
data_variant['rce_product_id'] = self.cur.fetchone()[0]
try:
self.db_writer.rce_product_variant(data_variant)
except Exception as e:
logging.info(e)
except:
pass
#endregion
except:
pass
#endregion
def rating_info(self, raw_data):
try:
reviews1 = []
reviews2 = []
try:
reviews1 = raw_data['short_rating_data']['image_reviews']
except:
pass
try:
reviews2 = raw_data['short_rating_data']['reviews']
except:
pass
reviews = reviews1 + reviews2
for review in reviews:
data_review = {}
data_review["rce_product_id"] = ""
data_review["username"] = ""
data_review["review"] = ""
data_review["img_url"] = ""
data_review["review_like_count"] = 0
data_review["user_tier"] = ""
data_review["shop_id"] = 0
data_review["video_url"] = ""
data_review["rating"] = ""
sql = f"""
select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where
rce_source_product_id = {raw_data['id']} and rce_source_id = {self.rce_source_id}
"""
self.cur.execute(sql)
data_review["rce_product_id"] = self.cur.fetchone()[0]
try:
data_review["username"] = str(review['user_fullname']).replace("'", "")
except:
pass
try:
data_review["review"] = translate_text_to_english(review['content']).replace("'", "")
except:
pass
try:
data_review["rating"] = review['rating']['star']
except:
pass
try:
self.db_writer.rce_ratings_reviews(data_review)
except Exception as e:
logging.info(e)
except Exception as e:
logging.info(e)
def seo_info(self, raw_data):
try:
data_seo = {}
data_seo['rce_product_id'] = 0
data_seo['rce_source_id'] = self.rce_source_id
data_seo['seo_title'] = ""
data_seo['seo_description'] = ""
data_seo['seo_url'] = ""
data_seo['seo_url_hash'] = ""
data_seo['seo_image'] = ""
data_seo['seo_price_amount'] = 0
data_seo['seo_price_currency'] = ""
data_seo['seo_product_band'] = ""
data_seo['seo_product_availability'] = ""
data_seo['seo_product_category'] = ""
data_seo['seo_product_condition'] = ""
data_seo['seo_product_retailer_item_id'] = 0
data_seo['seo_product_robots'] = ""
sql = f"""
select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where
rce_source_product_id = {raw_data['id']} and rce_source_id = {self.rce_source_id}
"""
self.cur.execute(sql)
data_seo['rce_product_id'] = self.cur.fetchone()[0]
try: data_seo['seo_title'] = translate_text_to_english(raw_data['seo']['og:title']).replace("'","")
except: pass
try: data_seo['seo_description'] = translate_text_to_english(raw_data['seo']['og:description']).replace("'","")
except: pass
try: data_seo['seo_url'] = str(raw_data['seo']['og:url']).replace("'","")
except: pass
try: data_seo['seo_image'] = str(raw_data['seo']['og:image']).replace("'","")
except: pass
try: data_seo['seo_price_amount'] = raw_data['seo']['price:amount']
except: pass
try: data_seo['seo_price_currency'] = str(raw_data['seo']['price:currency']).replace("'","")
except: pass
try: data_seo['seo_product_band'] = translate_text_to_english(raw_data['seo']['product:band']).replace("'","")
except: pass
try: data_seo['seo_product_availability'] = str(raw_data['seo']['product:availability']).replace("'","")
except: pass
try: data_seo['seo_product_category'] = translate_text_to_english(raw_data['seo']['product:category']).replace("'","")
except: pass
try: data_seo['seo_product_condition'] = translate_text_to_english(raw_data['seo']['product:condition']).replace("'","")
except: pass
try: data_seo['seo_product_retailer_item_id'] = raw_data['seo']['product:retailer_item_id']
except: pass
try: data_seo['seo_product_robots'] = raw_data['seo']['product:robots']
except: pass
try:
self.db_writer.rce_seo(data_seo)
except Exception as e:
logging.info(e)
except:
pass

View File

@ -0,0 +1,63 @@
import time
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
import pandas as pd
# Scratch/prototype script: open a Hasaki product-detail page with a mobile
# user agent, capture the JSON body of the site's product-detail API call,
# and flatten the variant attribute tree into a pandas DataFrame.
# NOTE(review): `time` and `BeautifulSoup` are imported but unused here.
# Launch the Playwright browser in mobile mode
with sync_playwright() as p:
    # headless=False keeps the browser window visible for debugging.
    browser = p.chromium.launch(headless=False)
    # iPhone user agent so the site serves its WAP/mobile endpoints.
    context = browser.new_context(user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
    page = context.new_page()
    page.goto("https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi-mat-l-oreal-3-in-1-danh-cho-da-dau-da-hon-hop-400ml-19325.html")
    page.wait_for_load_state('load')
    #time.sleep(10)
    # Capture the underlying API request URL
    #api_requests = page.evaluate('''() => window.fetch('https://hasaki.vn/wap/v2/product/detail').then(response => response.json())''')
    #print(api_requests)
    # NOTE(review): expect_response() is entered only after goto()/wait_for_load_state(),
    # so the matching response may already have come and gone; the action that
    # triggers the request normally belongs inside this "with" block -- confirm
    # against the Playwright docs.
    with page.expect_response("**/wap/v2/product/detail**") as response:
        # response.value presumably blocks until a matching response arrives,
        # then the JSON body is parsed -- verify.
        data = response.value.json()
    # Payload nests variants as attribute -> items -> options -> products.
    variant_items = data['attribute']['items']
    df = pd.DataFrame({}, columns=['product_variant_name','rce_source_variant_id','product_variant_price','product_variant_stock','product_variant_sku'])
    # One shared dict, overwritten per product; safe only because the values
    # are copied into `tmp` before the next iteration mutates it.
    data_variant = {}
    for variant in variant_items:
        for item in variant['options']:
            data_variant['product_variant_name'] = item['long_label']
            for product in item['products']:
                data_variant['rce_source_variant_id'] = product['id']
                data_variant['rce_product_id'] = ""
                data_variant['product_variant_price'] = product['price']
                data_variant['product_variant_price_before_discount'] = ""
                data_variant['product_variant_stock'] = product['quantity']
                data_variant['product_variant_sku'] = product['sku']
                #variants_arr.append(data_variant)
                # One DataFrame row per (option, product) pair, accumulated by
                # repeated concat.
                tmp = pd.DataFrame([[data_variant['product_variant_name'],data_variant['rce_source_variant_id'],data_variant['product_variant_price'],data_variant['product_variant_stock'],data_variant['product_variant_sku']]],
                                   columns=['product_variant_name', 'rce_source_variant_id', 'product_variant_price',
                                            'product_variant_stock', 'product_variant_sku'])
                df = pd.concat([df, tmp])
                print(data_variant)
    df = df.sort_values(by=['product_variant_sku'])
    print(df.to_string())
    print("======================================")
    # Collapse duplicate SKUs: join the option labels into one variant name,
    # keep the first id/price/stock seen for each SKU.
    merged_df = df.groupby('product_variant_sku').agg({
        'product_variant_name': ' '.join,
        'rce_source_variant_id': 'first',
        'product_variant_price': 'first',
        'product_variant_stock': 'first'
    }).reset_index()
    print(merged_df.to_string())
    # Close the browser
    browser.close()

View File

@ -0,0 +1,25 @@
import asyncio
from playwright.async_api import async_playwright
# Scratch/prototype script: load a Hasaki product page headlessly and print
# the JSON of the product-detail API request the page makes.
async def main():
    """Open one hard-coded product page and print its product-detail API JSON."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context()
        page = await context.new_page()
        # Enable request interception (the handler just passes the request through).
        # NOTE(review): in the async API route.continue_() returns a coroutine,
        # and this plain lambda does not await it -- confirm against the docs.
        await page.route('https://hasaki.vn/wap/v2/product/detail', lambda route: route.continue_())
        # Navigate to the website URL
        await page.goto('https://hasaki.vn/san-pham/nuoc-hoa-hong-khong-mui-klairs-danh-cho-da-nhay-cam-180ml-65994.html')
        # Wait for the API request to be made
        # NOTE(review): the listener is registered after goto() has resolved, so
        # the product-detail request may already have fired, making this wait
        # time out -- confirm.
        response = await page.wait_for_event('request', predicate=lambda req: 'v2/product/detail' in req.url)
        # NOTE(review): wait_for_event('request') yields a Request; in the async
        # API Request.response is a method (await response.response(), then
        # .json()) -- verify this line against the Playwright docs.
        json_response = await response.response.json()
        print(json_response)
        await browser.close()
asyncio.run(main())