added Hasaki crawler

parent e2568e7979
commit 45e6965679
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
    <data-source source="LOCAL" name="ProdRedshift" uuid="e343ac8d-80ff-44dc-8d8d-5d18fc755a70">
      <driver-ref>redshift</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>com.amazon.redshift.jdbc.Driver</jdbc-driver>
      <jdbc-url>jdbc:redshift://redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com:5439/analytics</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
    </data-source>
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="SqlDialectMappings">
    <file url="file://$PROJECT_DIR$/hasaki_crawler_engine/changes.sql" dialect="Redshift" />
  </component>
</project>
@@ -0,0 +1,24 @@
from deep_translator import GoogleTranslator


# def translate_text_to_english(text):
#     if text:
#         translated = GoogleTranslator(source='auto', target='en').translate(text)
#         return translated
#     return text
#

def translate_text_to_english(text):
    # Translate to English in chunks, since the translator rejects very long inputs.
    if text:
        chunk_size = 4800
        text_chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

        translated_chunks = []
        for chunk in text_chunks:
            translated_chunk = GoogleTranslator(source='auto', target='en').translate(chunk)
            translated_chunks.append(translated_chunk)

        translated_text = ' '.join(translated_chunks)

        return translated_text

    return text
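# Usage sketch (illustrative, not part of the original commit): the crawler modules call this helper
# on scraped Vietnamese text before storing it, e.g.
#   translate_text_to_english("Sức khỏe - Làm đẹp")  # -> English text such as "HEALTH - BEAUTY"
#   translate_text_to_english(None)                  # -> None (empty input falls through to `return text`)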
@@ -0,0 +1,78 @@
ALTER TABLE test_spider_management.rce_category ADD category_parent_name varchar(24000) NULL;
ALTER TABLE test_spider_management.aud_rce_category ADD category_parent_name varchar(24000) NULL;

CREATE TABLE IF NOT EXISTS test_spider_management.crawler_tracker_hasaki
(
    crawler_name VARCHAR(24000) ENCODE lzo
    ,product_section VARCHAR(24000) ENCODE lzo
    ,product_name VARCHAR(24000) ENCODE lzo
    ,product_url VARCHAR(24000) ENCODE lzo
    ,product_image VARCHAR(24000) ENCODE lzo
    ,product_sold INTEGER NOT NULL ENCODE az64
    ,product_brand VARCHAR(24000) ENCODE lzo
    ,gift VARCHAR(24000) ENCODE lzo
    ,product_rank INTEGER NOT NULL ENCODE az64
    ,categoryid INTEGER NOT NULL ENCODE az64
    ,flag SMALLINT DEFAULT 0 ENCODE az64
)
DISTSTYLE AUTO
;

ALTER TABLE test_spider_management.rce_brand ADD brand_following int8 NULL;
ALTER TABLE test_spider_management.rce_brand ADD brand_rating int8 NULL;
ALTER TABLE test_spider_management.aud_rce_brand ADD brand_following int8 NULL;
ALTER TABLE test_spider_management.aud_rce_brand ADD brand_rating int8 NULL;

ALTER TABLE test_spider_management.rce_product_variant ADD product_variant_sku varchar(1000) NULL;
ALTER TABLE test_spider_management.aud_rce_product_variant ADD product_variant_sku varchar(1000) NULL;

CREATE TABLE IF NOT EXISTS test_spider_management.rce_seo
(
    id INTEGER ENCODE az64
    ,rce_product_id INTEGER ENCODE az64
    ,rce_source_id INTEGER ENCODE az64
    ,seo_title VARCHAR(2000) ENCODE lzo
    ,seo_description VARCHAR(10000) ENCODE lzo
    ,seo_url VARCHAR(2000) ENCODE lzo
    ,seo_url_hash VARCHAR(2000) ENCODE lzo
    ,seo_image VARCHAR(2000) ENCODE lzo
    ,seo_price_amount BIGINT ENCODE az64
    ,seo_price_currency VARCHAR(2000) ENCODE lzo
    ,seo_product_band VARCHAR(2000) ENCODE lzo
    ,seo_product_availability VARCHAR(2000) ENCODE lzo
    ,seo_product_category VARCHAR(2000) ENCODE lzo
    ,seo_product_condition VARCHAR(2000) ENCODE lzo
    ,seo_product_retailer_item_id BIGINT ENCODE az64
    ,seo_product_robots VARCHAR(2000) ENCODE lzo
    ,createdat TIMESTAMP WITHOUT TIME ZONE DEFAULT getdate() ENCODE az64
    ,updatedat TIMESTAMP WITHOUT TIME ZONE DEFAULT getdate() ENCODE az64
)
DISTSTYLE AUTO
;

CREATE TABLE IF NOT EXISTS test_spider_management.aud_rce_seo
(
    auditid INTEGER ENCODE az64
    ,id INTEGER ENCODE az64
    ,rce_product_id INTEGER ENCODE az64
    ,rce_source_id INTEGER ENCODE az64
    ,seo_title VARCHAR(2000) ENCODE lzo
    ,seo_description VARCHAR(10000) ENCODE lzo
    ,seo_url VARCHAR(2000) ENCODE lzo
    ,seo_url_hash VARCHAR(2000) ENCODE lzo
    ,seo_image VARCHAR(2000) ENCODE lzo
    ,seo_price_amount BIGINT ENCODE az64
    ,seo_price_currency VARCHAR(2000) ENCODE lzo
    ,seo_product_band VARCHAR(2000) ENCODE lzo
    ,seo_product_availability VARCHAR(2000) ENCODE lzo
    ,seo_product_category VARCHAR(2000) ENCODE lzo
    ,seo_product_condition VARCHAR(2000) ENCODE lzo
    ,seo_product_retailer_item_id BIGINT ENCODE az64
    ,seo_product_robots VARCHAR(2000) ENCODE lzo
    ,createdat TIMESTAMP WITHOUT TIME ZONE ENCODE az64
    ,updatedat TIMESTAMP WITHOUT TIME ZONE ENCODE az64
    ,audit_createdat TIMESTAMP WITHOUT TIME ZONE DEFAULT getdate() ENCODE az64
)
DISTSTYLE AUTO
;
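-- Illustrative only (assumption, not part of the original commit): flag defaults to 0 so a downstream
-- step can pick up tracker rows that have not been processed yet, e.g.
--   SELECT product_url, categoryid FROM test_spider_management.crawler_tracker_hasaki WHERE flag = 0;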
@@ -0,0 +1,26 @@
{
    "crawler_name": "raena_crawler_engine_hasaki",
    "crawler_schema": "test_spider_management",
    "category_tab": "rce_category",
    "tracker_tab": "crawler_tracker_hasaki",
    "product_tab": "rce_product",
    "variant_tab": "rce_product_variant",
    "brand_tab": "rce_brand",
    "reseller_tab": "rce_reseller",
    "reseller_store_tab": "rce_reseller_store",
    "review_tab": "rce_ratings_reviews",
    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
    "review_producttags_tab": "rce_ratings_reviews_producttags",
    "review_tags": "rce_tags",
    "source_tab": "rce_source",
    "seo_tab": "rce_seo",
    "product_per_category": "1000",
    "source_category": "11043145",
    "db_user": "dbadmin",
    "db_pass": "5qCif6eyY3Kmg4z",
    "database": "analytics",
    "db_host": "redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com",
    "db_port": "5439",
    "crawler_main": "1",
    "crawler_slave_no": ""
}
@@ -0,0 +1,143 @@
import hashlib
import logging
import time
import psycopg2
import pandas as pd

from playwright.sync_api import sync_playwright
from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english


class HasakiCategories:
    def __init__(self, config):
        logging.info("Initializing HasakiCategories")
        self.master_category = []
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                     password=self.config.get('db_pass'), host=self.config.get('db_host'),
                                     port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.cur.execute(f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
        try:
            self.rce_source_id = self.cur.fetchone()[0]
        except Exception:
            logging.info("Source tab is empty. Please check. Exiting.....")
            exit(1)

        self.db_writer = hasaki_db_writer(config)

    def __del__(self):
        print("Closing connection.....")
        self.conn.close()

    def start_processing(self):
        # Walk the category tree starting from the top-level "HEALTH - BEAUTY" page.
        self.crawl_and_track("HEALTH - BEAUTY", "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html")

        df = pd.DataFrame(self.master_category, columns=['Index', 'Parent', 'Name', 'Link'])
        df = df.sort_values('Index')
        df = df.drop_duplicates(subset='Name', keep='first')

        self.process_category(df)

    def process_category(self, category):
        # Persist every collected category row through the shared DB writer.
        for index, row in category.iterrows():
            data = {}
            data['parent_category_id'] = 0
            data['rce_source_id'] = self.rce_source_id
            data['rce_source_category_id'] = 0
            data['rce_source_status'] = 1
            data['category_name'] = str(row["Name"]).replace("'","")
            data['category_page_url'] = row["Link"]
            data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
            data['category_parent_name'] = str(row["Parent"]).replace("'","")

            self.db_writer.rce_category(data)

    def crawl_and_track(self, parent, url_to_visit):
        # Collect categories up to four levels deep, pausing between page loads.
        self.master_category.append((0, "0", parent, url_to_visit))

        print(self.master_category)

        cats = self.crawl_categories(parent, url_to_visit)
        time.sleep(10)
        if cats:
            for cat in cats:
                self.master_category.append((1,) + cat)
                print((1,) + cat)

                sub_cats1 = self.crawl_categories(cat[1], cat[2])
                time.sleep(10)
                if sub_cats1:
                    for sub_cat1 in sub_cats1:
                        self.master_category.append((2,) + sub_cat1)
                        print((2,) + sub_cat1)

                        sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
                        time.sleep(10)
                        if sub_cats2:
                            for sub_cat2 in sub_cats2:
                                self.master_category.append((3,) + sub_cat2)
                                print((3,) + sub_cat2)

                                sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
                                time.sleep(10)
                                if sub_cats3:
                                    for sub_cat3 in sub_cats3:
                                        self.master_category.append((4,) + sub_cat3)
                                        print((4,) + sub_cat3)

    def crawl_categories(self, parent, url_to_visit):
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            # context = browser.new_context(
            #     viewport={"width": 375, "height": 667, "isMobile": True}
            # )
            page = browser.new_page()

            # Load the webpage
            page.goto(url_to_visit)
            # page.goto('https://hasaki.vn/danh-muc/my-pham-high-end-c1907.html')

            page.wait_for_load_state('load')

            container_element = page.query_selector('.block_colaps_sticky.width_common.collaps_sticky')

            filtered_data = []
            if container_element:
                item_elements = container_element.query_selector_all('.item_fillter')
                content_elements = container_element.query_selector_all('.content_fillter')

                urls = []

                for item_element in item_elements:
                    text = item_element.query_selector('a').inner_text()
                    text = translate_text_to_english(text)
                    href = item_element.query_selector('a').get_attribute('href')
                    urls.append((parent, text, href))

                for content_element in content_elements:
                    text = content_element.query_selector('a').inner_text()
                    text = translate_text_to_english(text)
                    href = content_element.query_selector('a').get_attribute('href')
                    urls.append((parent, text, href))

                # removing previously collected data
                master_urls = [item[3] for item in self.master_category]
                filtered_data = [(parent, name, url) for parent, name, url in urls if url not in master_urls]

            # Close the browser before returning.
            browser.close()

            return filtered_data
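# Shape of self.master_category after a crawl (illustrative): each entry is
# (depth, parent_name, category_name, category_url), e.g.
#   (0, "0", "HEALTH - BEAUTY", "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html")
#   (1, "HEALTH - BEAUTY", "<translated sub-category name>", "<sub-category url>")
# process_category() then writes one rce_category row per unique Name.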
@@ -0,0 +1,160 @@
import hashlib
import logging
import random
import time
import psycopg2
from playwright.sync_api import sync_playwright
from deep_translator import GoogleTranslator
from hasaki_db_writer import hasaki_db_writer
import pandas as pd
from Util import translate_text_to_english


class HasakiCategoryProducts:
    def __init__(self, config):
        logging.info("Initializing HasakiCategoryProducts........")
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                     password=self.config.get('db_pass'), host=self.config.get('db_host'),
                                     port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.cur.execute(
            f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
        try:
            self.rce_source_id = self.cur.fetchone()[0]
        except Exception:
            logging.info("Source tab is empty. Please check. Exiting.....")
            exit(1)

        self.db_writer = hasaki_db_writer(config)

    def __del__(self):
        print("Closing connection.....")
        self.conn.close()

    def start_processing(self):
        logging.info("Starting crawler to collect category products.........")

        sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('category_tab')}
            where rce_source_id = {self.rce_source_id} order by id
            """

        self.cur.execute(sql)
        categories = self.cur.fetchall()

        for category in categories:
            logging.info("================= Fetching Products for : {} ====================".format(str(category[7])))
            pages = self.get_pages(category[5])

            time.sleep(random.randint(10, 20))

            self.get_product_list(urls=pages, categoryId=category[0])

    def get_pages(self, url):
        # Collect the category URL plus the URLs of all its pagination pages.
        pages = []
        pages.append(url)

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)

                page = browser.new_page()
                page.goto(url)

                page.wait_for_load_state('load')

                pagination = page.query_selector(".pagination.ul-pagination").query_selector_all(".change-page")

                for page_link in pagination:
                    if str(page_link.get_attribute('data-page')).strip() != "1":
                        new_url = str(page_link.get_attribute('href')).strip()
                        new_url = "https://hasaki.vn" + new_url
                        pages.append(new_url)
                browser.close()
        except Exception as e:
            pass
        finally:
            return pages

    def get_product_list(self, urls, categoryId):
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)

                page = browser.new_page()

                page_count = 1

                logging.info("Found {} pages. Looping through URLS to get all products.".format(str(len(urls))))
                for url in urls:
                    logging.info("+++++++++++++ Loading page : {} +++++++++++++++++".format(str(page_count)))

                    page.goto(url)
                    page.wait_for_load_state('load')

                    container_element = page.query_selector('.ProductGrid__grid.width_common')
                    if container_element:
                        item_elements = container_element.query_selector_all('.ProductGridItem__itemOuter')
                        item_count = 1
                        for item_element in item_elements:
                            try:
                                product_section = "Base Product Page " + str(page_count)
                                product_name = translate_text_to_english(str(item_element.query_selector('.width_common.name_sp.space_bottom_5').text_content()).strip().replace("'",""))
                                product_url = str(item_element.query_selector('.v3_thumb_common_sp.relative').get_attribute('href')).strip()
                                product_brand = translate_text_to_english(str(item_element.query_selector('.width_common.txt_color_1.space_bottom_3').text_content()).strip().replace("'",""))
                                product_rank = item_count

                                product_image = ""
                                try:
                                    product_image = str(item_element.query_selector('.v3_thumb_common_sp.relative').query_selector('.img_thumb.lazy.loaded').get_attribute('src')).strip().replace("'","")
                                except Exception:
                                    pass

                                gift = ""
                                try:
                                    gift = translate_text_to_english(str(item_element.query_selector('.block_gift_list_item').text_content()).strip().replace("'",""))
                                except Exception:
                                    pass

                                product_sold = 0
                                try:
                                    product_sold = int(str(item_element.query_selector('.item_count_by').text_content()).strip().replace('.',''))
                                except Exception:
                                    pass

                                sql = f"""
                                    insert into {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')}(crawler_name,product_section, product_name, product_url, product_image, product_sold, product_brand, gift, product_rank, categoryid)
                                    values('{self.crawler_name}','{product_section}','{product_name.replace("'","")}','{product_url}','{product_image}',{product_sold},'{product_brand}','{gift}',{product_rank},{categoryId})
                                    """

                                logging.info(sql)

                                self.cur.execute(sql)

                            except Exception as e:
                                print(e)

                            item_count += 1

                    time.sleep(random.randint(10, 30))

                    page_count += 1

                browser.close()
        except Exception as e:
            print(e)
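# Flow recap (descriptive, not in the original commit): get_pages() returns the category landing URL
# plus one absolute URL per pagination link ("https://hasaki.vn" + href); get_product_list() then
# inserts one crawler_tracker_hasaki row per product tile, tagged with its page ("Base Product Page N")
# and its on-page rank, presumably for the product-info stage to consume.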
@@ -0,0 +1,47 @@
import logging
import json
import time

from hasaki_categories import HasakiCategories
from hasaki_category_products import HasakiCategoryProducts
from hasaki_product_info import HasakiProductInfo

##### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")

config = {}


def main():
    # hasaki_categories = HasakiCategories(config)
    # hasaki_categories.start_processing()
    #
    # time.sleep(60)
    #
    # hasaki_category_products = HasakiCategoryProducts(config)
    # hasaki_category_products.start_processing()
    #
    # time.sleep(60)

    hasaki_products = HasakiProductInfo(config)
    hasaki_products.start_processing()


if __name__ == "__main__":
    logging.info("Starting Hasaki Crawler.......")
    try:
        logging.info("Loading config file.......")
        with open("conf.json", "r") as jsonfile:
            config = json.load(jsonfile)
        logging.info("Config file loaded.......")
        print(config)

        main()

    except Exception as e:
        logging.info("Error: {}".format(e))
        # logging.info("Cannot load config file. Please check. Exiting......")
        # send_mail()
        exit(1)
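# Note (descriptive, not in the original commit): conf.json is opened with a relative path, so the
# script has to be launched from the directory that contains it; the commented-out blocks in main()
# suggest the category and category-product stages are run before the product-info stage.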
@@ -0,0 +1,754 @@
import logging
import psycopg2

###### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")


class hasaki_db_writer:
    def __init__(self, config):
        self.config = config
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

    def __del__(self):
        logging.info("Closing connection.....")
        self.conn.close()

    def get_id(self, schema, table):
        # Next primary key value: max(id) + 1, or 1 for an empty table.
        sql = f"""
            select max(id) from {schema}.{table}
            """
        self.cur.execute(sql)
        res = self.cur.fetchone()

        if res[0] is not None:
            id = res[0] + 1
        else:
            id = 1

        return id

    def get_aud_id(self, schema, table):
        # Next audit key value: max(auditid) + 1, or 1 for an empty audit table.
        sql = f"""
            select max(auditid) from {schema}.{table}
            """
        self.cur.execute(sql)
        res = self.cur.fetchone()

        if res[0] is not None:
            id = res[0] + 1
        else:
            id = 1

        return id

    def rce_category(self, data):
        sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
            """
        self.cur.execute(sql)
        res = self.cur.fetchone()

        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('category_tab'))
        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('category_tab'))

        if not res:
            sql = f"""
                insert into {self.config.get('crawler_schema')}.{self.config.get('category_tab')}(id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,category_parent_name)
                values({id_main},{data['parent_category_id']},{data['rce_source_id']},{data['rce_source_category_id']},{data['rce_source_status']},'{data['category_page_url']}','{data['category_page_url_hash']}','{data['category_name']}','{data['category_parent_name']}')
                """
            logging.info(sql)

            self.cur.execute(sql)

            sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('category_tab')}(auditid,id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name)
                select {id_aud},id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name from {self.config.get('crawler_schema')}.{self.config.get('category_tab')}
                where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
                """

            logging.info(sql)

            self.cur.execute(sql)

        else:
            if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \
                    str(data['category_page_url'])==str(res[5]) and str(data['category_parent_name'])==str(res[12]):
                # Unchanged category: only refresh the updatedat timestamps.
                sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set updatedat=GETDATE()
                    where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
                    """
                logging.info(sql)
                self.cur.execute(sql)

                sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \
                      "from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0])
                logging.info(sql)
                self.cur.execute(sql)
            else:
                # Changed category: update the main row and append a new audit row.
                sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set parent_category_id={data['parent_category_id']}, rce_source_category_id = {data['rce_source_category_id']},
                    category_name = '{data['category_name']}', category_page_url = '{data['category_page_url']}', category_page_url_hash = '{data['category_page_url_hash']}', category_parent_name = '{data['category_parent_name']}',
                    updatedat=GETDATE() where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
                    """
                logging.info(sql)
                self.cur.execute(sql)

                sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(auditid,id,parent_category_id,rce_source_id," \
                      "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name) " \
                      "select "+str(id_aud)+", id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
                      "category_name,createdat,updatedat,category_parent_name from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
                      "where category_name = '"+ str(res[7])+"'"
                logging.info(sql)

                self.cur.execute(sql)
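    # Expected shape of `data` for rce_category (as built in HasakiCategories.process_category):
    #   {'parent_category_id': 0, 'rce_source_id': <source id>, 'rce_source_category_id': 0,
    #    'rce_source_status': 1, 'category_name': ..., 'category_page_url': ...,
    #    'category_page_url_hash': <md5 of the url>, 'category_parent_name': ...}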

    def rce_product(self, data):
        # Insert a new product row or update the existing one, mirroring every change into the audit table.
        sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
            where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
            """
        self.cur.execute(sql)
        res = self.cur.fetchone()

        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('product_tab'))
        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('product_tab'))

        if not res:
            sql = f"""
                insert into {self.config.get('crawler_schema')}.{self.config.get('product_tab')}(id,rce_source_product_id,rce_source_product_status,product_page_url,
                product_page_url_hash,rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,
                product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings,product_section,
                rce_source_id,countryoforigin,rank,ships_from) values({id_main},{data['rce_source_product_id']},{data['rce_source_product_status']},'{data['product_page_url']}',
                '{data['product_page_url_hash']}',{data['rce_category_id']},{data['rce_brand_id']},{data['rce_store_id']},'{data['rce_source_product_name']}','{data['product_images']}','{data['product_description']}',{data['product_sold_total']},{data['product_sold']},
                {data['product_price_min']},{data['product_price_min_before_discount']},{data['product_price_max']},{data['product_price_max_before_discount']},{data['ratings']},'{data['product_section']}',
                {data['rce_source_id']},'{data['countryoforigin']}',{data['rank']},'{data['ships_from']}')
                """
            logging.info(sql)

            self.cur.execute(sql)

            sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
                rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
                product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank)
                select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
                rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
                product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
                where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
                """

            logging.info(sql)
            self.cur.execute(sql)
        else:

            if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \
                    str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \
                    str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \
                    str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \
                    str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \
                    str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) \
                    and str(data['ships_from'])==str(res[18]) and str(data['rce_source_id'])==str(res[21]) \
                    and str(data['product_section'])==str(res[22]) and str(data['countryoforigin'])==str(res[23]) \
                    and str(data['rank'])==str(res[24]):

                sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set updatedat=GETDATE()
                    where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
                    """
                logging.info(sql)
                self.cur.execute(sql)

                sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \
                      "from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0])
                logging.info(sql)
                self.cur.execute(sql)
            else:
                sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set rce_source_product_id = {data['rce_source_product_id']}, rce_source_product_status={data['rce_source_product_status']}, product_page_url='{data['product_page_url']}',
                    product_page_url_hash='{data['product_page_url_hash']}', rce_category_id={data['rce_category_id']}, rce_brand_id={data['rce_brand_id']}, rce_store_id={data['rce_store_id']},
                    rce_source_product_name='{data['rce_source_product_name']}', product_images='{data['product_images']}', product_description='{data['product_description']}', product_sold_total={data['product_sold_total']},
                    product_sold={data['product_sold']}, product_price_min='{data['product_price_min']}',product_price_min_before_discount='{data['product_price_min_before_discount']}',
                    product_price_max='{data['product_price_max']}', product_price_max_before_discount='{data['product_price_max_before_discount']}', ratings={data['ratings']},
                    ships_from='{data['ships_from']}',product_section='{data['product_section']}',countryoforigin='{data['countryoforigin']}',rank={data['rank']}, updatedat=GETDATE()
                    where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
                    """

                logging.info(sql)
                self.cur.execute(sql)

                sql = f"""
                    insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
                    rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
                    product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank)
                    select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
                    rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
                    product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
                    where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
                    """
                logging.info(sql)
                self.cur.execute(sql)

    def rce_product_variant(self, data):
        sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where
            rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
            """
        self.cur.execute(sql)
        res = self.cur.fetchone()

        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('variant_tab'))
        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('variant_tab'))

        if not res:

            sql = f"""
                insert into {self.config.get('crawler_schema')}.{self.config.get('variant_tab')}(id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku)
                values({id_main},{data['rce_source_variant_id']},{data['rce_product_id']},'{data['product_variant_name']}',{data['product_variant_price']},{data['product_variant_price_before_discount']},{data['product_variant_stock']},'{data['product_variant_sku']}')
                """

            logging.info(sql)

            self.cur.execute(sql)

            sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat)
                select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat
                from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
                """

            logging.info(sql)
            self.cur.execute(sql)

        else:
            if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \
                    str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6]) \
                    and str(data['product_variant_sku'])==str(res[9]):

                sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set updatedat=GETDATE()
                    where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
                    """
                logging.info(sql)
                self.cur.execute(sql)

                sql = f"""
                    update {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')} a set updatedat=b.updatedat
                    from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} b where a.id=b.id and b.id = {res[0]}
                    """

                logging.info(sql)
                self.cur.execute(sql)
            else:

                sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set rce_source_variant_id={data['rce_source_variant_id']},
                    rce_product_id={data['rce_product_id']},product_variant_name='{data['product_variant_name']}',product_variant_price={data['product_variant_price']},
                    product_variant_price_before_discount={data['product_variant_price_before_discount']},product_variant_stock={data['product_variant_stock']},
                    product_variant_sku='{data['product_variant_sku']}', updatedat=GETDATE()
                    where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
                    """

                logging.info(sql)
                self.cur.execute(sql)

                sql = f"""
                    insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat)
                    select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat
                    from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
                    """

                logging.info(sql)

                self.cur.execute(sql)

    def rce_brand(self, data):
        sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where rce_source_brand_id = {data['rce_source_brand_id']}
            and rce_source_id = {data['rce_source_id']}
            """
        self.cur.execute(sql)
        res = self.cur.fetchone()

        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('brand_tab'))
        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('brand_tab'))

        if not res:
            sql = f"""
                insert into {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}(id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating)
                values({id_main},{data['rce_source_id']},{data['rce_source_brand_id']},{data['rce_source_brand_status']},'{data['brand_page_url']}','{data['brand_page_url_hash']}','{data['brand_name']}',{data['brand_following']},{data['brand_rating']})
                """

            logging.info(sql)

            self.cur.execute(sql)

            sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat)
                select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}
                where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
                """
            logging.info(sql)

            self.cur.execute(sql)

        else:

            if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \
                    str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]) and str(data['rce_source_brand_id'])==str(res[2]):

                sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
                    """
                logging.info(sql)
                self.cur.execute(sql)

                sql = f"""
                    update {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')} a set updatedat=b.updatedat
                    from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} b where a.id=b.id and b.id = {res[0]} and
                    b.rce_source_id = {data['rce_source_id']}
                    """
                logging.info(sql)
                self.cur.execute(sql)
            else:
                sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set rce_source_id={data['rce_source_id']}, rce_source_brand_id={data['rce_source_brand_id']},
                    rce_source_brand_status={data['rce_source_brand_status']}, brand_page_url='{data['brand_page_url']}', brand_page_url_hash='{data['brand_page_url_hash']}',
                    brand_name='{data['brand_name']}', brand_following={data['brand_following']}, brand_rating={data['brand_rating']}, updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']}
                    and rce_source_id = {data['rce_source_id']}
                    """

                logging.info(sql)
                self.cur.execute(sql)

                sql = f"""
                    insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat)
                    select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}
                    where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
                    """

                logging.info(sql)

                self.cur.execute(sql)

    def rce_reseller(self, data):
        data['reseller_name'] = data['reseller_name']

        sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'"
        self.cur.execute(sql)
        res = self.cur.fetchone()

        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_tab'))
        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_tab'))

        if not res:

            sql = f"""
                insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')}(id,rce_source_id,rce_source_reseller_status,reseller_name)
                values({id_main},'{data['rce_source_id']}','{data['rce_source_reseller_status']}','{data['reseller_name']}')
                """
            #logging.info(sql)

            self.cur.execute(sql)

            sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_tab')}(auditid,id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat)
                select {id_aud}, id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')}
                where reseller_name='{data['reseller_name']}'
                """
            #logging.info(sql)

            self.cur.execute(sql)

        else:

            if str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]):

                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=GETDATE() " \
                      "where reseller_name = '"+ str(res[4])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \
                      "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0])
                #logging.info(sql)
                self.cur.execute(sql)
            else:

                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
                      "rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \
                      "'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=GETDATE() where reseller_name = '"+ str(res[4])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (auditid,id,rce_source_id,rce_source_reseller_status," \
                      "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select "+str(id_aud)+", id,rce_source_id,rce_source_reseller_status," \
                      "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
                      ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'"
                #logging.info(sql)

                self.cur.execute(sql)

    def rce_reseller_store(self, data):

        data['store_page_url'] = data['store_page_url'].replace("'","''")

        sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'"
        self.cur.execute(sql)
        res = self.cur.fetchone()

        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_store_tab'))
        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_store_tab'))

        if not res:

            sql = f"""
                insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')}(id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,rce_source_id)
                values({id_main},'{data['rce_source_store_status']}','{data['store_page_url']}','{data['store_page_url_hash']}',{data['rce_reseller_id']},{data['rce_source_id']})
                """
            #logging.info(sql)

            self.cur.execute(sql)

            sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_store_tab')}(auditid,id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id)
                select {id_aud}, id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id from {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')}
                where store_page_url= '{data['store_page_url']}'
                """
            #logging.info(sql)

            self.cur.execute(sql)

        else:

            if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \
                    str(data['store_page_url_hash'])==str(res[4]) and \
                    str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]):

                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=GETDATE() " \
                      "where store_page_url = '"+ str(res[3])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \
                      "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0])
                #logging.info(sql)
                self.cur.execute(sql)
            else:

                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \
                      "rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \
                      "'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \
                      "updatedat=GETDATE(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (auditid,id,rce_source_store_status," \
                      "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select "+str(id_aud)+", id,rce_source_store_status," \
                      "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
                      ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'"
                #logging.info(sql)

                self.cur.execute(sql)

    def rce_ratings_reviews(self, data):

        # Strip single quotes from the username before it is embedded in any SQL literal.
        data['username'] = data['username'].replace("'","")

        sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
            where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
            """

        self.cur.execute(sql)
        res = self.cur.fetchone()

        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('review_tab'))
        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('review_tab'))

        if not res:

            sql = f"""
                insert into {self.config.get('crawler_schema')}.{self.config.get('review_tab')}(id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating)
                values({id_main},{data['rce_product_id']},'{data['username']}','{data['review']}','{data['img_url']}',{data['review_like_count']},'{data['user_tier']}',{data['shop_id']},'{data['video_url']}',{data['rating']})
                """

            logging.info(sql)

            self.cur.execute(sql)
            sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat)
                select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
                where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
                """

            logging.info(sql)

            self.cur.execute(sql)

        else:

            if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \
                    str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \
                    str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]):

                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=GETDATE() " \
                      "where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'"
                logging.info(sql)
                self.cur.execute(sql)

                sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \
                      "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0])
                logging.info(sql)
                self.cur.execute(sql)
            else:

                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \
                      "username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \
                      "'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \
                      "shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=GETDATE() " \
                      "where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'"
                logging.info(sql)
                self.cur.execute(sql)

                sql = f"""
                    insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat)
                    select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
                    where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
                    """
                logging.info(sql)

                self.cur.execute(sql)
|
||||||
|
|
||||||
|
    def rce_seo(self, data):

        sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
            where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
            """

        self.cur.execute(sql)
        res = self.cur.fetchone()

        id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('seo_tab'))
        id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('seo_tab'))

        if not res:

            sql = f"""
                insert into {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}(id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots)
                values({id_main},{data['rce_product_id']},{data['rce_source_id']},'{data['seo_title']}','{data['seo_description']}','{data['seo_url']}','{data['seo_url_hash']}','{data['seo_image']}',{data['seo_price_amount']},'{data['seo_price_currency']}','{data['seo_product_band']}','{data['seo_product_availability']}','{data['seo_product_category']}',
                '{data['seo_product_condition']}',{data['seo_product_retailer_item_id']},'{data['seo_product_robots']}')
                """

            logging.info(sql)

            self.cur.execute(sql)

            sql = f"""
                insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat)
                select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
                where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
                """

            logging.info(sql)

            self.cur.execute(sql)

        else:

            if str(data['rce_product_id']) == str(res[1]) and str(data['rce_source_id']) == str(res[2]) and str(data['seo_title']) == str(res[3]) and \
                    str(data['seo_description']) == str(res[4]) and str(data['seo_url']) == str(res[5]) and str(data['seo_url_hash']) == str(res[6]) and \
                    str(data['seo_image']) == str(res[7]) and str(data['seo_price_amount']) == str(res[8]) and str(data['seo_price_currency']) == str(res[9]) and \
                    str(data['seo_product_band']) == str(res[10]) and str(data['seo_product_availability']) == str(res[11]) and str(data['seo_product_category']) == str(res[12]) and \
                    str(data['seo_product_condition']) == str(res[13]) and str(data['seo_product_retailer_item_id']) == str(res[14]) and str(data['seo_product_robots']) == str(res[15]):

                sql = "update " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " set updatedat=GETDATE() " \
                      "where rce_product_id = " + str(res[1]) + " and rce_source_id = " + str(data['rce_source_id'])
                logging.info(sql)
                self.cur.execute(sql)

                sql = "update " + self.config.get('crawler_schema') + ".aud_" + self.config.get('seo_tab') + " a set updatedat=b.updatedat " \
                      "from " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " b where a.id=b.id and b.id = " + str(res[0])
                logging.info(sql)
                self.cur.execute(sql)
            else:

                sql = f"""
                    update {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} set rce_product_id={data['rce_product_id']}, rce_source_id={data['rce_source_id']}, seo_title='{data['seo_title']}', seo_description='{data['seo_description']}',
                    seo_url='{data['seo_url']}', seo_url_hash='{data['seo_url_hash']}', seo_image='{data['seo_image']}', seo_price_amount='{data['seo_price_amount']}', seo_price_currency='{data['seo_price_currency']}', seo_product_band='{data['seo_product_band']}',
                    seo_product_availability='{data['seo_product_availability']}', seo_product_category='{data['seo_product_category']}', seo_product_condition='{data['seo_product_condition']}', seo_product_retailer_item_id={data['seo_product_retailer_item_id']},
                    seo_product_robots='{data['seo_product_robots']}' where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
                    """

                logging.info(sql)
                self.cur.execute(sql)

                sql = f"""
                    insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat)
                    select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
                    where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
                    """
                logging.info(sql)

                self.cur.execute(sql)

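    # Editor's note: illustrative sketch only, not part of the original commit. The updates above
    # interpolate values straight into SQL strings, which is why callers strip single quotes with
    # .replace("'", ""). A parameterized query avoids that. This sketch assumes the existing psycopg2
    # cursor (self.cur) and the same config keys; the helper name _touch_review is hypothetical and
    # not used anywhere else. Schema/table names still have to be interpolated because they cannot
    # be bound as query parameters.
    def _touch_review(self, product_id, username):
        schema = self.config.get('crawler_schema')
        table = self.config.get('review_tab')
        sql = f"update {schema}.{table} set updatedat = GETDATE() where rce_product_id = %s and username = %s"
        self.cur.execute(sql, (product_id, username))
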
    # def rce_ratings_reviews_productmodels(self,data):
    #
    #     sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id'])
    #     self.cur.execute(sql)
    #     res = self.cur.fetchone()
    #
    #     if not res:
    #
    #         sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \
    #               "values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')"
    #         #logging.info(sql)
    #
    #         self.cur.execute(sql)
    #
    #         sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
    #               "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
    #               ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+""
    #         #logging.info(sql)
    #
    #         self.cur.execute(sql)
    #
    #     else:
    #
    #         if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]):
    #
    #             sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=GETDATE() " \
    #                   "where rce_rating_id = "+str(res[1])
    #             #logging.info(sql)
    #             self.cur.execute(sql)
    #
    #             sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \
    #                   "from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0])
    #             #logging.info(sql)
    #             self.cur.execute(sql)
    #         else:
    #
    #             sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \
    #                   "updatedat=GETDATE() where rce_source_store_id = "+str(res[1])
    #             #logging.info(sql)
    #             self.cur.execute(sql)
    #
    #             sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
    #                   "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
    #                   ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+""
    #             #logging.info(sql)
    #
    #             self.cur.execute(sql)
    #
    #
    # def rce_tags(self,data):
    #
    #     sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'"
    #     self.cur.execute(sql)
    #     res = self.cur.fetchone()
    #
    #     if not res:
    #
    #         sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \
    #               "values("+str(data['id'])+",'"+str(data['description'])+"')"
    #         #logging.info(sql)
    #
    #         self.cur.execute(sql)
    #
    #         sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
    #               "createdat,updatedat) select id,description,createdat,updatedat from " \
    #               ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
    #         #logging.info(sql)
    #
    #         self.cur.execute(sql)
    #
    #     else:
    #
    #         if str(data['description'])==str(res[1]):
    #
    #             sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=GETDATE() " \
    #                   "where description = '"+str(res[1])+"'"
    #             #logging.info(sql)
    #             self.cur.execute(sql)
    #
    #             sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \
    #                   "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
    #             #logging.info(sql)
    #             self.cur.execute(sql)
    #         else:
    #
    #             sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \
    #                   "updatedat=GETDATE() where description = "+str(res[1])
    #             #logging.info(sql)
    #             self.cur.execute(sql)
    #
    #             sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
    #                   "createdat,updatedat) select id,description,createdat,updatedat from " \
    #                   ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'"
    #             #logging.info(sql)
    #
    #             self.cur.execute(sql)
    #
    #
    # def rce_ratings_reviews_producttags(self,data):
    #
    #     sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'"
    #     self.cur.execute(sql)
    #     res = self.cur.fetchone()
    #
    #     if not res:
    #
    #         sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \
    #               "values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')"
    #         #logging.info(sql)
    #
    #         self.cur.execute(sql)
    #
    #         sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
    #               "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
    #               ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
    #         #logging.info(sql)
    #
    #         self.cur.execute(sql)
    #
    #     else:
    #
    #         if str(data['rce_rating_id'])==str(res[1]):
    #
    #             sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=GETDATE() " \
    #                   "where rce_rating_id = '"+str(res[1])+"'"
    #             #logging.info(sql)
    #             self.cur.execute(sql)
    #
    #             sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \
    #                   "from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
    #             #logging.info(sql)
    #             self.cur.execute(sql)
    #         else:
    #
    #             sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \
    #                   "updatedat=GETDATE() where rce_rating_id = "+str(res[1])
    #             #logging.info(sql)
    #             self.cur.execute(sql)
    #
    #             sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
    #                   "createdat,updatedat) select id,description,createdat,updatedat from " \
    #                   ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'"
    #             #logging.info(sql)
    #
    #             self.cur.execute(sql)

@ -0,0 +1,454 @@
import hashlib
import logging
import random
import string
import time
import re
import psycopg2
from playwright.sync_api import sync_playwright
from deep_translator import GoogleTranslator
from hasaki_db_writer import hasaki_db_writer
import pandas as pd
from bs4 import BeautifulSoup
from Util import translate_text_to_english


class HasakiProductInfo:

    def __init__(self, config):
        logging.info("Initializing HasakiProductInfo")
        self.pattern = r'[' + string.punctuation + ']'
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                     password=self.config.get('db_pass'), host=self.config.get('db_host'),
                                     port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.cur.execute(
            f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
        try:
            self.rce_source_id = self.cur.fetchone()[0]
        except:
            logging.info("Source tab is empty. Please check. Exiting.....")
            exit(1)

        self.db_writer = hasaki_db_writer(config)

    def __del__(self):
        print("Closing connection.....")
        self.conn.close()

    def start_processing(self):
        logging.info("Starting to collect product info from Hasaki........")

        logging.info("Fetching product list from DB......")

        sql = f"""
            select * from {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} where flag = 0
            order by categoryid, product_section, product_rank
            """

        self.cur.execute(sql)
        rows = self.cur.fetchall()
        logging.info("Found {} products.......".format(str(len(rows))))
        cnt = 1
        for row in rows:
            logging.info("========= Fetching product info {}/{}: {} =========".format(str(cnt), str(len(rows)), row[3]))

            try:
                self.get_product_info(row)
            except:
                pass

            sql = f"""
                update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1
                where categoryid={row[9]} and product_section='{row[1]}' and product_rank={row[8]} and product_url='{row[3]}'
                """
            self.cur.execute(sql)

            cnt += 1

    def get_product_info(self, data):

        raw_data = self.get_raw_product_data(data[3])

        print(raw_data)

        if raw_data:
            self.product_info(data, raw_data)

            self.rating_info(raw_data)

            self.seo_info(raw_data)

    def get_raw_product_data(self, url):
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(
                user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
            page = context.new_page()

            page.goto(url)

            with page.expect_response("**/wap/v2/product/detail**") as response:
                api_requests = response.value.json()

            browser.close()

            return api_requests

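    # Editor's note: illustrative sketch only, not part of the original crawler. In the method above,
    # page.goto(url) finishes before page.expect_response(...) starts listening, so the detail API call
    # can be missed if it fires during navigation. The usual Playwright pattern is to open the
    # expectation first and navigate inside it; the alternative method name below is hypothetical and
    # not referenced elsewhere in this commit.
    def get_raw_product_data_alt(self, url):
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_context().new_page()
            # Start listening before navigating so the response is not missed.
            with page.expect_response("**/wap/v2/product/detail**") as response_info:
                page.goto(url)
            data = response_info.value.json()
            browser.close()
            return data
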
    def product_info(self, data, raw_data):

        #region rce_brand

        data_brand = {}

        data_brand['rce_source_id'] = self.rce_source_id
        data_brand['rce_source_brand_status'] = 1
        data_brand['rce_source_brand_id'] = 0
        data_brand['brand_page_url'] = ""
        data_brand['brand_page_url_hash'] = ""
        data_brand['brand_name'] = ""
        data_brand['brand_following'] = ""
        data_brand['brand_rating'] = ""

        try:

            data_brand['rce_source_brand_id'] = raw_data['brand']['id']

            try:
                data_brand['brand_page_url'] = "https://hasaki.vn/" + raw_data['brand']['url'] + ".html"
                data_brand['brand_page_url'] = str(data_brand['brand_page_url']).replace("'", "")
                data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest()
            except:
                pass

            try:
                data_brand['brand_name'] = translate_text_to_english(str(raw_data['brand']['name']).replace("'", ""))
            except:
                pass

            try:
                data_brand['brand_following'] = raw_data['brand']['following']
            except:
                pass

            try:
                data_brand['brand_rating'] = raw_data['brand']['rating']
            except:
                pass

            try:
                self.db_writer.rce_brand(data_brand)
            except Exception as e:
                logging.info(e)
        except:
            pass

        #endregion

        #region rce_product

        data_product = {}

        try:

            data_product['rce_source_product_id'] = raw_data['id']
            data_product['rce_source_id'] = self.rce_source_id
            data_product['rce_source_product_status'] = 1
            data_product['product_page_url'] = str(raw_data['url']).replace("'", "")
            data_product['product_page_url_hash'] = hashlib.md5(data_product['product_page_url'].encode('utf-8')).hexdigest()
            data_product['rce_category_id'] = data[9]
            data_product['rce_store_id'] = 0

            data_product['rce_source_product_name'] = str(raw_data['name']) + str(raw_data['alt_name'])
            data_product['rce_source_product_name'] = translate_text_to_english(str(re.sub(self.pattern, '', data_product['rce_source_product_name'])))
            data_product['rce_source_product_name'] = str(data_product['rce_source_product_name']).replace("'", "")

            data_product['product_images'] = data[4]

            data_product['product_description'] = ""
            try:

                description_raw = raw_data['description']
                soup = BeautifulSoup(description_raw, 'html.parser')
                data_product['product_description'] = translate_text_to_english(re.sub(self.pattern, '', soup.get_text()).replace("'", ""))
                data_product['product_description'] = str(data_product['product_description']).replace("'", "")
            except:
                pass

            data_product['rce_brand_id'] = ""
            try:
                sql = f"""
                    select id from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where
                    rce_source_id = {self.rce_source_id} and rce_source_brand_id = {raw_data['brand']['id']}
                    """
                self.cur.execute(sql)
                res = self.cur.fetchone()
                data_product['rce_brand_id'] = res[0]
            except:
                pass

            data_product['product_sold_total'] = 0

            data_product['product_sold'] = 0
            try:
                data_product['product_sold'] = raw_data['bought']
            except:
                pass

            data_product['product_price_min'] = 0
            data_product['product_price_max'] = 0
            try:
                data_product['product_price_min'] = raw_data['int_final_price']
                data_product['product_price_max'] = raw_data['int_final_price']
            except:
                pass

            data_product['product_price_min_before_discount'] = 0
            data_product['product_price_max_before_discount'] = 0
            try:
                data_product['product_price_min_before_discount'] = raw_data['price']
                data_product['product_price_max_before_discount'] = raw_data['price']
            except:
                pass

            data_product['ratings'] = 0.0
            try:
                data_product['ratings'] = raw_data['rating']['avg_rate']
            except:
                pass

            data_product['ships_from'] = ""
            data_product['product_section'] = data[1]
            data_product['countryoforigin'] = ""
            data_product['rank'] = data[8]

            try:
                self.db_writer.rce_product(data_product)
            except Exception as e:
                logging.info(e)

            #region rce_product_variant

            variant_items = raw_data['attribute']['items']

            df_variant = pd.DataFrame({}, columns=['product_variant_name', 'rce_source_variant_id', 'product_variant_price',
                                                   'product_variant_stock', 'product_variant_sku'])

            data_variant = {}
            for variant in variant_items:
                for item in variant['options']:
                    data_variant['product_variant_name'] = item['long_label']
                    for product in item['products']:
                        data_variant['rce_source_variant_id'] = product['id']
                        data_variant['product_variant_price'] = product['price']
                        data_variant['product_variant_stock'] = product['quantity']
                        data_variant['product_variant_sku'] = product['sku']

                        # variants_arr.append(data_variant)

                        tmp = pd.DataFrame([[data_variant['product_variant_name'],
                                             data_variant['rce_source_variant_id'],
                                             data_variant['product_variant_price'],
                                             data_variant['product_variant_stock'],
                                             data_variant['product_variant_sku']]],
                                           columns=['product_variant_name', 'rce_source_variant_id',
                                                    'product_variant_price',
                                                    'product_variant_stock', 'product_variant_sku'])
                        df_variant = pd.concat([df_variant, tmp])

            df_variant_merged = df_variant.groupby('product_variant_sku').agg({
                'product_variant_name': ' '.join,
                'rce_source_variant_id': 'first',
                'product_variant_price': 'first',
                'product_variant_stock': 'first'
            }).reset_index()

            #print(df_variant_merged.to_string())

            for index, row in df_variant_merged.iterrows():
                try:
                    data_variant = {}

                    data_variant['rce_source_variant_id'] = row['rce_source_variant_id']
                    data_variant['product_variant_name'] = translate_text_to_english(row['product_variant_name'])
                    data_variant['product_variant_name'] = re.sub(self.pattern, '', data_variant['product_variant_name']).replace("'", "")
                    data_variant['product_variant_price'] = row['product_variant_price']
                    data_variant['product_variant_price_before_discount'] = 0
                    data_variant['product_variant_stock'] = row['product_variant_stock']
                    data_variant['product_variant_sku'] = row['product_variant_sku']

                    data_variant['rce_product_id'] = ""

                    sql = f"""
                        select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where
                        rce_source_product_id = {data_product['rce_source_product_id']} and rce_source_id = {data_product['rce_source_id']}
                        """
                    self.cur.execute(sql)
                    data_variant['rce_product_id'] = self.cur.fetchone()[0]

                    try:
                        self.db_writer.rce_product_variant(data_variant)
                    except Exception as e:
                        logging.info(e)
                except:
                    pass

            #endregion

        except:
            pass

        #endregion

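    # Editor's note: illustrative sketch only, not part of the original crawler. It shows what the
    # groupby/agg in product_info does when a SKU appears under several variant options: the option
    # labels are concatenated per SKU while id, price and stock keep their first value. The helper
    # name below is hypothetical and never called.
    @staticmethod
    def _demo_variant_merge():
        import pandas as pd
        df = pd.DataFrame([
            {'product_variant_name': 'Red', 'rce_source_variant_id': 1, 'product_variant_price': 100, 'product_variant_stock': 5, 'product_variant_sku': 'SKU1'},
            {'product_variant_name': '50ml', 'rce_source_variant_id': 1, 'product_variant_price': 100, 'product_variant_stock': 5, 'product_variant_sku': 'SKU1'},
        ])
        merged = df.groupby('product_variant_sku').agg({
            'product_variant_name': ' '.join,
            'rce_source_variant_id': 'first',
            'product_variant_price': 'first',
            'product_variant_stock': 'first'
        }).reset_index()
        # merged has one row: product_variant_sku='SKU1', product_variant_name='Red 50ml'
        return merged
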
    def rating_info(self, raw_data):

        try:

            reviews1 = []
            reviews2 = []

            try:
                reviews1 = raw_data['short_rating_data']['image_reviews']
            except:
                pass

            try:
                reviews2 = raw_data['short_rating_data']['reviews']
            except:
                pass

            reviews = reviews1 + reviews2

            for review in reviews:
                data_review = {}

                data_review["rce_product_id"] = ""
                data_review["username"] = ""
                data_review["review"] = ""
                data_review["img_url"] = ""
                data_review["review_like_count"] = 0
                data_review["user_tier"] = ""
                data_review["shop_id"] = 0
                data_review["video_url"] = ""
                data_review["rating"] = ""

                sql = f"""
                    select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where
                    rce_source_product_id = {raw_data['id']} and rce_source_id = {self.rce_source_id}
                    """
                self.cur.execute(sql)
                data_review["rce_product_id"] = self.cur.fetchone()[0]

                try:
                    data_review["username"] = str(review['user_fullname']).replace("'", "")
                except:
                    pass

                try:
                    data_review["review"] = translate_text_to_english(review['content']).replace("'", "")
                except:
                    pass

                try:
                    data_review["rating"] = review['rating']['star']
                except:
                    pass

                try:
                    self.db_writer.rce_ratings_reviews(data_review)
                except Exception as e:
                    logging.info(e)
        except Exception as e:
            logging.info(e)

    def seo_info(self, raw_data):

        try:
            data_seo = {}

            data_seo['rce_product_id'] = 0
            data_seo['rce_source_id'] = self.rce_source_id
            data_seo['seo_title'] = ""
            data_seo['seo_description'] = ""
            data_seo['seo_url'] = ""
            data_seo['seo_url_hash'] = ""
            data_seo['seo_image'] = ""
            data_seo['seo_price_amount'] = 0
            data_seo['seo_price_currency'] = ""
            data_seo['seo_product_band'] = ""
            data_seo['seo_product_availability'] = ""
            data_seo['seo_product_category'] = ""
            data_seo['seo_product_condition'] = ""
            data_seo['seo_product_retailer_item_id'] = 0
            data_seo['seo_product_robots'] = ""

            sql = f"""
                select id from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where
                rce_source_product_id = {raw_data['id']} and rce_source_id = {self.rce_source_id}
                """
            self.cur.execute(sql)
            data_seo['rce_product_id'] = self.cur.fetchone()[0]

            try: data_seo['seo_title'] = translate_text_to_english(raw_data['seo']['og:title']).replace("'", "")
            except: pass

            try: data_seo['seo_description'] = translate_text_to_english(raw_data['seo']['og:description']).replace("'", "")
            except: pass

            try: data_seo['seo_url'] = str(raw_data['seo']['og:url']).replace("'", "")
            except: pass

            try: data_seo['seo_image'] = str(raw_data['seo']['og:image']).replace("'", "")
            except: pass

            try: data_seo['seo_price_amount'] = raw_data['seo']['price:amount']
            except: pass

            try: data_seo['seo_price_currency'] = str(raw_data['seo']['price:currency']).replace("'", "")
            except: pass

            try: data_seo['seo_product_band'] = translate_text_to_english(raw_data['seo']['product:band']).replace("'", "")
            except: pass

            try: data_seo['seo_product_availability'] = str(raw_data['seo']['product:availability']).replace("'", "")
            except: pass

            try: data_seo['seo_product_category'] = translate_text_to_english(raw_data['seo']['product:category']).replace("'", "")
            except: pass

            try: data_seo['seo_product_condition'] = translate_text_to_english(raw_data['seo']['product:condition']).replace("'", "")
            except: pass

            try: data_seo['seo_product_retailer_item_id'] = raw_data['seo']['product:retailer_item_id']
            except: pass

            try: data_seo['seo_product_robots'] = raw_data['seo']['product:robots']
            except: pass

            try:
                self.db_writer.rce_seo(data_seo)
            except Exception as e:
                logging.info(e)

        except:
            pass

@ -0,0 +1,63 @@
import time
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
import pandas as pd

# Launch the Playwright browser in mobile mode
with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    context = browser.new_context(user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
    page = context.new_page()

    page.goto("https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi-mat-l-oreal-3-in-1-danh-cho-da-dau-da-hon-hop-400ml-19325.html")
    page.wait_for_load_state('load')
    # time.sleep(10)

    # Capture the underlying API request URL
    # api_requests = page.evaluate('''() => window.fetch('https://hasaki.vn/wap/v2/product/detail').then(response => response.json())''')
    # print(api_requests)

    with page.expect_response("**/wap/v2/product/detail**") as response:
        data = response.value.json()

    variant_items = data['attribute']['items']

    df = pd.DataFrame({}, columns=['product_variant_name', 'rce_source_variant_id', 'product_variant_price', 'product_variant_stock', 'product_variant_sku'])

    data_variant = {}
    for variant in variant_items:
        for item in variant['options']:
            data_variant['product_variant_name'] = item['long_label']
            for product in item['products']:
                data_variant['rce_source_variant_id'] = product['id']
                data_variant['rce_product_id'] = ""
                data_variant['product_variant_price'] = product['price']
                data_variant['product_variant_price_before_discount'] = ""
                data_variant['product_variant_stock'] = product['quantity']
                data_variant['product_variant_sku'] = product['sku']

                # variants_arr.append(data_variant)

                tmp = pd.DataFrame([[data_variant['product_variant_name'], data_variant['rce_source_variant_id'], data_variant['product_variant_price'], data_variant['product_variant_stock'], data_variant['product_variant_sku']]],
                                   columns=['product_variant_name', 'rce_source_variant_id', 'product_variant_price',
                                            'product_variant_stock', 'product_variant_sku'])
                df = pd.concat([df, tmp])

                print(data_variant)

    df = df.sort_values(by=['product_variant_sku'])
    print(df.to_string())

    print("======================================")

    merged_df = df.groupby('product_variant_sku').agg({
        'product_variant_name': ' '.join,
        'rce_source_variant_id': 'first',
        'product_variant_price': 'first',
        'product_variant_stock': 'first'
    }).reset_index()

    print(merged_df.to_string())

    # Close the browser
    browser.close()
@ -0,0 +1,25 @@
import asyncio
from playwright.async_api import async_playwright

async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context()

        page = await context.new_page()

        # Enable request interception
        await page.route('https://hasaki.vn/wap/v2/product/detail', lambda route: route.continue_())

        # Navigate to the website URL
        await page.goto('https://hasaki.vn/san-pham/nuoc-hoa-hong-khong-mui-klairs-danh-cho-da-nhay-cam-180ml-65994.html')

        # Wait for the API request to be made (assumes the detail call fires after navigation completes)
        request = await page.wait_for_event('request', predicate=lambda req: 'v2/product/detail' in req.url)
        # request.response() is a coroutine that resolves to the matching Response
        response = await request.response()
        json_response = await response.json()

        print(json_response)

        await browser.close()

asyncio.run(main())
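
# Editor's note: illustrative alternative, not part of the original commit. If the product-detail
# request fires during page load, waiting for it only after page.goto() can time out. Playwright's
# expect_response context manager starts listening before navigation; this sketch assumes the same
# URL pattern and the hypothetical function below is not called anywhere.
async def fetch_detail_via_expect_response(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        # Start listening before navigating so the response is captured even during page load.
        async with page.expect_response(lambda r: 'wap/v2/product/detail' in r.url) as response_info:
            await page.goto(url)
        response = await response_info.value
        data = await response.json()
        await browser.close()
        return data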