import hashlib
import logging
import time

import psycopg2
import pandas as pd
from pyvirtualdisplay import Display
from playwright.sync_api import sync_playwright

from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english

###### Logger ######
logname = '/home/ubuntu/logs/hasaki_crawler.log'
#logname = 'hasaki_crawler.log'
logging.basicConfig(filename=logname,
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)

class HasakiCategories:

    def __init__(self, config):
        logging.info("Initializing HasakiCategories")
        self.master_category = []
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                     password=self.config.get('db_pass'), host=self.config.get('db_host'),
                                     port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.cur.execute(f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
        try:
            self.rce_source_id = self.cur.fetchone()[0]
        except TypeError:
            # fetchone() returned None: no 'Hasaki' row in the source table
            logging.info("Source tab is empty. Please check. Exiting.....")
            exit(1)

        self.db_writer = hasaki_db_writer(config)

        # Run the browser inside a virtual display so Playwright works on a headless server
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
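
    # Note: the config dict is assumed (from the lookups in __init__ above) to
    # provide at least: crawler_name, product_per_category, database, db_user,
    # db_pass, db_host, db_port, crawler_schema, source_tab.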

    def __del__(self):
        print("Closing connection.....")
        self.conn.close()

    def start_processing(self):
        # Seed the crawl from the top-level HEALTH - BEAUTY category
        self.crawl_and_track("HEALTH - BEAUTY", "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html")

        df = pd.DataFrame(self.master_category, columns=['Index', 'Parent', 'Name', 'Link'])
        df = df.sort_values('Index')
        df = df.drop_duplicates(subset='Name', keep='first')

        self.process_category(df)

        self.display.stop()
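
    # Illustrative shape of self.master_category after crawling (second row hypothetical):
    #   (0, "0",               "HEALTH - BEAUTY", "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html")
    #   (1, "HEALTH - BEAUTY", "Skin Care",       "https://hasaki.vn/danh-muc/...")
    # Sorting by 'Index' keeps shallower levels first, so drop_duplicates(keep='first')
    # retains the shallowest occurrence of each category name.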

    def process_category(self, category):
        for index, row in category.iterrows():
            data = {}

            data['parent_category_id'] = 0
            data['rce_source_id'] = self.rce_source_id
            data['rce_source_category_id'] = 0
            data['rce_source_status'] = 1
            # strip single quotes so the names embed cleanly in downstream SQL
            data['category_name'] = str(row["Name"]).replace("'", "")
            data['category_page_url'] = row["Link"]
            data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
            data['category_parent_name'] = str(row["Parent"]).replace("'", "")

            self.db_writer.rce_category(data)
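
    # For reference, hashlib.md5(url.encode('utf-8')).hexdigest() is a 32-character
    # hex string, presumably used as a fixed-length lookup key for the page URL.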

    def crawl_and_track(self, parent, url_to_visit):
        # Each entry is (depth, parent_name, category_name, category_url)
        self.master_category.append((0, "0", parent, url_to_visit))
        print(self.master_category)

        cats = self.crawl_categories(parent, url_to_visit)
        time.sleep(10)
        if cats:
            for cat in cats:
                self.master_category.append((1,) + cat)
                print((1,) + cat)

                sub_cats1 = self.crawl_categories(cat[1], cat[2])
                time.sleep(3)
                if sub_cats1:
                    for sub_cat1 in sub_cats1:
                        self.master_category.append((2,) + sub_cat1)
                        print((2,) + sub_cat1)

                        sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
                        time.sleep(3)
                        if sub_cats2:
                            for sub_cat2 in sub_cats2:
                                self.master_category.append((3,) + sub_cat2)
                                print((3,) + sub_cat2)

                                sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
                                time.sleep(3)
                                if sub_cats3:
                                    for sub_cat3 in sub_cats3:
                                        self.master_category.append((4,) + sub_cat3)
                                        print((4,) + sub_cat3)

                                        sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
                                        time.sleep(3)
                                        if sub_cats4:
                                            for sub_cat4 in sub_cats4:
                                                self.master_category.append((5,) + sub_cat4)
                                                print((5,) + sub_cat4)
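
    # The crawl above walks at most five levels below the seed category (depth
    # markers 0 through 5), re-invoking crawl_categories with each child's name
    # and URL and sleeping between requests to throttle the site.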

    def crawl_categories(self, parent, url_to_visit):
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            # context = browser.new_context(
            #     viewport={"width": 375, "height": 667, "isMobile": True}
            # )
            page = browser.new_page()

            # Load the webpage
            page.goto(url_to_visit)
            # page.goto('https://hasaki.vn/danh-muc/my-pham-high-end-c1907.html')

            page.wait_for_load_state('load')

            container_element = page.query_selector('.block_colaps_sticky.width_common.collaps_sticky')

            filtered_data = []

            if container_element:
                item_elements = container_element.query_selector_all('.item_fillter')
                content_elements = container_element.query_selector_all('.content_fillter')

                urls = []

                for item_element in item_elements:
                    text = item_element.query_selector('a').inner_text()
                    text = translate_text_to_english(text)
                    href = item_element.query_selector('a').get_attribute('href')
                    urls.append((parent, text, href))

                for content_element in content_elements:
                    text = content_element.query_selector('a').inner_text()
                    text = translate_text_to_english(text)
                    href = content_element.query_selector('a').get_attribute('href')
                    urls.append((parent, text, href))

                # removing previously collected data
                master_urls = [item[3] for item in self.master_category]
                filtered_data = [(parent, name, url) for parent, name, url in urls if url not in master_urls]

            # close the browser before returning so it is never left running
            browser.close()
            return filtered_data
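

# Illustrative usage sketch (not part of the original module): every config value
# below is an assumption; substitute real credentials and schema names.
if __name__ == "__main__":
    sample_config = {
        "crawler_name": "hasaki_crawler",   # hypothetical
        "product_per_category": "100",      # hypothetical
        "database": "rce_db",               # hypothetical
        "db_user": "postgres",              # hypothetical
        "db_pass": "postgres",              # hypothetical
        "db_host": "localhost",             # hypothetical
        "db_port": "5432",                  # hypothetical
        "crawler_schema": "public",         # hypothetical
        "source_tab": "rce_source",         # hypothetical
    }
    crawler = HasakiCategories(sample_config)
    crawler.start_processing()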