import hashlib
import logging
import time

import psycopg2
import pandas as pd
from pyvirtualdisplay import Display
from playwright.sync_api import sync_playwright

from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english

###### Logger ######
logging.basicConfig(
    filename="/home/ubuntu/logs/hasaki_crawler.log",
    filemode='a',
    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO
)


class HasakiCategories:
    def __init__(self, config):
        try:
            logging.info("Initializing HasakiCategories")
            self.master_category = []
            self.config = config
            self.crawler_name = self.config.get("crawler_name")
            self.product_limit = int(self.config.get("product_per_category"))
            self.conn = psycopg2.connect(
                database=self.config.get('database'),
                user=self.config.get('db_user'),
                password=self.config.get('db_pass'),
                host=self.config.get('db_host'),
                port=self.config.get('db_port')
            )
            self.conn.autocommit = True
            self.cur = self.conn.cursor()
            self.cur.execute(
                f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'"""
            )
            try:
                self.rce_source_id = self.cur.fetchone()[0]
            except Exception:
                logging.info("Source tab is empty. Please check. Exiting.....")
                exit(1)
            self.db_writer = hasaki_db_writer(config)
            self.display = Display(visible=0, size=(800, 600))
            self.display.start()
        except Exception as e:
            logging.info(e)

    def __del__(self):
        logging.info("Closing connection.....")
        self.conn.close()

    def start_processing(self):
        # Crawl the full category tree starting from the root category page.
        self.crawl_and_track("HEALTH - BEAUTY", "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html")

        df = pd.DataFrame(self.master_category, columns=['Index', 'Parent', 'Name', 'Link'])
        df = df.sort_values('Index')
        df = df.drop_duplicates(subset='Name', keep='first')

        self.process_category(df)

        self.display.stop()

    def process_category(self, category):
        # Write each collected category row to the database.
        for index, row in category.iterrows():
            data = {}
            data['parent_category_id'] = 0
            data['rce_source_id'] = self.rce_source_id
            data['rce_source_category_id'] = 0
            data['rce_source_status'] = 1
            data['category_name'] = str(row["Name"]).replace("'", "")
            data['category_page_url'] = row["Link"]
            data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
            data['category_parent_name'] = str(row["Parent"]).replace("'", "")

            self.db_writer.rce_category(data)

    def crawl_and_track(self, parent, url_to_visit):
        # Walk the category tree down to five levels, recording
        # (depth, parent, name, link) tuples in self.master_category.
        self.master_category.append((0, "0", parent, url_to_visit))
        logging.info(self.master_category)

        cats = self.crawl_categories(parent, url_to_visit)
        time.sleep(10)
        if cats:
            for cat in cats:
                self.master_category.append((1,) + cat)
                logging.info((1,) + cat)
                sub_cats1 = self.crawl_categories(cat[1], cat[2])
                time.sleep(3)
                if sub_cats1:
                    for sub_cat1 in sub_cats1:
                        self.master_category.append((2,) + sub_cat1)
                        logging.info((2,) + sub_cat1)
                        sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
                        time.sleep(3)
                        if sub_cats2:
                            for sub_cat2 in sub_cats2:
                                self.master_category.append((3,) + sub_cat2)
                                logging.info((3,) + sub_cat2)
                                sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
                                time.sleep(3)
                                if sub_cats3:
                                    for sub_cat3 in sub_cats3:
                                        self.master_category.append((4,) + sub_cat3)
                                        logging.info((4,) + sub_cat3)
                                        sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
                                        time.sleep(3)
                                        if sub_cats4:
                                            for sub_cat4 in sub_cats4:
                                                self.master_category.append((5,) + sub_cat4)
                                                logging.info((5,) + sub_cat4)

    def crawl_categories(self, parent, url_to_visit):
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            # context = browser.new_context(
            #     viewport={"width": 375, "height": 667, "isMobile": True}
            # )
            page = browser.new_page()

            # Load the webpage
            page.goto(url_to_visit)
            # page.goto('https://hasaki.vn/danh-muc/my-pham-high-end-c1907.html')
            page.wait_for_load_state('load')

            filtered_data = []
            container_element = page.query_selector('.block_colaps_sticky.width_common.collaps_sticky')
            if container_element:
                item_elements = container_element.query_selector_all('.item_fillter')
                content_elements = container_element.query_selector_all('.content_fillter')

                urls = []
                for item_element in item_elements:
                    text = item_element.query_selector('a').inner_text()
                    text = translate_text_to_english(text)
                    href = item_element.query_selector('a').get_attribute('href')
                    urls.append((parent, text, href))

                for content_element in content_elements:
                    text = content_element.query_selector('a').inner_text()
                    text = translate_text_to_english(text)
                    href = content_element.query_selector('a').get_attribute('href')
                    urls.append((parent, text, href))

                # Remove links that were already collected in earlier passes.
                master_urls = [item[3] for item in self.master_category]
                filtered_data = [(parent, name, url) for parent, name, url in urls
                                 if url not in master_urls]

            # Close the browser before returning so it is not left running
            # when the container element is missing.
            browser.close()
            return filtered_data
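
# Minimal usage sketch (not part of the original module): shows how this class
# might be driven from a script. The config keys mirror those read in __init__
# above; the concrete values and the __main__ entry point are assumptions for
# illustration only.
if __name__ == "__main__":
    sample_config = {
        "crawler_name": "hasaki_categories",          # assumed value
        "product_per_category": "100",                # assumed value
        "database": "rce_db",                         # assumed value
        "db_user": "rce_user",                        # assumed value
        "db_pass": "secret",                          # assumed value
        "db_host": "localhost",                       # assumed value
        "db_port": "5432",                            # assumed value
        "crawler_schema": "crawler_schema",           # assumed value
        "source_tab": "rce_source",                   # assumed value
    }
    crawler = HasakiCategories(sample_config)
    crawler.start_processing()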