raena-crawler-engine/hasaki_crawler_engine/hasaki_categories.py

import hashlib
import logging
import time
import psycopg2
import pandas as pd
from pyvirtualdisplay import Display
from playwright.sync_api import sync_playwright
from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english

###### Logger ######
logging.basicConfig(filename="/home/ubuntu/logs/hasaki_crawler.log",
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)


class HasakiCategories:
    def __init__(self, config):
        logging.info("Initializing HasakiCategories")
        self.master_category = []
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                     password=self.config.get('db_pass'), host=self.config.get('db_host'),
                                     port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.cur.execute(f"""select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name='Hasaki'""")
        try:
            self.rce_source_id = self.cur.fetchone()[0]
        except TypeError:
            logging.error("Source tab is empty. Please check. Exiting.....")
            exit(1)
        self.db_writer = hasaki_db_writer(config)

        # Virtual display so Chromium can run on a server without a desktop session.
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
    def __del__(self):
        print("Closing connection.....")
        self.conn.close()
    def start_processing(self):
        self.crawl_and_track("HEALTH - BEAUTY", "https://hasaki.vn/danh-muc/suc-khoe-lam-dep-c3.html")
        df = pd.DataFrame(self.master_category, columns=['Index', 'Parent', 'Name', 'Link'])
        df = df.sort_values('Index')
        df = df.drop_duplicates(subset='Name', keep='first')
        self.process_category(df)

        self.display.stop()
    def process_category(self, category):
        for index, row in category.iterrows():
            data = {}
            data['parent_category_id'] = 0
            data['rce_source_id'] = self.rce_source_id
            data['rce_source_category_id'] = 0
            data['rce_source_status'] = 1
            data['category_name'] = str(row["Name"]).replace("'", "")
            data['category_page_url'] = row["Link"]
            data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
            data['category_parent_name'] = str(row["Parent"]).replace("'", "")
            self.db_writer.rce_category(data)

    def crawl_and_track(self, parent, url_to_visit):
        # Walks the category tree up to five levels deep. Each entry appended to
        # master_category is a tuple of (depth, parent_name, category_name, url).
        self.master_category.append((0, "0", parent, url_to_visit))
        print(self.master_category)
        cats = self.crawl_categories(parent, url_to_visit)
        time.sleep(10)
        if cats:
            for cat in cats:
                self.master_category.append((1,) + cat)
                print((1,) + cat)
                sub_cats1 = self.crawl_categories(cat[1], cat[2])
                time.sleep(3)
                if sub_cats1:
                    for sub_cat1 in sub_cats1:
                        self.master_category.append((2,) + sub_cat1)
                        print((2,) + sub_cat1)
                        sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
                        time.sleep(3)
                        if sub_cats2:
                            for sub_cat2 in sub_cats2:
                                self.master_category.append((3,) + sub_cat2)
                                print((3,) + sub_cat2)
                                sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
                                time.sleep(3)
                                if sub_cats3:
                                    for sub_cat3 in sub_cats3:
                                        self.master_category.append((4,) + sub_cat3)
                                        print((4,) + sub_cat3)
                                        sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
                                        time.sleep(3)
                                        if sub_cats4:
                                            for sub_cat4 in sub_cats4:
                                                self.master_category.append((5,) + sub_cat4)
                                                print((5,) + sub_cat4)

    def crawl_categories(self, parent, url_to_visit):
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            # context = browser.new_context(
            #     viewport={"width": 375, "height": 667, "isMobile": True}
            # )
            page = browser.new_page()

            # Load the category page and wait for it to finish loading.
            page.goto(url_to_visit)
            # page.goto('https://hasaki.vn/danh-muc/my-pham-high-end-c1907.html')
            page.wait_for_load_state('load')

            filtered_data = []
            container_element = page.query_selector('.block_colaps_sticky.width_common.collaps_sticky')
            if container_element:
                item_elements = container_element.query_selector_all('.item_fillter')
                content_elements = container_element.query_selector_all('.content_fillter')

                urls = []
                for item_element in item_elements:
                    text = item_element.query_selector('a').inner_text()
                    text = translate_text_to_english(text)
                    href = item_element.query_selector('a').get_attribute('href')
                    urls.append((parent, text, href))
                for content_element in content_elements:
                    text = content_element.query_selector('a').inner_text()
                    text = translate_text_to_english(text)
                    href = content_element.query_selector('a').get_attribute('href')
                    urls.append((parent, text, href))

                # Drop URLs that were already collected in earlier passes.
                master_urls = [item[3] for item in self.master_category]
                filtered_data = [(parent, name, url) for parent, name, url in urls if url not in master_urls]

            # Close the browser before returning the newly discovered categories.
            browser.close()
            return filtered_data
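

# Usage sketch: a minimal, hypothetical driver for this module. It assumes
# `config` is a plain dict; the keys mirror the ones read in __init__ above,
# but every value below is a placeholder, not a real deployment setting.
if __name__ == "__main__":
    sample_config = {
        "crawler_name": "hasaki_crawler",      # assumed name
        "product_per_category": "100",         # parsed with int() in __init__
        "database": "raena",                   # placeholder Postgres connection details
        "db_user": "postgres",
        "db_pass": "postgres",
        "db_host": "localhost",
        "db_port": "5432",
        "crawler_schema": "raena",             # assumed schema holding the source table
        "source_tab": "rce_source",            # assumed table with source_name='Hasaki'
    }

    crawler = HasakiCategories(sample_config)
    crawler.start_processing()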