From 085cd089475adeb09477f5930e9d6dd7bdaf3649 Mon Sep 17 00:00:00 2001
From: "shariar@raenabeauty.com" <shariar@raenabeauty.com>
Date: Mon, 1 Apr 2024 12:05:11 +0400
Subject: [PATCH] added Hasaki crawler

---
 hasaki_crawler_engine/hasaki_categories.py    | 14 ++---
 .../hasaki_category_products.py               |  8 +--
 hasaki_crawler_engine/hasaki_product_info.py  |  8 +--
 hasaki_crawler_engine/test.py                 | 51 -------------------
 hasaki_crawler_engine/test2.py                | 45 ----------------
 hasaki_crawler_engine/test3.py                | 34 -------------
 6 files changed, 15 insertions(+), 145 deletions(-)
 delete mode 100644 hasaki_crawler_engine/test.py
 delete mode 100644 hasaki_crawler_engine/test2.py
 delete mode 100644 hasaki_crawler_engine/test3.py

diff --git a/hasaki_crawler_engine/hasaki_categories.py b/hasaki_crawler_engine/hasaki_categories.py
index d5e4ed2..ea787c6 100644
--- a/hasaki_crawler_engine/hasaki_categories.py
+++ b/hasaki_crawler_engine/hasaki_categories.py
@@ -46,7 +46,7 @@ class HasakiCategories:
             logging.info(e)
 
     def __del__(self):
-        print("Closing connection.....")
+        logging.info("Closing connection.....")
         self.conn.close()
 
 
@@ -85,42 +85,42 @@ class HasakiCategories:
 
 
     def crawl_and_track(self, parent, url_to_visit):
         self.master_category.append((0,"0", parent, url_to_visit))
 
-        print(self.master_category)
+        logging.info(self.master_category)
 
         cats = self.crawl_categories(parent, url_to_visit)
 
         time.sleep(10)
 
         if cats:
             for cat in cats:
                 self.master_category.append((1,)+(cat))
-                print((1,)+(cat))
+                logging.info((1,)+(cat))
                 sub_cats1 = self.crawl_categories(cat[1], cat[2])
                 time.sleep(3)
                 if sub_cats1:
                     for sub_cat1 in sub_cats1:
                         self.master_category.append((2,) + (sub_cat1))
-                        print((2,) + (sub_cat1))
+                        logging.info((2,) + (sub_cat1))
                         sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
                         time.sleep(3)
                         if sub_cats2:
                             for sub_cat2 in sub_cats2:
                                 self.master_category.append((3,) + (sub_cat2))
-                                print((3,) + (sub_cat2))
+                                logging.info((3,) + (sub_cat2))
                                 sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
                                 time.sleep(3)
                                 if sub_cats3:
                                     for sub_cat3 in sub_cats3:
                                         self.master_category.append((4,) + (sub_cat3))
-                                        print((4,) + (sub_cat3))
+                                        logging.info((4,) + (sub_cat3))
                                         sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
                                         time.sleep(3)
                                         if sub_cats4:
                                             for sub_cat4 in sub_cats4:
                                                 self.master_category.append((4,) + (sub_cat4))
-                                                print((5,) + (sub_cat4))
+                                                logging.info((5,) + (sub_cat4))
 
 
     def crawl_categories(self, parent, url_to_visit):
diff --git a/hasaki_crawler_engine/hasaki_category_products.py b/hasaki_crawler_engine/hasaki_category_products.py
index 0be34d4..2cabf09 100644
--- a/hasaki_crawler_engine/hasaki_category_products.py
+++ b/hasaki_crawler_engine/hasaki_category_products.py
@@ -39,7 +39,7 @@ class HasakiCategoryProducts:
             self.display.start()
 
     def __del__(self):
-        print("Closing connection.....")
+        logging.info("Closing connection.....")
         self.conn.close()
 
     def start_processing(self):
@@ -80,7 +80,7 @@ class HasakiCategoryProducts:
 
                 for element in top_search_element:
                     url = element.query_selector(".top_big_search").query_selector('a').get_attribute('href').strip()
-                    print(url)
+                    logging.info(url)
 
                 browser.close()
 
@@ -192,7 +192,7 @@ class HasakiCategoryProducts:
                                 logging.info("Product already present. skipping.....")
 
                         except Exception as e:
-                            print(e)
+                            logging.info(e)
 
                         item_count += 1
 
@@ -202,7 +202,7 @@ class HasakiCategoryProducts:
 
                 browser.close()
 
         except Exception as e:
-            print(e)
+            logging.info(e)
 
 
diff --git a/hasaki_crawler_engine/hasaki_product_info.py b/hasaki_crawler_engine/hasaki_product_info.py
index d21bad9..d457ffa 100644
--- a/hasaki_crawler_engine/hasaki_product_info.py
+++ b/hasaki_crawler_engine/hasaki_product_info.py
@@ -52,7 +52,7 @@ class HasakiProductInfo:
             self.display.start()
 
     def __del__(self):
-        print("Closing connection.....")
+        logging.info("Closing connection.....")
        self.conn.close()
 
 
@@ -96,7 +96,7 @@ class HasakiProductInfo:
 
             raw_data = self.get_raw_product_data(data[3])
 
-            print(raw_data)
+            logging.info(raw_data)
 
             if raw_data:
                 self.product_info(data, raw_data)
@@ -174,7 +174,7 @@ class HasakiProductInfo:
             if request.response:
                 if '/wap/v2/product/detail' in request.url:
                     encoding = request.response.headers.get('content-encoding')
-                    # print(encoding)
+                    # logging.info(encoding)
                     if encoding:
                         iteminfo = brotli.decompress(request.response.body)
                     else:
@@ -358,7 +358,7 @@ class HasakiProductInfo:
             'product_variant_stock': 'first'
         }).reset_index()
 
-        #print(df_variant_merged.to_string())
+        #logging.info(df_variant_merged.to_string())
 
         for index, row in df_variant_merged.iterrows():
             try:
diff --git a/hasaki_crawler_engine/test.py b/hasaki_crawler_engine/test.py
deleted file mode 100644
index 00c0bcc..0000000
--- a/hasaki_crawler_engine/test.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import time
-import logging
-
-import playwright
-from fake_useragent import UserAgent
-from playwright.sync_api import sync_playwright
-
-def get_raw_product_data(url):
-    retries = 2
-    for _ in range(retries):
-        try:
-            with sync_playwright() as p:
-                browser = p.chromium.launch(
-                    headless=False,
-                    args=[
-                        "--disable-dev-shm-usage",
-                        "--disable-blink-features=AutomationControlled",
-                        "--disable-component-extensions-with-background-pages"
-                    ]
-                )
-                ua = UserAgent()
-                random_mobile_ua = ua.random
-                logging.info("Using user agent: {}".format(random_mobile_ua))
-
-                context = browser.new_context(user_agent=random_mobile_ua)
-                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
-                page = context.new_page()
-
-                api_requests = {}
-
-                try:
-                    page.goto(url, timeout=5000)
-                    time.sleep(1)
-                    page.reload()
-                    with page.expect_response("**/wap/v2/product/detail**") as response:
-                        api_requests = response.value.json()
-                except playwright._impl._errors.TimeoutError:
-                    logging.info("Timeout occurred. Retrying.....")
-                    continue  # Retry without closing the browser
-                finally:
-                    browser.close()
-
-                return api_requests
-        except Exception as e:
-            logging.error(f"An error occurred: {str(e)}")
-            logging.info("Retrying...")
-
-    return None
-
-
-print(get_raw_product_data("https://hasaki.vn/san-pham/mat-na-naruko-y-di-nhan-do-duong-sang-da-25ml-moi-92613.html"))
\ No newline at end of file
diff --git a/hasaki_crawler_engine/test2.py b/hasaki_crawler_engine/test2.py
deleted file mode 100644
index 3e3fe57..0000000
--- a/hasaki_crawler_engine/test2.py
+++ /dev/null
@@ -1,45 +0,0 @@
-from seleniumwire import webdriver
-from selenium.webdriver.chrome.service import Service
-from webdriver_manager.chrome import ChromeDriverManager
-from fake_useragent import UserAgent
-import brotli
-import json
-
-
-def get_raw_product(url):
-    ua = UserAgent(platforms='mobile')
-    random_mobile_ua = ua.random
-
-    op = webdriver.ChromeOptions()
-    op.add_argument(f"user-agent={random_mobile_ua}")
-    op.add_experimental_option("useAutomationExtension", False)
-    op.add_argument('--no-sandbox')
-    op.add_argument('--disable-notifications')
-    op.add_argument("--lang=en-GB")
-    op.headless = False
-
-    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
-
-    driver.get(url)
-
-    iteminfo = ""
-
-    for request in driver.requests:
-        if request.response:
-            if '/wap/v2/product/detail' in request.url:
-                encoding = request.response.headers.get('content-encoding')
-                # print(encoding)
-                if encoding:
-                    iteminfo = brotli.decompress(request.response.body)
-                else:
-                    iteminfo = request.response.body
-
-
-
-    driver.quit()
-
-    iteminfo_json = json.loads(iteminfo)
-    return iteminfo_json
-
-
-print(get_raw_product('https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html'))
\ No newline at end of file
diff --git a/hasaki_crawler_engine/test3.py b/hasaki_crawler_engine/test3.py
deleted file mode 100644
index e0c5ab1..0000000
--- a/hasaki_crawler_engine/test3.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import asyncio
-from playwright.async_api import async_playwright
-
-async def capture_api_response(url):
-    async with async_playwright() as p:
-        browser = await p.chromium.launch()
-        context = await browser.new_context(user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1")
-        page = await context.new_page()
-
-        async def capture_and_retry():
-            response = None
-            retry_count = 0
-            while not response and retry_count < 3:  # Retry up to 3 times
-                try:
-                    await page.goto(url)
-                    response = await page.expect_response(lambda resp: "wap/v2/product/detail" in resp.url)
-                    if not response:
-                        print(f"No API response received. Retrying...")
-                        retry_count += 1
-                        await asyncio.sleep(5)  # Retry after 5 seconds
-                except Exception as e:
-                    print(f"Error occurred: {e}")
-                    retry_count += 1
-                    await asyncio.sleep(5)  # Retry after 5 seconds
-
-            if response:
-                print(f"API response captured: {await response.text()}")
-                # Handle the API response here
-            else:
-                print("No API response received after multiple attempts.")
-
-        await capture_and_retry()
-
-asyncio.run(capture_api_response("https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html"))