From fe52f64d4bf2dfc75bc00612dd97d6f39557bc09 Mon Sep 17 00:00:00 2001 From: "shariar@raenabeauty.com" Date: Sun, 17 Mar 2024 12:01:57 +0400 Subject: [PATCH] added Hasaki crawler --- hasaki_crawler_engine/hasaki_product_info.py | 3 +- hasaki_crawler_engine/test.py | 80 ++++++++++++-------- hasaki_crawler_engine/test3.py | 34 +++++++++ 3 files changed, 83 insertions(+), 34 deletions(-) create mode 100644 hasaki_crawler_engine/test3.py diff --git a/hasaki_crawler_engine/hasaki_product_info.py b/hasaki_crawler_engine/hasaki_product_info.py index a1e12f3..2695352 100644 --- a/hasaki_crawler_engine/hasaki_product_info.py +++ b/hasaki_crawler_engine/hasaki_product_info.py @@ -71,7 +71,7 @@ class HasakiProductInfo: cnt += 1 - time.sleep(random.randint(7, 23)) + #time.sleep(random.randint(7, 23)) def get_product_info(self, data): @@ -119,6 +119,7 @@ class HasakiProductInfo: except playwright._impl._errors.TimeoutError: logging.info("Timeout occurred. Retrying.....") page.reload() + continue with page.expect_response("**/wap/v2/product/detail**") as response: api_requests = response.value.json() finally: diff --git a/hasaki_crawler_engine/test.py b/hasaki_crawler_engine/test.py index 6eb517f..00c0bcc 100644 --- a/hasaki_crawler_engine/test.py +++ b/hasaki_crawler_engine/test.py @@ -1,37 +1,51 @@ -import playwright -from playwright.sync_api import sync_playwright -from fake_useragent import UserAgent +import time import logging +import playwright +from fake_useragent import UserAgent +from playwright.sync_api import sync_playwright + +def get_raw_product_data(url): + retries = 2 + for _ in range(retries): + try: + with sync_playwright() as p: + browser = p.chromium.launch( + headless=False, + args=[ + "--disable-dev-shm-usage", + "--disable-blink-features=AutomationControlled", + "--disable-component-extensions-with-background-pages" + ] + ) + ua = UserAgent() + random_mobile_ua = ua.random + logging.info("Using user agent: {}".format(random_mobile_ua)) + + context = browser.new_context(user_agent=random_mobile_ua) + context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") + page = context.new_page() + + api_requests = {} + + try: + page.goto(url, timeout=5000) + time.sleep(1) + page.reload() + with page.expect_response("**/wap/v2/product/detail**") as response: + api_requests = response.value.json() + except playwright._impl._errors.TimeoutError: + logging.info("Timeout occurred. Retrying.....") + continue # Retry without closing the browser + finally: + browser.close() + + return api_requests + except Exception as e: + logging.error(f"An error occurred: {str(e)}") + logging.info("Retrying...") + + return None -with sync_playwright() as p: - browser = p.chromium.launch(headless=False, args=[ - "--disable-dev-shm-usage", - "--disable-blink-features=AutomationControlled", - "--disable-component-extensions-with-background-pages" - ]) - ua = UserAgent(platforms='mobile') - random_mobile_ua = ua.random - logging.info("using user agent: {}".format(random_mobile_ua)) - - context = browser.new_context(user_agent=random_mobile_ua) - context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") - page = context.new_page() - - try: - - page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html", - timeout=5000) - with page.expect_response("**/wap/v2/product/detail**") as response: - api_requests = response.value.json() - except playwright._impl._errors.TimeoutError: - logging.info("Timeout occurred. Retrying.....") - page.reload() - with page.expect_response("**/wap/v2/product/detail**") as response: - api_requests = response.value.json() - - - browser.close() - - print(api_requests) +print(get_raw_product_data("https://hasaki.vn/san-pham/mat-na-naruko-y-di-nhan-do-duong-sang-da-25ml-moi-92613.html")) \ No newline at end of file diff --git a/hasaki_crawler_engine/test3.py b/hasaki_crawler_engine/test3.py new file mode 100644 index 0000000..e0c5ab1 --- /dev/null +++ b/hasaki_crawler_engine/test3.py @@ -0,0 +1,34 @@ +import asyncio +from playwright.async_api import async_playwright + +async def capture_api_response(url): + async with async_playwright() as p: + browser = await p.chromium.launch() + context = await browser.new_context(user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1") + page = await context.new_page() + + async def capture_and_retry(): + response = None + retry_count = 0 + while not response and retry_count < 3: # Retry up to 3 times + try: + await page.goto(url) + response = await page.expect_response(lambda resp: "wap/v2/product/detail" in resp.url) + if not response: + print(f"No API response received. Retrying...") + retry_count += 1 + await asyncio.sleep(5) # Retry after 5 seconds + except Exception as e: + print(f"Error occurred: {e}") + retry_count += 1 + await asyncio.sleep(5) # Retry after 5 seconds + + if response: + print(f"API response captured: {await response.text()}") + # Handle the API response here + else: + print("No API response received after multiple attempts.") + + await capture_and_retry() + +asyncio.run(capture_api_response("https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html"))