From 82375773851709cdb02db0d0a1a3134f59bcd5f2 Mon Sep 17 00:00:00 2001
From: "shariar@raenabeauty.com"
Date: Wed, 3 Apr 2024 12:36:43 +0400
Subject: [PATCH] added Hasaki crawler

---
 hasaki_crawler_engine/test.py          | 75 ++++++++++++++++++++++++++
 hasaki_crawler_engine/test2.py         | 36 +++++++++++++
 hasaki_crawler_engine/test_selenium.py | 74 ++++++++++++++++++++++++++
 3 files changed, 185 insertions(+)
 create mode 100644 hasaki_crawler_engine/test.py
 create mode 100644 hasaki_crawler_engine/test2.py
 create mode 100644 hasaki_crawler_engine/test_selenium.py

Note: the blob ids on the index lines below predate the edits to the
file bodies and were not recomputed.

diff --git a/hasaki_crawler_engine/test.py b/hasaki_crawler_engine/test.py
new file mode 100644
index 0000000..be4b03e
--- /dev/null
+++ b/hasaki_crawler_engine/test.py
@@ -0,0 +1,75 @@
+"""Scratch script: capture Hasaki's product-detail API response with Playwright."""
+import logging
+from playwright.sync_api import sync_playwright
+from fake_useragent import UserAgent
+import time
+import playwright
+from playwright_stealth import stealth_sync
+
+# Glob matched against response URLs to pick out the product-detail API call.
+PRODUCT_DETAIL_PATTERN = "**/wap/v2/product/detail**"
+
+
+def _reload_and_capture(page):
+    """Reload ``page`` and return the JSON body of the product-detail response."""
+    # Arm the listener before triggering the reload; a response that arrives
+    # before expect_response() is entered would otherwise never be seen.
+    with page.expect_response(PRODUCT_DETAIL_PATTERN) as response_info:
+        page.reload()
+    return response_info.value.json()
+
+
+def get_raw_product_data(url, retries=1, headless=False):
+    """Open ``url`` in Chromium and return the product-detail API payload.
+
+    Args:
+        url: Product page to load.
+        retries: Number of attempts, each with a fresh browser.
+        headless: Passed to ``chromium.launch``; defaults to a visible browser.
+
+    Returns:
+        The decoded JSON body of the response matching
+        ``PRODUCT_DETAIL_PATTERN``, or ``None`` when every attempt failed.
+    """
+    for _ in range(retries):
+        try:
+            with sync_playwright() as p:
+                browser = p.chromium.launch(
+                    headless=headless,
+                    args=[
+                        "--disable-dev-shm-usage",
+                        "--disable-blink-features=AutomationControlled",
+                        "--disable-component-extensions-with-background-pages"
+                    ]
+                )
+                try:
+                    ua = UserAgent(platforms='mobile')
+                    random_mobile_ua = ua.random
+                    logging.info("using user agent: {}".format(random_mobile_ua))
+
+                    context = browser.new_context(user_agent=random_mobile_ua)
+                    context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
+                    page = context.new_page()
+                    stealth_sync(page)
+
+                    try:
+                        page.goto(url, timeout=5000)
+                        time.sleep(1)
+                        return _reload_and_capture(page)
+                    except playwright.sync_api.TimeoutError:
+                        logging.info("Timeout occurred. Retrying.....")
+                        return _reload_and_capture(page)
+                finally:
+                    # Close the browser on every path, including setup failures.
+                    browser.close()
+        except Exception as e:
+            logging.error(f"An error occurred: {str(e)}")
+            logging.info("Retrying...")
+
+    # Every attempt failed; the errors were logged above.
+    return None
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    print(get_raw_product_data('https://hasaki.vn/san-pham/tinh-chat-chong-nang-sunplay-hieu-chinh-sac-da-50g-xanh-87613.html'))
\ No newline at end of file
diff --git a/hasaki_crawler_engine/test2.py b/hasaki_crawler_engine/test2.py
new file mode 100644
index 0000000..34fa4fa
--- /dev/null
+++ b/hasaki_crawler_engine/test2.py
@@ -0,0 +1,36 @@
+"""Scratch script: load a Hasaki product page and dump its HTML."""
+import asyncio
+from playwright.async_api import async_playwright
+
+PRODUCT_URL = 'https://hasaki.vn/san-pham/tinh-chat-chong-nang-sunplay-hieu-chinh-sac-da-50g-xanh-87613.html'
+
+
+async def bypass_cloudflare(url=PRODUCT_URL, wait_seconds=10):
+    """Load ``url`` in a visible Chromium window and return the page HTML.
+
+    Args:
+        url: Page to open.
+        wait_seconds: Pause after navigation before reading the page,
+            giving Cloudflare checks time to finish.
+
+    Returns:
+        The page's HTML as returned by ``page.content()``.
+    """
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=False)
+        try:
+            page = await browser.new_page()
+            await page.goto(url)
+
+            # Add any necessary code here to interact with the webpage
+
+            await asyncio.sleep(wait_seconds)  # Wait for some time for Cloudflare checks
+
+            return await page.content()
+        finally:
+            # Close the browser even when navigation or the read fails.
+            await browser.close()
+
+
+if __name__ == "__main__":
+    print(asyncio.run(bypass_cloudflare()))
diff --git a/hasaki_crawler_engine/test_selenium.py b/hasaki_crawler_engine/test_selenium.py
new file mode 100644
index 0000000..876f9b0
--- /dev/null
+++ b/hasaki_crawler_engine/test_selenium.py
@@ -0,0 +1,74 @@
+"""Scratch script: capture Hasaki's product-detail API response with selenium-wire."""
+from seleniumwire import webdriver
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+import logging
+from fake_useragent import UserAgent
+import brotli
+import seleniumwire.undetected_chromedriver as uc
+import json
+import gzip
+import zlib
+
+
+def _decode_body(body, encoding):
+    """Return ``body`` decompressed according to its Content-Encoding value."""
+    encoding = (encoding or 'identity').strip().lower()
+    if encoding == 'br':
+        return brotli.decompress(body)
+    if encoding == 'gzip':
+        return gzip.decompress(body)
+    if encoding == 'deflate':
+        return zlib.decompress(body)
+    # Identity or an encoding not handled here: hand the bytes back untouched.
+    return body
+
+
+def get_raw_product_data_selenium(url):
+    """Load ``url`` in Chrome and return the product-detail API payload.
+
+    Args:
+        url: Product page to load.
+
+    Returns:
+        The decoded JSON body of the last captured response whose request URL
+        contains ``/wap/v2/product/detail``, or ``None`` if none was captured.
+    """
+    ua = UserAgent(platforms='mobile')
+    random_mobile_ua = ua.random
+    logging.info("using user agent: {}".format(random_mobile_ua))
+
+    op = uc.ChromeOptions()
+    op.add_argument(f"user-agent={random_mobile_ua}")
+    op.add_experimental_option("useAutomationExtension", False)
+    op.add_argument('--no-sandbox')
+    op.add_argument('--disable-notifications')
+    op.add_argument("--lang=en-GB")
+    op.headless = False
+
+    driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=op)
+
+    iteminfo = None
+    try:
+        driver.get(url)
+
+        # NOTE(review): only requests already captured when get() returns are
+        # inspected; confirm the detail call is not fired later by the page.
+        for request in driver.requests:
+            if request.response and '/wap/v2/product/detail' in request.url:
+                encoding = request.response.headers.get('content-encoding')
+                iteminfo = _decode_body(request.response.body, encoding)
+    finally:
+        # Quit the driver even when navigation or decoding raises.
+        driver.quit()
+
+    if not iteminfo:
+        # json.loads() would raise on an empty body; report the miss instead.
+        logging.warning("no product detail response captured for %s", url)
+        return None
+
+    return json.loads(iteminfo)
+
+
+if __name__ == "__main__":
+    print(get_raw_product_data_selenium('https://hasaki.vn/san-pham/tinh-chat-chong-nang-sunplay-hieu-chinh-sac-da-50g-xanh-87613.html'))
\ No newline at end of file