# raena-crawler-engine/hasaki_crawler_engine/test.py
# (Scratch script for fetching raw Hasaki product data via Playwright.)
# Last modified: 2024-04-03 08:36:43 +00:00
import logging
import time

import playwright
from fake_useragent import UserAgent
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
from playwright.sync_api import sync_playwright
from playwright_stealth import stealth_sync
def get_raw_product_data(url, retries=3):
    """Fetch the raw product-detail JSON for a Hasaki product page.

    Launches a (non-headless) Chromium instance with automation-detection
    countermeasures (stealth patches, randomized mobile user agent,
    ``navigator.webdriver`` spoofing), navigates to *url*, and captures the
    JSON body of the backend ``**/wap/v2/product/detail**`` API response
    triggered by the page load.

    Args:
        url: Product page URL on hasaki.vn.
        retries: Maximum number of full browser-session attempts before
            giving up. Defaults to 3 (the original code logged "Retrying..."
            but only ever made a single attempt).

    Returns:
        The decoded product-detail JSON payload as a dict, or an empty dict
        if every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(
                    headless=False,
                    args=[
                        "--disable-dev-shm-usage",
                        "--disable-blink-features=AutomationControlled",
                        "--disable-component-extensions-with-background-pages",
                    ],
                )
                try:
                    # Random mobile UA per attempt to look less like a bot.
                    ua = UserAgent(platforms='mobile')
                    random_mobile_ua = ua.random
                    logging.info("using user agent: {}".format(random_mobile_ua))

                    context = browser.new_context(user_agent=random_mobile_ua)
                    # Hide the webdriver flag that headless automation exposes.
                    context.add_init_script(
                        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
                    )
                    page = context.new_page()
                    stealth_sync(page)

                    try:
                        # expect_response must wrap the action that TRIGGERS the
                        # response: start listening, then navigate. (The original
                        # navigated first and read response.value inside an empty
                        # with-block, racing the already-fired response.)
                        with page.expect_response("**/wap/v2/product/detail**") as response_info:
                            page.goto(url, timeout=5000)
                        return response_info.value.json()
                    except PlaywrightTimeoutError:
                        # Public exception class, not playwright._impl._errors.
                        logging.info("Timeout occurred. Retrying.....")
                        with page.expect_response("**/wap/v2/product/detail**") as response_info:
                            page.reload()
                        return response_info.value.json()
                finally:
                    # Always release the browser, even on failure.
                    browser.close()
        except Exception as e:
            logging.error(f"An error occurred: {str(e)}")
            logging.info("Retrying...")
    # All attempts exhausted.
    return {}
if __name__ == "__main__":
    # The original `print(api_requests)` referenced a local variable of
    # get_raw_product_data and would raise NameError at module level;
    # the intent was to print the function's return value.
    result = get_raw_product_data(
        'https://hasaki.vn/san-pham/tinh-chat-chong-nang-sunplay-hieu-chinh-sac-da-50g-xanh-87613.html'
    )
    print(result)