raena-crawler-engine/hasaki_crawler_engine/test_selenium.py

48 lines
1.4 KiB
Python
Raw Normal View History

2024-04-03 08:36:43 +00:00
import logging
from fake_useragent import UserAgent
import brotli
import seleniumwire.undetected_chromedriver as uc
2024-04-03 09:42:04 +00:00
from selenium_stealth import stealth
2024-04-03 08:36:43 +00:00
import json
2024-04-03 08:41:01 +00:00
import time
2024-04-03 08:36:43 +00:00
def _decode_response_body(body, encoding):
    """Return *body* decompressed according to a Content-Encoding value.

    Handles ``br`` (via the file-level ``brotli`` import), ``gzip`` /
    ``x-gzip`` and ``deflate``. A missing, empty, ``identity`` or
    unrecognised encoding returns *body* unchanged.
    """
    # Local stdlib imports keep this block self-contained.
    import gzip
    import zlib

    normalized = (encoding or '').strip().lower()
    if normalized == 'br':
        return brotli.decompress(body)
    if normalized in ('gzip', 'x-gzip'):
        return gzip.decompress(body)
    if normalized == 'deflate':
        return zlib.decompress(body)
    return body


def get_raw_product_data_selenium(url, *, wait_seconds=100, chrome_version=122):
    """Open *url* in Chrome and capture the product-detail API response.

    A Chrome instance is started through selenium-wire's
    undetected-chromedriver wrapper with a random mobile user agent and
    image loading disabled. After the page is requested the function sleeps
    for *wait_seconds*, then scans the requests recorded by the driver for
    URLs containing ``/wap/v2/product/detail``. The body of the last such
    request that has a response is decompressed (if a ``content-encoding``
    header is present), parsed as JSON, printed and returned.

    Args:
        url: Page to load in the browser.
        wait_seconds: How long to sleep after ``driver.get`` before
            inspecting the captured requests.
        chrome_version: Value passed as ``version_main`` to ``uc.Chrome``.

    Returns:
        The parsed JSON payload of the captured response.

    Raises:
        json.JSONDecodeError: If no matching response was captured (the
            payload is then an empty string) or the body is not valid JSON.
    """
    ua = UserAgent(platforms='mobile')
    random_mobile_ua = ua.random
    logging.info("using user agent: %s", random_mobile_ua)

    op = uc.ChromeOptions()
    op.add_argument(f"user-agent={random_mobile_ua}")
    op.add_argument('--blink-settings=imagesEnabled=false')
    op.headless = False

    driver = uc.Chrome(version_main=chrome_version, options=op)
    iteminfo = ""
    try:
        stealth(driver)
        driver.get(url)
        # Fixed wait so the page has time to issue its background API calls.
        time.sleep(wait_seconds)
        for request in driver.requests:
            if not request.response:
                continue
            if '/wap/v2/product/detail' not in request.url:
                continue
            # No early exit: as before, the last matching response wins.
            encoding = request.response.headers.get('content-encoding')
            iteminfo = _decode_response_body(request.response.body, encoding)
    finally:
        # Always shut the browser down, even if loading or decoding failed.
        driver.quit()

    if not iteminfo:
        logging.error(
            "no '/wap/v2/product/detail' response captured for %s", url
        )
    iteminfo_json = json.loads(iteminfo)
    print(iteminfo_json)
    return iteminfo_json
# Ad-hoc manual run against a single Hasaki product page.
# NOTE(review): this call sits at module top level with no ``__main__``
# guard, so it also runs whenever the file is imported.
get_raw_product_data_selenium(
    url='https://hasaki.vn/san-pham/tinh-chat-chong-nang-sunplay-hieu-chinh-sac-da-50g-xanh-87613.html',
)