# raena-crawler-engine/hasaki_crawler_engine/test2.py
import gzip
import json
import zlib

import brotli
from fake_useragent import UserAgent
from selenium.webdriver.chrome.service import Service
from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager


def _decode_body(body, encoding):
    """Return *body* with the given HTTP ``Content-Encoding`` undone.

    Args:
        body: Raw response body bytes as captured by the proxy.
        encoding: Value of the ``content-encoding`` header, or ``None``.

    Raises:
        ValueError: If *encoding* names a codec this helper does not handle.
    """
    codec = (encoding or "identity").strip().lower()
    if codec in ("", "identity"):
        return body
    if codec == "br":
        return brotli.decompress(body)
    if codec in ("gzip", "x-gzip"):
        return gzip.decompress(body)
    if codec == "deflate":
        return zlib.decompress(body)
    raise ValueError(f"unsupported content-encoding: {encoding!r}")


def get_raw_product(url):
    """Open *url* in Chrome and return the captured product-detail payload.

    A Chrome session is started through selenium-wire with a random mobile
    user agent, *url* is loaded, and the captured requests are scanned for
    one whose URL contains ``/wap/v2/product/detail``. The body of the last
    such request that has a response is decoded and parsed as JSON.

    Args:
        url: Page URL to load in the browser.

    Returns:
        The object produced by ``json.loads`` on the decoded response body.

    Raises:
        RuntimeError: If no captured request matching the product-detail
            path has a response.
        ValueError: If the response uses an unsupported content encoding.
    """
    random_mobile_ua = UserAgent(platforms='mobile').random

    op = webdriver.ChromeOptions()
    op.add_argument(f"user-agent={random_mobile_ua}")
    op.add_experimental_option("useAutomationExtension", False)
    op.add_argument('--no-sandbox')
    op.add_argument('--disable-notifications')
    op.add_argument("--lang=en-GB")
    op.headless = False

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)

    body = None
    encoding = None
    try:
        driver.get(url)

        # NOTE(review): requests still in flight when driver.get() returns
        # are not waited for here -- confirm the detail call is always done.
        for request in driver.requests:
            response = request.response
            if not response or '/wap/v2/product/detail' not in request.url:
                continue
            # Keep scanning so the last matching response wins, as before.
            body = response.body
            encoding = response.headers.get('content-encoding')
    finally:
        # Always shut the browser down, even if loading the page failed.
        driver.quit()

    if body is None:
        raise RuntimeError(
            f"no '/wap/v2/product/detail' response captured for {url!r}"
        )

    iteminfo = _decode_body(body, encoding)
    return json.loads(iteminfo)
# Module-level run against a hard-coded hasaki.vn product URL; the parsed
# result is not stored or printed.
get_raw_product(
    'https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html'
)