"""Capture the raw product-detail JSON that hasaki.vn loads for a product page.

Reconstructed from the post-image of the patch "added Hasaki crawler"
(commit 67df30ff1d7c6fbcfe8959e09f5e6db1b2e77a54, file
``hasaki_crawler_engine/test2.py``).
"""
import gzip
import json
import zlib

import brotli
from fake_useragent import UserAgent
from selenium.webdriver.chrome.service import Service
from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# URL fragment identifying the request whose response body is the product
# payload this module extracts.
PRODUCT_DETAIL_PATH = '/wap/v2/product/detail'


def _decode_body(body, content_encoding):
    """Return *body* with the transfer compression named by *content_encoding* undone.

    Args:
        body: Raw response bytes as captured by selenium-wire.
        content_encoding: Value of the ``content-encoding`` header, or ``None``
            / empty when the header is absent.

    Raises:
        ValueError: If the encoding is not one of identity, br, gzip, deflate.
    """
    encoding = (content_encoding or 'identity').strip().lower()
    if encoding == 'identity':
        return body
    if encoding == 'br':
        return brotli.decompress(body)
    if encoding in ('gzip', 'x-gzip'):
        return gzip.decompress(body)
    if encoding == 'deflate':
        return zlib.decompress(body)
    raise ValueError(f"unsupported content-encoding: {content_encoding!r}")


def get_raw_product(url, timeout=10):
    """Open *url* in Chrome and return the parsed product-detail JSON.

    A Chrome session with a random mobile user agent loads the page while
    selenium-wire records the traffic; the response of the request whose URL
    contains ``/wap/v2/product/detail`` is decompressed and parsed.

    Args:
        url: Product page URL to load.
        timeout: Seconds to wait for the product-detail request to show up
            after the page load returns.

    Returns:
        The decoded JSON document of the product-detail response. If several
        matching requests were captured, the last one is used.

    Raises:
        LookupError: If no captured product-detail request has a response.
        ValueError: If the response uses an unsupported content-encoding.
        Exception: Whatever selenium-wire's ``wait_for_request`` raises on
            timeout propagates unchanged.
    """
    options = webdriver.ChromeOptions()
    options.add_argument(f"user-agent={UserAgent(platforms='mobile').random}")
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-notifications')
    options.add_argument("--lang=en-GB")
    # No headless flag is passed, so Chrome runs with a visible window.

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )
    try:
        driver.get(url)
        # driver.get() can return before the detail XHR has completed, so
        # block until selenium-wire has actually seen it.
        driver.wait_for_request(PRODUCT_DETAIL_PATH, timeout=timeout)

        responses = [
            request.response
            for request in driver.requests
            if request.response and PRODUCT_DETAIL_PATH in request.url
        ]
        if not responses:
            raise LookupError(
                f"no response captured for {PRODUCT_DETAIL_PATH!r} while loading {url!r}"
            )

        response = responses[-1]
        payload = _decode_body(
            response.body,
            response.headers.get('content-encoding'),
        )
        return json.loads(payload)
    finally:
        # Always release the browser, including when waiting or parsing fails.
        driver.quit()


if __name__ == "__main__":
    get_raw_product('https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html')