diff --git a/hasaki_crawler_engine/hasaki_product_info.py b/hasaki_crawler_engine/hasaki_product_info.py index da2c278..9db5faf 100644 --- a/hasaki_crawler_engine/hasaki_product_info.py +++ b/hasaki_crawler_engine/hasaki_product_info.py @@ -2,7 +2,6 @@ import hashlib import logging import string import re - import playwright import psycopg2 from playwright.sync_api import sync_playwright @@ -13,6 +12,11 @@ from Util import translate_text_to_english from fake_useragent import UserAgent import time import random +from seleniumwire import webdriver +from selenium.webdriver.chrome.service import Service +from webdriver_manager.chrome import ChromeDriverManager +import brotli +import json class HasakiProductInfo: def __init__(self, config): @@ -88,7 +92,7 @@ class HasakiProductInfo: self.seo_info(raw_data) def get_raw_product_data(self, url): - retries = 2 + retries = 1 for _ in range(retries): try: with sync_playwright() as p: @@ -129,7 +133,42 @@ class HasakiProductInfo: logging.error(f"An error occurred: {str(e)}") logging.info("Retrying...") - return None + api_requests = self.get_raw_product_data_selenium(url) + + return api_requests + + def get_raw_product_data_selenium(self, url): + ua = UserAgent(platforms='mobile') + random_mobile_ua = ua.random + logging.info("using user agent: {}".format(random_mobile_ua)) + + op = webdriver.ChromeOptions() + op.add_argument(f"user-agent={random_mobile_ua}") + op.add_experimental_option("useAutomationExtension", False) + op.add_argument('--no-sandbox') + op.add_argument('--disable-notifications') + op.add_argument("--lang=en-GB") + op.headless = False + + driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op) + + driver.get(url) + + for request in driver.requests: + if request.response: + if '/wap/v2/product/detail' in request.url: + encoding = request.response.headers.get('content-encoding') + # print(encoding) + if encoding: + iteminfo = brotli.decompress(request.response.body) + else: + iteminfo = request.response.body + + iteminfo_json = json.loads(iteminfo) + + driver.quit() + + return iteminfo_json def product_info(self, data, raw_data): diff --git a/hasaki_crawler_engine/test2.py b/hasaki_crawler_engine/test2.py index 7ba1fe5..50c42ae 100644 --- a/hasaki_crawler_engine/test2.py +++ b/hasaki_crawler_engine/test2.py @@ -1,17 +1,18 @@ from seleniumwire import webdriver -import random from selenium.webdriver.chrome.service import Service from webdriver_manager.chrome import ChromeDriverManager +from fake_useragent import UserAgent + + +ua = UserAgent(platforms='mobile') +random_mobile_ua = ua.random -# Set mobile emulation options -mobile_emulation = { - "deviceName": "iPhone X" -} op = webdriver.ChromeOptions() # hight = str(random.randint(640,1280)) # width = str(random.randint(1024,1920)) # op.add_argument("window-size="+width+","+hight+"") +op.add_argument(f"user-agent={random_mobile_ua}") op.add_experimental_option("useAutomationExtension", False) op.add_argument('--no-sandbox') op.add_argument('--disable-notifications') @@ -19,13 +20,10 @@ op.add_argument("--lang=en-GB") op.add_argument("--log-level=3") op.headless = False - - - driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op) # Access a website -driver.get('https://hasaki.vn') +driver.get('https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html') # Get all requests made by the browser for request in driver.requests: diff --git a/shopee_crawler_engine/shopee_products.py b/shopee_crawler_engine/shopee_products.py index 382d730..576f7fb 100755 --- a/shopee_crawler_engine/shopee_products.py +++ b/shopee_crawler_engine/shopee_products.py @@ -1,7 +1,6 @@ import hashlib import logging import sys - from selenium.webdriver.remote.remote_connection import LOGGER LOGGER.setLevel(logging.WARNING) import string