added Hasaki crawler

This commit is contained in:
Shariar Imtiaz 2024-03-18 12:12:28 +04:00
parent 71f946c29e
commit 8ee5ccd632
3 changed files with 49 additions and 13 deletions

View File

@ -2,7 +2,6 @@ import hashlib
import logging import logging
import string import string
import re import re
import playwright import playwright
import psycopg2 import psycopg2
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
@ -13,6 +12,11 @@ from Util import translate_text_to_english
from fake_useragent import UserAgent from fake_useragent import UserAgent
import time import time
import random import random
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import brotli
import json
class HasakiProductInfo: class HasakiProductInfo:
def __init__(self, config): def __init__(self, config):
@ -88,7 +92,7 @@ class HasakiProductInfo:
self.seo_info(raw_data) self.seo_info(raw_data)
def get_raw_product_data(self, url): def get_raw_product_data(self, url):
retries = 2 retries = 1
for _ in range(retries): for _ in range(retries):
try: try:
with sync_playwright() as p: with sync_playwright() as p:
@ -129,7 +133,42 @@ class HasakiProductInfo:
logging.error(f"An error occurred: {str(e)}") logging.error(f"An error occurred: {str(e)}")
logging.info("Retrying...") logging.info("Retrying...")
return None api_requests = self.get_raw_product_data_selenium(url)
return api_requests
def get_raw_product_data_selenium(self, url):
    """Load ``url`` in Chrome and return the captured product-detail payload.

    The page is opened through a selenium-wire driver using a random mobile
    user agent. Every request the browser made is then inspected, and the
    response of the request whose URL contains ``/wap/v2/product/detail``
    is decompressed (if needed) and parsed as JSON.

    Args:
        url: Product page URL to open in the browser.

    Returns:
        The parsed JSON body of the product-detail response, or ``None``
        when the browser made no such request. If several requests match,
        the last one wins.

    Raises:
        Whatever the driver, the decompressor or ``json.loads`` raises; the
        browser is always shut down before the exception propagates.
    """
    # Local stdlib imports so this method does not depend on the file's
    # top-level import block for the extra decoders.
    import gzip
    import zlib

    ua = UserAgent(platforms='mobile')
    random_mobile_ua = ua.random
    logging.info("using user agent: %s", random_mobile_ua)

    op = webdriver.ChromeOptions()
    op.add_argument(f"user-agent={random_mobile_ua}")
    op.add_experimental_option("useAutomationExtension", False)
    op.add_argument('--no-sandbox')
    op.add_argument('--disable-notifications')
    op.add_argument("--lang=en-GB")
    op.headless = False

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
    iteminfo_json = None
    try:
        driver.get(url)
        for request in driver.requests:
            response = request.response
            if not response or '/wap/v2/product/detail' not in request.url:
                continue

            # Pick the decoder from the header value instead of assuming
            # that any content-encoding means brotli.
            encoding = (response.headers.get('content-encoding') or '').strip().lower()
            iteminfo = response.body
            if encoding == 'br':
                iteminfo = brotli.decompress(iteminfo)
            elif encoding == 'gzip':
                iteminfo = gzip.decompress(iteminfo)
            elif encoding == 'deflate':
                iteminfo = zlib.decompress(iteminfo)

            iteminfo_json = json.loads(iteminfo)

        if iteminfo_json is None:
            logging.error("No product detail response captured for %s", url)
        return iteminfo_json
    finally:
        # Always release the browser, including on errors and when no
        # matching request was found.
        driver.quit()
def product_info(self, data, raw_data): def product_info(self, data, raw_data):

View File

@ -1,17 +1,18 @@
from seleniumwire import webdriver from seleniumwire import webdriver
import random
from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
ua = UserAgent(platforms='mobile')
random_mobile_ua = ua.random
# Set mobile emulation options
mobile_emulation = {
"deviceName": "iPhone X"
}
op = webdriver.ChromeOptions() op = webdriver.ChromeOptions()
# hight = str(random.randint(640,1280)) # hight = str(random.randint(640,1280))
# width = str(random.randint(1024,1920)) # width = str(random.randint(1024,1920))
# op.add_argument("window-size="+width+","+hight+"") # op.add_argument("window-size="+width+","+hight+"")
op.add_argument(f"user-agent={random_mobile_ua}")
op.add_experimental_option("useAutomationExtension", False) op.add_experimental_option("useAutomationExtension", False)
op.add_argument('--no-sandbox') op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications') op.add_argument('--disable-notifications')
@ -19,13 +20,10 @@ op.add_argument("--lang=en-GB")
op.add_argument("--log-level=3") op.add_argument("--log-level=3")
op.headless = False op.headless = False
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op) driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
# Access a website # Access a website
driver.get('https://hasaki.vn') driver.get('https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html')
# Get all requests made by the browser # Get all requests made by the browser
for request in driver.requests: for request in driver.requests:

View File

@ -1,7 +1,6 @@
import hashlib import hashlib
import logging import logging
import sys import sys
from selenium.webdriver.remote.remote_connection import LOGGER from selenium.webdriver.remote.remote_connection import LOGGER
LOGGER.setLevel(logging.WARNING) LOGGER.setLevel(logging.WARNING)
import string import string