added Hasaki crawler

This commit is contained in:
Shariar Imtiaz 2024-03-18 12:16:05 +04:00
parent 28f584f829
commit 67df30ff1d
1 changed files with 34 additions and 28 deletions

View File

@ -2,35 +2,41 @@ from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent from fake_useragent import UserAgent
import brotli
import json
def get_raw_product(url):
    """Open *url* in Chrome and return the captured product-detail JSON.

    The page is loaded through Selenium Wire with a random mobile user agent.
    Every captured request whose URL contains ``/wap/v2/product/detail`` and
    that has a response is decoded and parsed; if several match, the last one
    wins.

    Args:
        url: Address of the product page to load.

    Returns:
        The parsed JSON payload of the product-detail response, or ``None``
        when no matching response was captured.

    Raises:
        ValueError: If the response body cannot be decoded or is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # Imported locally: the helper ships with selenium-wire, which this module
    # already depends on, and it understands gzip/deflate/br rather than
    # assuming every encoded body is Brotli.
    from seleniumwire.utils import decode

    ua = UserAgent(platforms='mobile')
    random_mobile_ua = ua.random

    op = webdriver.ChromeOptions()
    op.add_argument(f"user-agent={random_mobile_ua}")
    op.add_experimental_option("useAutomationExtension", False)
    op.add_argument('--no-sandbox')
    op.add_argument('--disable-notifications')
    op.add_argument("--lang=en-GB")
    op.add_argument("--log-level=3")
    op.headless = False

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
    # Stays None when the page never calls the product-detail endpoint.
    iteminfo_json = None
    try:
        driver.get(url)
        for request in driver.requests:
            response = request.response
            if not response or '/wap/v2/product/detail' not in request.url:
                continue
            # Selenium Wire exposes the raw bytes; decode per the advertised encoding.
            encoding = response.headers.get('content-encoding') or 'identity'
            iteminfo = decode(response.body, encoding)
            iteminfo_json = json.loads(iteminfo)
    finally:
        # Always shut the browser down, even if loading or parsing failed.
        driver.quit()
    return iteminfo_json
# NOTE(review): this call sits at module top level, so a browser session is
# started whenever the file is executed or imported, and the returned data is
# discarded. Looks like a manual smoke test - confirm whether it should live
# behind an `if __name__ == "__main__":` guard.
get_raw_product('https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html')