added Hasaki crawler
This commit is contained in:
parent
71f946c29e
commit
8ee5ccd632
|
@ -2,7 +2,6 @@ import hashlib
|
||||||
import logging
|
import logging
|
||||||
import string
|
import string
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import playwright
|
import playwright
|
||||||
import psycopg2
|
import psycopg2
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
|
@ -13,6 +12,11 @@ from Util import translate_text_to_english
|
||||||
from fake_useragent import UserAgent
|
from fake_useragent import UserAgent
|
||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
|
from seleniumwire import webdriver
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
import brotli
|
||||||
|
import json
|
||||||
|
|
||||||
class HasakiProductInfo:
|
class HasakiProductInfo:
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
|
@ -88,7 +92,7 @@ class HasakiProductInfo:
|
||||||
self.seo_info(raw_data)
|
self.seo_info(raw_data)
|
||||||
|
|
||||||
def get_raw_product_data(self, url):
|
def get_raw_product_data(self, url):
|
||||||
retries = 2
|
retries = 1
|
||||||
for _ in range(retries):
|
for _ in range(retries):
|
||||||
try:
|
try:
|
||||||
with sync_playwright() as p:
|
with sync_playwright() as p:
|
||||||
|
@ -129,7 +133,42 @@ class HasakiProductInfo:
|
||||||
logging.error(f"An error occurred: {str(e)}")
|
logging.error(f"An error occurred: {str(e)}")
|
||||||
logging.info("Retrying...")
|
logging.info("Retrying...")
|
||||||
|
|
||||||
return None
|
api_requests = self.get_raw_product_data_selenium(url)
|
||||||
|
|
||||||
|
return api_requests
|
||||||
|
|
||||||
|
def get_raw_product_data_selenium(self, url):
    """Load *url* in Chrome (selenium-wire) and return the product-detail API JSON.

    The page is opened with a random mobile user agent, then the requests
    captured by selenium-wire are scanned for responses whose URL contains
    ``/wap/v2/product/detail``.  The body of the last such response is
    decompressed according to its ``content-encoding`` header and parsed
    as JSON.

    Args:
        url: Product page URL to open in the browser.

    Returns:
        The parsed JSON payload of the last matching response, or ``None``
        when no matching response was captured.

    Raises:
        Whatever the decompressor or ``json.loads`` raises for a malformed
        body; the browser is still shut down in that case.
    """
    # Local imports: only needed for the non-brotli encodings handled below.
    import gzip
    import zlib

    ua = UserAgent(platforms='mobile')
    random_mobile_ua = ua.random
    logging.info("using user agent: {}".format(random_mobile_ua))

    op = webdriver.ChromeOptions()
    op.add_argument(f"user-agent={random_mobile_ua}")
    op.add_experimental_option("useAutomationExtension", False)
    op.add_argument('--no-sandbox')
    op.add_argument('--disable-notifications')
    op.add_argument("--lang=en-GB")
    op.headless = False

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)

    # Stays None when the detail endpoint was never seen, so the caller gets
    # a clean "no data" result instead of an UnboundLocalError.
    iteminfo_json = None
    try:
        driver.get(url)

        # NOTE(review): driver.get() returns on page load; if the detail API
        # call fires later it will not be in driver.requests yet - confirm
        # whether an explicit wait is needed here.
        for request in driver.requests:
            response = request.response
            if not response:
                continue
            if '/wap/v2/product/detail' not in request.url:
                continue

            encoding = (response.headers.get('content-encoding') or '').strip().lower()
            body = response.body
            # Pick the decompressor that matches the header instead of
            # assuming every encoded body is brotli.
            if encoding == 'br':
                body = brotli.decompress(body)
            elif encoding == 'gzip':
                body = gzip.decompress(body)
            elif encoding == 'deflate':
                body = zlib.decompress(body)

            # Later matches overwrite earlier ones: the last response wins.
            iteminfo_json = json.loads(body)
    finally:
        # Always release the browser, even when loading or parsing failed.
        driver.quit()

    return iteminfo_json
|
||||||
|
|
||||||
def product_info(self, data, raw_data):
|
def product_info(self, data, raw_data):
|
||||||
|
|
||||||
|
|
|
@ -1,17 +1,18 @@
|
||||||
from seleniumwire import webdriver
|
from seleniumwire import webdriver
|
||||||
import random
|
|
||||||
from selenium.webdriver.chrome.service import Service
|
from selenium.webdriver.chrome.service import Service
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
from fake_useragent import UserAgent
|
||||||
|
|
||||||
|
|
||||||
|
ua = UserAgent(platforms='mobile')
|
||||||
|
random_mobile_ua = ua.random
|
||||||
|
|
||||||
# Set mobile emulation options
|
|
||||||
mobile_emulation = {
|
|
||||||
"deviceName": "iPhone X"
|
|
||||||
}
|
|
||||||
|
|
||||||
op = webdriver.ChromeOptions()
|
op = webdriver.ChromeOptions()
|
||||||
# hight = str(random.randint(640,1280))
|
# hight = str(random.randint(640,1280))
|
||||||
# width = str(random.randint(1024,1920))
|
# width = str(random.randint(1024,1920))
|
||||||
# op.add_argument("window-size="+width+","+hight+"")
|
# op.add_argument("window-size="+width+","+hight+"")
|
||||||
|
op.add_argument(f"user-agent={random_mobile_ua}")
|
||||||
op.add_experimental_option("useAutomationExtension", False)
|
op.add_experimental_option("useAutomationExtension", False)
|
||||||
op.add_argument('--no-sandbox')
|
op.add_argument('--no-sandbox')
|
||||||
op.add_argument('--disable-notifications')
|
op.add_argument('--disable-notifications')
|
||||||
|
@ -19,13 +20,10 @@ op.add_argument("--lang=en-GB")
|
||||||
op.add_argument("--log-level=3")
|
op.add_argument("--log-level=3")
|
||||||
op.headless = False
|
op.headless = False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
|
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
|
||||||
|
|
||||||
# Access a website
|
# Access a website
|
||||||
driver.get('https://hasaki.vn')
|
driver.get('https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html')
|
||||||
|
|
||||||
# Get all requests made by the browser
|
# Get all requests made by the browser
|
||||||
for request in driver.requests:
|
for request in driver.requests:
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from selenium.webdriver.remote.remote_connection import LOGGER
|
from selenium.webdriver.remote.remote_connection import LOGGER
|
||||||
LOGGER.setLevel(logging.WARNING)
|
LOGGER.setLevel(logging.WARNING)
|
||||||
import string
|
import string
|
||||||
|
|
Loading…
Reference in New Issue