added Hasaki crawler
This commit is contained in:
parent
e98ad29800
commit
8237577385
|
@ -0,0 +1,53 @@
|
|||
import logging
|
||||
from playwright.sync_api import sync_playwright
|
||||
from fake_useragent import UserAgent
|
||||
import time
|
||||
import playwright
|
||||
from playwright_stealth import stealth_sync
|
||||
def _reload_and_capture(page, url_pattern="**/wap/v2/product/detail**"):
    """Reload *page* and return the JSON body of the matching API response.

    The response listener is armed *before* the reload is issued; arming it
    after ``page.reload()`` has returned can miss a response that already
    arrived while the page was loading.
    """
    with page.expect_response(url_pattern) as response_info:
        page.reload()
    return response_info.value.json()


def get_raw_product_data(url, retries=1):
    """Open *url* in Chromium and return the product-detail API payload.

    A non-headless Chromium is started with a random mobile user agent and
    the stealth patches applied, the page is loaded and reloaded, and the
    JSON body of the ``/wap/v2/product/detail`` response is captured.

    Args:
        url: Product page URL to open.
        retries: Number of full browser attempts before giving up.

    Returns:
        The decoded JSON payload on success, or ``None`` when every attempt
        failed (the last known payload, possibly ``{}``, is printed first).
    """
    # Bound up front so the trailing print cannot hit an unbound local when
    # an attempt fails before the payload was ever assigned.
    api_requests = {}

    for _ in range(retries):
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(
                    headless=False,
                    args=[
                        "--disable-dev-shm-usage",
                        "--disable-blink-features=AutomationControlled",
                        "--disable-component-extensions-with-background-pages",
                    ],
                )
                try:
                    ua = UserAgent(platforms='mobile')
                    random_mobile_ua = ua.random
                    logging.info("using user agent: {}".format(random_mobile_ua))

                    context = browser.new_context(user_agent=random_mobile_ua)
                    # Hide the automation flag before any page script runs.
                    context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                    page = context.new_page()
                    stealth_sync(page)

                    try:
                        page.goto(url, timeout=5000)
                        time.sleep(1)
                        api_requests = _reload_and_capture(page)
                    # Public alias of the timeout error; the submodule is
                    # already loaded by the ``playwright.sync_api`` import.
                    except playwright.sync_api.TimeoutError:
                        logging.info("Timeout occurred. Retrying.....")
                        api_requests = _reload_and_capture(page)
                finally:
                    # Close the browser even when setup or capture fails.
                    browser.close()

                return api_requests
        except Exception as e:
            logging.error(f"An error occurred: {str(e)}")
            logging.info("Retrying...")

    print(api_requests)
    return None
|
||||
|
||||
|
||||
# Script entry: run the crawler once against a sample Hasaki product page.
# NOTE: this call sits at module top level, so it also runs on import.
get_raw_product_data('https://hasaki.vn/san-pham/tinh-chat-chong-nang-sunplay-hieu-chinh-sac-da-50g-xanh-87613.html')
|
|
@ -0,0 +1,19 @@
|
|||
import asyncio
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
async def bypass_cloudflare(
    url='https://hasaki.vn/san-pham/tinh-chat-chong-nang-sunplay-hieu-chinh-sac-da-50g-xanh-87613.html',
    wait_seconds=10,
):
    """Open *url* in a visible Chromium window and print the page HTML.

    Args:
        url: Address of the page to load. Defaults to the sample Hasaki
            product page this script was written against.
        wait_seconds: How long to pause after navigation before reading the
            page, giving the Cloudflare checks time to run.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()
            await page.goto(url)

            # Add any necessary code here to interact with the webpage

            await asyncio.sleep(wait_seconds)  # Wait for some time for Cloudflare checks

            content = await page.content()
            print(content)
        finally:
            # Close the browser even if navigation or the read fails.
            await browser.close()
|
||||
|
||||
# Script entry: run the coroutine to completion on a new event loop.
# NOTE: this call sits at module top level, so it also runs on import.
asyncio.run(bypass_cloudflare())
|
|
@ -0,0 +1,45 @@
|
|||
from seleniumwire import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
import logging
|
||||
from fake_useragent import UserAgent
|
||||
import brotli
|
||||
import seleniumwire.undetected_chromedriver as uc
|
||||
import json
|
||||
|
||||
def _decode_response_body(body, encoding):
    """Return *body* decompressed according to its Content-Encoding value.

    Args:
        body: Raw response bytes as captured from the wire.
        encoding: Value of the ``content-encoding`` header, or ``None``.

    Raises:
        ValueError: If the encoding is not one of identity/br/gzip/deflate.
    """
    import gzip
    import zlib

    codec = (encoding or "").strip().lower()
    if codec in ("", "identity"):
        return body
    if codec == "br":
        return brotli.decompress(body)
    if codec in ("gzip", "x-gzip"):
        return gzip.decompress(body)
    if codec == "deflate":
        # Servers send either zlib-wrapped or raw deflate streams.
        try:
            return zlib.decompress(body)
        except zlib.error:
            return zlib.decompress(body, -zlib.MAX_WBITS)
    raise ValueError(f"unsupported content-encoding: {encoding!r}")


def get_raw_product_data_selenium(url):
    """Load *url* with selenium-wire and return the product-detail payload.

    A Chrome session with a random mobile user agent opens the page, the
    captured requests are scanned for ``/wap/v2/product/detail``, and the
    body of the last matching response is decoded and parsed as JSON.

    Args:
        url: Product page URL to open.

    Returns:
        The parsed JSON payload (also printed).

    Raises:
        json.JSONDecodeError: If no matching response was captured or its
            body is not valid JSON.
    """
    ua = UserAgent(platforms='mobile')
    random_mobile_ua = ua.random
    logging.info("using user agent: {}".format(random_mobile_ua))

    op = uc.ChromeOptions()
    op.add_argument(f"user-agent={random_mobile_ua}")
    op.add_experimental_option("useAutomationExtension", False)
    op.add_argument('--no-sandbox')
    op.add_argument('--disable-notifications')
    op.add_argument("--lang=en-GB")
    op.headless = False

    driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=op)

    iteminfo = ""
    try:
        driver.get(url)

        for request in driver.requests:
            response = request.response
            if not response or '/wap/v2/product/detail' not in request.url:
                continue
            # No early exit: the last matching response wins, as before.
            iteminfo = _decode_response_body(
                response.body,
                response.headers.get('content-encoding'),
            )
    finally:
        # Shut the browser down even when loading or decoding fails.
        driver.quit()

    if not iteminfo:
        # json.loads below still raises; this makes the cause visible.
        logging.error("no /wap/v2/product/detail response was captured for %s", url)

    iteminfo_json = json.loads(iteminfo)
    print(iteminfo_json)
    return iteminfo_json
|
||||
|
||||
|
||||
# Script entry: run the selenium-wire crawler once against a sample Hasaki product page.
# NOTE: this call sits at module top level, so it also runs on import.
get_raw_product_data_selenium('https://hasaki.vn/san-pham/tinh-chat-chong-nang-sunplay-hieu-chinh-sac-da-50g-xanh-87613.html')
|
Loading…
Reference in New Issue