added Hasaki crawler
This commit is contained in:
parent
ed49c2e464
commit
fe52f64d4b
|
@ -71,7 +71,7 @@ class HasakiProductInfo:
|
||||||
|
|
||||||
cnt += 1
|
cnt += 1
|
||||||
|
|
||||||
time.sleep(random.randint(7, 23))
|
#time.sleep(random.randint(7, 23))
|
||||||
|
|
||||||
|
|
||||||
def get_product_info(self, data):
|
def get_product_info(self, data):
|
||||||
|
@ -119,6 +119,7 @@ class HasakiProductInfo:
|
||||||
except playwright._impl._errors.TimeoutError:
|
except playwright._impl._errors.TimeoutError:
|
||||||
logging.info("Timeout occurred. Retrying.....")
|
logging.info("Timeout occurred. Retrying.....")
|
||||||
page.reload()
|
page.reload()
|
||||||
|
continue
|
||||||
with page.expect_response("**/wap/v2/product/detail**") as response:
|
with page.expect_response("**/wap/v2/product/detail**") as response:
|
||||||
api_requests = response.value.json()
|
api_requests = response.value.json()
|
||||||
finally:
|
finally:
|
||||||
|
|
|
@ -1,37 +1,51 @@
|
||||||
import playwright
|
import time
|
||||||
from playwright.sync_api import sync_playwright
|
|
||||||
from fake_useragent import UserAgent
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
import playwright
|
||||||
|
from fake_useragent import UserAgent
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
|
||||||
|
def get_raw_product_data(url, retries=2):
    """Fetch the raw product-detail API payload behind a product page.

    Each attempt launches a fresh Chromium instance, opens ``url``, reloads
    the page and captures the JSON body of the first response whose URL
    matches ``**/wap/v2/product/detail**``.

    Args:
        url: Product page URL to load.
        retries: Maximum number of attempts. Every attempt uses its own
            browser, which is always closed before the next one starts.

    Returns:
        The decoded JSON payload of the product-detail response, or ``None``
        if every attempt failed.
    """
    for _ in range(retries):
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(
                    headless=False,
                    args=[
                        "--disable-dev-shm-usage",
                        "--disable-blink-features=AutomationControlled",
                        "--disable-component-extensions-with-background-pages",
                    ],
                )
                # Everything after launch lives inside try/finally so the
                # browser is closed even if context/page setup fails.
                try:
                    user_agent = UserAgent().random
                    logging.info("Using user agent: %s", user_agent)

                    context = browser.new_context(user_agent=user_agent)
                    # Make navigator.webdriver read as undefined in every page.
                    context.add_init_script(
                        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
                    )
                    page = context.new_page()

                    page.goto(url, timeout=5000)
                    time.sleep(1)
                    # Arm the waiter BEFORE triggering the reload; otherwise
                    # the detail response can arrive before anyone listens
                    # and the wait times out.
                    with page.expect_response("**/wap/v2/product/detail**") as response_info:
                        page.reload()
                    return response_info.value.json()
                except playwright.sync_api.TimeoutError:
                    logging.info("Timeout occurred. Retrying.....")
                    # The finally clause below closes this browser first;
                    # the next attempt launches a new one.
                    continue
                finally:
                    browser.close()
        except Exception as e:
            # Best-effort crawler: log and move on to the next attempt.
            logging.error("An error occurred: %s", e)
            logging.info("Retrying...")

    return None
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level smoke run: fetches one hard-coded hasaki.vn product page and
# prints whatever get_raw_product_data returns (no __main__ guard is visible here).
print(get_raw_product_data("https://hasaki.vn/san-pham/mat-na-naruko-y-di-nhan-do-duong-sang-da-25ml-moi-92613.html"))
|
|
@ -0,0 +1,34 @@
|
||||||
|
import asyncio
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
|
async def capture_api_response(url):
    """Load ``url`` and capture the product-detail API response it triggers.

    Navigates a headless Chromium page (with a fixed iPhone Safari user
    agent) to ``url`` and waits for the first response whose URL contains
    ``wap/v2/product/detail``. Up to three attempts are made, pausing five
    seconds between failed attempts.

    Args:
        url: Product page URL to open.

    Returns:
        The captured response body as text, or ``None`` when no matching
        response was captured in any attempt. The outcome is also printed.
    """
    max_attempts = 3
    retry_delay_seconds = 5

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            context = await browser.new_context(user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1")
            page = await context.new_page()

            for attempt in range(1, max_attempts + 1):
                try:
                    # expect_response returns an async context manager, not
                    # an awaitable. Arm it first, navigate inside the block,
                    # then await the captured response.
                    async with page.expect_response(
                        lambda resp: "wap/v2/product/detail" in resp.url
                    ) as response_info:
                        await page.goto(url)
                    response = await response_info.value
                except Exception as e:
                    # Best-effort retry on any navigation/wait failure.
                    print(f"Error occurred: {e}")
                    if attempt < max_attempts:
                        await asyncio.sleep(retry_delay_seconds)
                    continue

                body = await response.text()
                print(f"API response captured: {body}")
                return body

            print("No API response received after multiple attempts.")
            return None
        finally:
            await browser.close()
|
||||||
|
|
||||||
|
# Module-level entry call: runs capture_api_response once on a new event loop
# against a hard-coded hasaki.vn product URL.
asyncio.run(capture_api_response("https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html"))
|
Loading…
Reference in New Issue