added Hasaki crawler

This commit is contained in:
Shariar Imtiaz 2024-03-17 12:01:57 +04:00
parent ed49c2e464
commit fe52f64d4b
3 changed files with 83 additions and 34 deletions

View File

@ -71,7 +71,7 @@ class HasakiProductInfo:
cnt += 1 cnt += 1
time.sleep(random.randint(7, 23)) #time.sleep(random.randint(7, 23))
def get_product_info(self, data): def get_product_info(self, data):
@ -119,6 +119,7 @@ class HasakiProductInfo:
except playwright._impl._errors.TimeoutError: except playwright._impl._errors.TimeoutError:
logging.info("Timeout occurred. Retrying.....") logging.info("Timeout occurred. Retrying.....")
page.reload() page.reload()
continue
with page.expect_response("**/wap/v2/product/detail**") as response: with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json() api_requests = response.value.json()
finally: finally:

View File

@ -1,37 +1,51 @@
import playwright import time
from playwright.sync_api import sync_playwright
from fake_useragent import UserAgent
import logging import logging
import playwright
from fake_useragent import UserAgent
from playwright.sync_api import sync_playwright
def get_raw_product_data(url):
with sync_playwright() as p: retries = 2
browser = p.chromium.launch(headless=False, args=[ for _ in range(retries):
try:
with sync_playwright() as p:
browser = p.chromium.launch(
headless=False,
args=[
"--disable-dev-shm-usage", "--disable-dev-shm-usage",
"--disable-blink-features=AutomationControlled", "--disable-blink-features=AutomationControlled",
"--disable-component-extensions-with-background-pages" "--disable-component-extensions-with-background-pages"
]) ]
ua = UserAgent(platforms='mobile') )
ua = UserAgent()
random_mobile_ua = ua.random random_mobile_ua = ua.random
logging.info("using user agent: {}".format(random_mobile_ua)) logging.info("Using user agent: {}".format(random_mobile_ua))
context = browser.new_context(user_agent=random_mobile_ua) context = browser.new_context(user_agent=random_mobile_ua)
context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
page = context.new_page() page = context.new_page()
try: api_requests = {}
page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html", try:
timeout=5000) page.goto(url, timeout=5000)
time.sleep(1)
page.reload()
with page.expect_response("**/wap/v2/product/detail**") as response: with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json() api_requests = response.value.json()
except playwright._impl._errors.TimeoutError: except playwright._impl._errors.TimeoutError:
logging.info("Timeout occurred. Retrying.....") logging.info("Timeout occurred. Retrying.....")
page.reload() continue # Retry without closing the browser
with page.expect_response("**/wap/v2/product/detail**") as response: finally:
api_requests = response.value.json()
browser.close() browser.close()
print(api_requests) return api_requests
except Exception as e:
logging.error(f"An error occurred: {str(e)}")
logging.info("Retrying...")
return None
print(get_raw_product_data("https://hasaki.vn/san-pham/mat-na-naruko-y-di-nhan-do-duong-sang-da-25ml-moi-92613.html"))

View File

@ -0,0 +1,34 @@
import asyncio
from playwright.async_api import async_playwright
async def capture_api_response(url):
    """Open *url* in Chromium and capture the product-detail API response.

    The page is loaded with a fixed iPhone Safari user agent. While the
    navigation runs, the function waits for a network response whose URL
    contains ``wap/v2/product/detail``. Up to three attempts are made, with
    a five-second pause between failed attempts.

    Args:
        url: Address of the product page to load.

    Returns:
        The body text of the captured API response, or ``None`` when no
        matching response was seen after all attempts.
    """
    max_attempts = 3
    retry_delay_seconds = 5

    async with async_playwright() as p:
        browser = await p.chromium.launch()
        try:
            context = await browser.new_context(user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1")
            page = await context.new_page()

            async def capture_and_retry():
                """Return the matching response, or None once attempts run out."""
                for attempt in range(1, max_attempts + 1):
                    try:
                        # expect_response() is an async context manager, not
                        # an awaitable. The wait has to be registered BEFORE
                        # the navigation that triggers the request, otherwise
                        # the response is already gone when we start
                        # listening. Leaving the block waits for the match
                        # and raises on timeout.
                        async with page.expect_response(
                            lambda resp: "wap/v2/product/detail" in resp.url
                        ) as response_info:
                            await page.goto(url)
                        return await response_info.value
                    except Exception as e:
                        # Best-effort crawler: report the failure and retry.
                        print(f"Error occurred: {e}")
                        if attempt < max_attempts:
                            # No point sleeping after the final attempt.
                            await asyncio.sleep(retry_delay_seconds)
                return None

            response = await capture_and_retry()
            if response is None:
                print("No API response received after multiple attempts.")
                return None

            body = await response.text()
            print(f"API response captured: {body}")
            return body
        finally:
            # Release the browser even when an attempt raised unexpectedly.
            await browser.close()
# Script entry point: runs capture_api_response once, at module level,
# against a hard-coded hasaki.vn product page URL.
asyncio.run(capture_api_response("https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html"))