added Hasaki crawler

This commit is contained in:
Shariar Imtiaz 2024-03-15 11:47:28 +04:00
parent 0fa1dc963d
commit 1fa170d8a7
2 changed files with 25 additions and 12 deletions

View File

@ -2,6 +2,8 @@ import hashlib
import logging
import string
import re
import playwright
import psycopg2
from playwright.sync_api import sync_playwright
from hasaki_db_writer import hasaki_db_writer
@ -97,13 +99,19 @@ class HasakiProductInfo:
context = browser.new_context(user_agent=random_mobile_ua)
page = context.new_page()
page.goto(url)
page.reload()
api_requests = {}
with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json()
browser.close()
try:
page.goto(url, timeout=5000)
with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json()
except playwright._impl._errors.TimeoutError:
logging.info("Timeout occurred. Retrying.....")
page.reload()
with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json()
finally:
browser.close()
return api_requests
except Exception as e:

View File

@ -1,4 +1,4 @@
import playwright
from playwright.sync_api import sync_playwright
from fake_useragent import UserAgent
import logging
@ -14,12 +14,17 @@ with sync_playwright() as p:
context = browser.new_context(user_agent=random_mobile_ua)
page = context.new_page()
page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html")
try:
page.reload()
with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json()
page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html",
timeout=5000)
with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json()
except playwright._impl._errors.TimeoutError:
logging.info("Timeout occurred. Retrying.....")
page.reload()
with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json()
browser.close()