added Hasaki crawler

This commit is contained in:
Shariar Imtiaz 2024-03-15 11:47:28 +04:00
parent 0fa1dc963d
commit 1fa170d8a7
2 changed files with 25 additions and 12 deletions

View File

@ -2,6 +2,8 @@ import hashlib
import logging import logging
import string import string
import re import re
import playwright
import psycopg2 import psycopg2
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
from hasaki_db_writer import hasaki_db_writer from hasaki_db_writer import hasaki_db_writer
@ -97,12 +99,18 @@ class HasakiProductInfo:
context = browser.new_context(user_agent=random_mobile_ua) context = browser.new_context(user_agent=random_mobile_ua)
page = context.new_page() page = context.new_page()
page.goto(url) api_requests = {}
page.reload()
try:
page.goto(url, timeout=5000)
with page.expect_response("**/wap/v2/product/detail**") as response: with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json() api_requests = response.value.json()
except playwright._impl._errors.TimeoutError:
logging.info("Timeout occurred. Retrying.....")
page.reload()
with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json()
finally:
browser.close() browser.close()
return api_requests return api_requests

View File

@ -1,4 +1,4 @@
import playwright
from playwright.sync_api import sync_playwright from playwright.sync_api import sync_playwright
from fake_useragent import UserAgent from fake_useragent import UserAgent
import logging import logging
@ -14,10 +14,15 @@ with sync_playwright() as p:
context = browser.new_context(user_agent=random_mobile_ua) context = browser.new_context(user_agent=random_mobile_ua)
page = context.new_page() page = context.new_page()
page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html") try:
page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html",
timeout=5000)
with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json()
except playwright._impl._errors.TimeoutError:
logging.info("Timeout occurred. Retrying.....")
page.reload() page.reload()
with page.expect_response("**/wap/v2/product/detail**") as response: with page.expect_response("**/wap/v2/product/detail**") as response:
api_requests = response.value.json() api_requests = response.value.json()