From 1fa170d8a717ac35f07e304710194a0b848aa121 Mon Sep 17 00:00:00 2001 From: "shariar@raenabeauty.com" Date: Fri, 15 Mar 2024 11:47:28 +0400 Subject: [PATCH] added Hasaki crawler --- hasaki_crawler_engine/hasaki_product_info.py | 20 ++++++++++++++------ hasaki_crawler_engine/test.py | 17 +++++++++++------ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/hasaki_crawler_engine/hasaki_product_info.py b/hasaki_crawler_engine/hasaki_product_info.py index ca643c2..026ff81 100644 --- a/hasaki_crawler_engine/hasaki_product_info.py +++ b/hasaki_crawler_engine/hasaki_product_info.py @@ -2,6 +2,8 @@ import hashlib import logging import string import re + +import playwright import psycopg2 from playwright.sync_api import sync_playwright from hasaki_db_writer import hasaki_db_writer @@ -97,13 +99,19 @@ class HasakiProductInfo: context = browser.new_context(user_agent=random_mobile_ua) page = context.new_page() - page.goto(url) - page.reload() + api_requests = {} - with page.expect_response("**/wap/v2/product/detail**") as response: - api_requests = response.value.json() - - browser.close() + try: + page.goto(url, timeout=5000) + with page.expect_response("**/wap/v2/product/detail**") as response: + api_requests = response.value.json() + except playwright._impl._errors.TimeoutError: + logging.info("Timeout occurred. Retrying.....") + page.reload() + with page.expect_response("**/wap/v2/product/detail**") as response: + api_requests = response.value.json() + finally: + browser.close() return api_requests except Exception as e: diff --git a/hasaki_crawler_engine/test.py b/hasaki_crawler_engine/test.py index d9ddd95..19eb9d2 100644 --- a/hasaki_crawler_engine/test.py +++ b/hasaki_crawler_engine/test.py @@ -1,4 +1,4 @@ - +import playwright from playwright.sync_api import sync_playwright from fake_useragent import UserAgent import logging @@ -14,12 +14,17 @@ with sync_playwright() as p: context = browser.new_context(user_agent=random_mobile_ua) page = context.new_page() - page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html") + try: - page.reload() - - with page.expect_response("**/wap/v2/product/detail**") as response: - api_requests = response.value.json() + page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html", + timeout=5000) + with page.expect_response("**/wap/v2/product/detail**") as response: + api_requests = response.value.json() + except playwright._impl._errors.TimeoutError: + logging.info("Timeout occurred. Retrying.....") + page.reload() + with page.expect_response("**/wap/v2/product/detail**") as response: + api_requests = response.value.json() browser.close()