added Hasaki crawler
This commit is contained in:
parent
0fa1dc963d
commit
1fa170d8a7
|
@ -2,6 +2,8 @@ import hashlib
|
||||||
import logging
|
import logging
|
||||||
import string
|
import string
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
import playwright
|
||||||
import psycopg2
|
import psycopg2
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
from hasaki_db_writer import hasaki_db_writer
|
from hasaki_db_writer import hasaki_db_writer
|
||||||
|
@ -97,12 +99,18 @@ class HasakiProductInfo:
|
||||||
context = browser.new_context(user_agent=random_mobile_ua)
|
context = browser.new_context(user_agent=random_mobile_ua)
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
|
|
||||||
page.goto(url)
|
api_requests = {}
|
||||||
page.reload()
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
page.goto(url, timeout=5000)
|
||||||
with page.expect_response("**/wap/v2/product/detail**") as response:
|
with page.expect_response("**/wap/v2/product/detail**") as response:
|
||||||
api_requests = response.value.json()
|
api_requests = response.value.json()
|
||||||
|
except playwright._impl._errors.TimeoutError:
|
||||||
|
logging.info("Timeout occurred. Retrying.....")
|
||||||
|
page.reload()
|
||||||
|
with page.expect_response("**/wap/v2/product/detail**") as response:
|
||||||
|
api_requests = response.value.json()
|
||||||
|
finally:
|
||||||
browser.close()
|
browser.close()
|
||||||
|
|
||||||
return api_requests
|
return api_requests
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
|
import playwright
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
from fake_useragent import UserAgent
|
from fake_useragent import UserAgent
|
||||||
import logging
|
import logging
|
||||||
|
@ -14,10 +14,15 @@ with sync_playwright() as p:
|
||||||
context = browser.new_context(user_agent=random_mobile_ua)
|
context = browser.new_context(user_agent=random_mobile_ua)
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
|
|
||||||
page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html")
|
try:
|
||||||
|
|
||||||
|
page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html",
|
||||||
|
timeout=5000)
|
||||||
|
with page.expect_response("**/wap/v2/product/detail**") as response:
|
||||||
|
api_requests = response.value.json()
|
||||||
|
except playwright._impl._errors.TimeoutError:
|
||||||
|
logging.info("Timeout occurred. Retrying.....")
|
||||||
page.reload()
|
page.reload()
|
||||||
|
|
||||||
with page.expect_response("**/wap/v2/product/detail**") as response:
|
with page.expect_response("**/wap/v2/product/detail**") as response:
|
||||||
api_requests = response.value.json()
|
api_requests = response.value.json()
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue