added Hasaki crawler
This commit is contained in:
parent 1e7dcaa894
commit 085cd08947
@@ -46,7 +46,7 @@ class HasakiCategories:
            logging.info(e)

    def __del__(self):
        print("Closing connection.....")
        logging.info("Closing connection.....")
        self.conn.close()

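Side note: relying on __del__ to close the database connection works, but Python does not guarantee when (or whether) finalizers run. A more deterministic alternative is the context-manager protocol; a hypothetical sketch, not part of this commit:

import logging

class HasakiCategories:
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # runs deterministically when the with-block exits
        logging.info("Closing connection.....")
        self.conn.close()

# usage sketch: with HasakiCategories(...) as crawler: crawler.crawl_and_track(...)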
@@ -85,42 +85,42 @@ class HasakiCategories:
    def crawl_and_track(self, parent, url_to_visit):
        self.master_category.append((0, "0", parent, url_to_visit))

        print(self.master_category)
        logging.info(self.master_category)

        cats = self.crawl_categories(parent, url_to_visit)
        time.sleep(10)
        if cats:
            for cat in cats:
                self.master_category.append((1,) + cat)
                print((1,) + cat)
                logging.info((1,) + cat)

                sub_cats1 = self.crawl_categories(cat[1], cat[2])
                time.sleep(3)
                if sub_cats1:
                    for sub_cat1 in sub_cats1:
                        self.master_category.append((2,) + sub_cat1)
                        print((2,) + sub_cat1)
                        logging.info((2,) + sub_cat1)

                        sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
                        time.sleep(3)
                        if sub_cats2:
                            for sub_cat2 in sub_cats2:
                                self.master_category.append((3,) + sub_cat2)
                                print((3,) + sub_cat2)
                                logging.info((3,) + sub_cat2)

                                sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
                                time.sleep(3)
                                if sub_cats3:
                                    for sub_cat3 in sub_cats3:
                                        self.master_category.append((4,) + sub_cat3)
                                        print((4,) + sub_cat3)
                                        logging.info((4,) + sub_cat3)

                                        sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
                                        time.sleep(3)
                                        if sub_cats4:
                                            for sub_cat4 in sub_cats4:
                                                self.master_category.append((5,) + sub_cat4)
                                                print((5,) + sub_cat4)
                                                logging.info((5,) + sub_cat4)

    def crawl_categories(self, parent, url_to_visit):
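The five hand-unrolled nested loops above walk the category tree to a fixed depth of five, appending a depth tag in front of each category tuple. The same traversal can be expressed as one depth-limited recursion; a minimal sketch under the same assumptions (crawl_categories returns tuples whose elements [1] and [2] are the parent name and URL; the helper name crawl_tree is hypothetical):

    def crawl_tree(self, parent, url, level=1, max_level=5):
        cats = self.crawl_categories(parent, url)
        time.sleep(3)
        for cat in cats or []:
            # tag each row with its depth, exactly as the unrolled version does
            self.master_category.append((level,) + cat)
            logging.info((level,) + cat)
            if level < max_level:
                self.crawl_tree(cat[1], cat[2], level + 1, max_level)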
@@ -39,7 +39,7 @@ class HasakiCategoryProducts:
        self.display.start()

    def __del__(self):
        print("Closing connection.....")
        logging.info("Closing connection.....")
        self.conn.close()

    def start_processing(self):
@@ -80,7 +80,7 @@ class HasakiCategoryProducts:
            for element in top_search_element:
                url = element.query_selector(".top_big_search").query_selector('a').get_attribute('href').strip()

                print(url)
                logging.info(url)
            browser.close()
@@ -192,7 +192,7 @@ class HasakiCategoryProducts:
                        logging.info("Product already present. skipping.....")

                except Exception as e:
                    print(e)
                    logging.info(e)

                item_count += 1
@@ -202,7 +202,7 @@ class HasakiCategoryProducts:

            browser.close()
        except Exception as e:
            print(e)
            logging.info(e)
@@ -52,7 +52,7 @@ class HasakiProductInfo:
        self.display.start()

    def __del__(self):
        print("Closing connection.....")
        logging.info("Closing connection.....")
        self.conn.close()
@@ -96,7 +96,7 @@ class HasakiProductInfo:

        raw_data = self.get_raw_product_data(data[3])

        print(raw_data)
        logging.info(raw_data)

        if raw_data:
            self.product_info(data, raw_data)
@@ -174,7 +174,7 @@ class HasakiProductInfo:
            if request.response:
                if '/wap/v2/product/detail' in request.url:
                    encoding = request.response.headers.get('content-encoding')
                    # print(encoding)
                    # logging.info(encoding)
                    if encoding:
                        iteminfo = brotli.decompress(request.response.body)
                    else:
@@ -358,7 +358,7 @@ class HasakiProductInfo:
            'product_variant_stock': 'first'
        }).reset_index()

        # print(df_variant_merged.to_string())
        # logging.info(df_variant_merged.to_string())

        for index, row in df_variant_merged.iterrows():
            try:
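The hunk above shows only the tail of a pandas aggregation. The assumed pattern is a groupby over the variant key columns followed by agg and reset_index; a self-contained sketch with made-up column names (only product_variant_stock actually appears in the diff):

import pandas as pd

df = pd.DataFrame({
    'product_id': [1, 1, 2],
    'variant_name': ['30ml', '30ml', '50ml'],
    'product_variant_stock': [5, 5, 9],
})
df_variant_merged = df.groupby(['product_id', 'variant_name']).agg({
    'product_variant_stock': 'first'  # keep one stock figure per variant
}).reset_index()
print(df_variant_merged)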
@@ -1,51 +0,0 @@
import time
import logging

import playwright
from fake_useragent import UserAgent
from playwright.sync_api import sync_playwright

def get_raw_product_data(url):
    retries = 2
    for _ in range(retries):
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(
                    headless=False,
                    args=[
                        "--disable-dev-shm-usage",
                        "--disable-blink-features=AutomationControlled",
                        "--disable-component-extensions-with-background-pages"
                    ]
                )
                ua = UserAgent()
                random_mobile_ua = ua.random
                logging.info("Using user agent: {}".format(random_mobile_ua))

                context = browser.new_context(user_agent=random_mobile_ua)
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()

                api_requests = {}

                try:
                    page.goto(url, timeout=5000)
                    time.sleep(1)
                    page.reload()
                    with page.expect_response("**/wap/v2/product/detail**") as response:
                        api_requests = response.value.json()
                except playwright._impl._errors.TimeoutError:
                    logging.info("Timeout occurred. Retrying.....")
                    continue  # Retry without closing the browser
                finally:
                    browser.close()

                return api_requests
        except Exception as e:
            logging.error(f"An error occurred: {str(e)}")
            logging.info("Retrying...")

    return None


print(get_raw_product_data("https://hasaki.vn/san-pham/mat-na-naruko-y-di-nhan-do-duong-sang-da-25ml-moi-92613.html"))
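One caveat on the deleted script above: Playwright's expect_response is meant to wrap the action that triggers the request, whereas here the goto and reload happen before the with-block, so the product-detail response may already have fired by the time the listener is armed. A minimal corrected sketch (same URL pattern, sync API; function name fetch_detail is hypothetical):

from playwright.sync_api import sync_playwright

def fetch_detail(url):
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto(url)
        # arm the listener first, then trigger the XHR with a reload
        with page.expect_response("**/wap/v2/product/detail**") as resp_info:
            page.reload()
        data = resp_info.value.json()
        browser.close()
        return data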
@@ -1,45 +0,0 @@
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
import brotli
import json


def get_raw_product(url):
    ua = UserAgent(platforms='mobile')
    random_mobile_ua = ua.random

    op = webdriver.ChromeOptions()
    op.add_argument(f"user-agent={random_mobile_ua}")
    op.add_experimental_option("useAutomationExtension", False)
    op.add_argument('--no-sandbox')
    op.add_argument('--disable-notifications')
    op.add_argument("--lang=en-GB")
    op.headless = False

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)

    driver.get(url)

    iteminfo = ""

    for request in driver.requests:
        if request.response:
            if '/wap/v2/product/detail' in request.url:
                encoding = request.response.headers.get('content-encoding')
                # print(encoding)
                if encoding:
                    iteminfo = brotli.decompress(request.response.body)
                else:
                    iteminfo = request.response.body

    driver.quit()

    iteminfo_json = json.loads(iteminfo)
    return iteminfo_json


print(get_raw_product('https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html'))
@@ -1,34 +0,0 @@
import asyncio
from playwright.async_api import async_playwright

async def capture_api_response(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1")
        page = await context.new_page()

        async def capture_and_retry():
            response = None
            retry_count = 0
            while not response and retry_count < 3:  # Retry up to 3 times
                try:
                    await page.goto(url)
                    response = await page.expect_response(lambda resp: "wap/v2/product/detail" in resp.url)
                    if not response:
                        print("No API response received. Retrying...")
                        retry_count += 1
                        await asyncio.sleep(5)  # Retry after 5 seconds
                except Exception as e:
                    print(f"Error occurred: {e}")
                    retry_count += 1
                    await asyncio.sleep(5)  # Retry after 5 seconds

            if response:
                print(f"API response captured: {await response.text()}")
                # Handle the API response here
            else:
                print("No API response received after multiple attempts.")

        await capture_and_retry()

asyncio.run(capture_api_response("https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html"))
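A note on the deleted async script above: in Playwright's async API, page.expect_response returns an async context manager, so awaiting it directly as written is not the documented usage. The documented pattern wraps the navigation that triggers the XHR in async with; a corrected sketch (same lambda predicate, function name capture is hypothetical):

import asyncio
from playwright.async_api import async_playwright

async def capture(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        # wrap the navigation that triggers the product-detail XHR
        async with page.expect_response(lambda r: "wap/v2/product/detail" in r.url) as resp_info:
            await page.goto(url)
        response = await resp_info.value
        print(await response.text())
        await browser.close()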