added Hasaki crawler

Shariar Imtiaz 2024-04-01 12:05:11 +04:00
parent 1e7dcaa894
commit 085cd08947
6 changed files with 15 additions and 145 deletions

View File

@@ -46,7 +46,7 @@ class HasakiCategories:
            logging.info(e)

    def __del__(self):
        print("Closing connection.....")
        logging.info("Closing connection.....")
        self.conn.close()
@@ -85,42 +85,42 @@ class HasakiCategories:
    def crawl_and_track(self, parent, url_to_visit):
        self.master_category.append((0,"0", parent, url_to_visit))
        print(self.master_category)
        logging.info(self.master_category)

        cats = self.crawl_categories(parent, url_to_visit)
        time.sleep(10)

        if cats:
            for cat in cats:
                self.master_category.append((1,)+(cat))
                print((1,)+(cat))
                logging.info((1,)+(cat))

                sub_cats1 = self.crawl_categories(cat[1], cat[2])
                time.sleep(3)

                if sub_cats1:
                    for sub_cat1 in sub_cats1:
                        self.master_category.append((2,) + (sub_cat1))
                        print((2,) + (sub_cat1))
                        logging.info((2,) + (sub_cat1))

                        sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
                        time.sleep(3)

                        if sub_cats2:
                            for sub_cat2 in sub_cats2:
                                self.master_category.append((3,) + (sub_cat2))
                                print((3,) + (sub_cat2))
                                logging.info((3,) + (sub_cat2))

                                sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
                                time.sleep(3)

                                if sub_cats3:
                                    for sub_cat3 in sub_cats3:
                                        self.master_category.append((4,) + (sub_cat3))
                                        print((4,) + (sub_cat3))
                                        logging.info((4,) + (sub_cat3))

                                        sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
                                        time.sleep(3)

                                        if sub_cats4:
                                            for sub_cat4 in sub_cats4:
                                                self.master_category.append((4,) + (sub_cat4))
                                                print((5,) + (sub_cat4))
                                                logging.info((5,) + (sub_cat4))

    def crawl_categories(self, parent, url_to_visit):
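Editor's note: the hunk above repeats the same append/log/sleep pattern at every category depth, and the deepest branch still appends a (4,) depth tag while its log line prints (5,). A minimal recursive sketch of the same traversal, written as a standalone helper rather than the class method; the crawl_categories callable and the (parent_code, name, url) tuple shape are assumptions read off the diff, not the repository's API:

    import logging
    import time
    from typing import Callable, List, Tuple

    def crawl_tree(crawl_categories: Callable[[str, str], list],
                   master_category: List[Tuple],
                   parent: str, url: str,
                   depth: int = 1, max_depth: int = 5) -> None:
        # Walk one category level, tag each row with its depth, then recurse.
        cats = crawl_categories(parent, url)
        time.sleep(3)  # polite delay between requests, as in the original loops
        if not cats or depth > max_depth:
            return
        for cat in cats:
            row = (depth,) + tuple(cat)
            master_category.append(row)
            logging.info(row)
            # cat[1] is assumed to be the category name and cat[2] its URL,
            # mirroring the cat[1]/cat[2] arguments used in the diff.
            crawl_tree(crawl_categories, master_category,
                       cat[1], cat[2], depth + 1, max_depth)

This keeps the depth tag and the traversal logic in one place instead of five nested copies.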

View File

@@ -39,7 +39,7 @@ class HasakiCategoryProducts:
        self.display.start()

    def __del__(self):
        print("Closing connection.....")
        logging.info("Closing connection.....")
        self.conn.close()

    def start_processing(self):
@@ -80,7 +80,7 @@ class HasakiCategoryProducts:
                for element in top_search_element:
                    url = element.query_selector(".top_big_search").query_selector('a').get_attribute('href').strip()
                    print(url)
                    logging.info(url)

                browser.close()
@@ -192,7 +192,7 @@ class HasakiCategoryProducts:
                        logging.info("Product already present. skipping.....")
                except Exception as e:
                    print(e)
                    logging.info(e)

                item_count += 1
@@ -202,7 +202,7 @@ class HasakiCategoryProducts:
                browser.close()
        except Exception as e:
            print(e)
            logging.info(e)
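Editor's note: the print-to-logging.info swap in these hunks only produces output if the logging module is configured at startup; the root logger defaults to WARNING, so bare logging.info calls are otherwise dropped. A minimal configuration sketch; the file name, level, and format are illustrative assumptions, not taken from the repository:

    import logging

    # Illustrative setup so the INFO-level calls above are actually emitted;
    # the log file name and format here are hypothetical.
    logging.basicConfig(
        filename="hasaki_crawler.log",
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )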

View File

@@ -52,7 +52,7 @@ class HasakiProductInfo:
        self.display.start()

    def __del__(self):
        print("Closing connection.....")
        logging.info("Closing connection.....")
        self.conn.close()
@@ -96,7 +96,7 @@ class HasakiProductInfo:
            raw_data = self.get_raw_product_data(data[3])
            print(raw_data)
            logging.info(raw_data)

            if raw_data:
                self.product_info(data, raw_data)
@@ -174,7 +174,7 @@ class HasakiProductInfo:
            if request.response:
                if '/wap/v2/product/detail' in request.url:
                    encoding = request.response.headers.get('content-encoding')
                    # print(encoding)
                    # logging.info(encoding)
                    if encoding:
                        iteminfo = brotli.decompress(request.response.body)
                    else:
@@ -358,7 +358,7 @@ class HasakiProductInfo:
                'product_variant_stock': 'first'
            }).reset_index()

            #print(df_variant_merged.to_string())
            #logging.info(df_variant_merged.to_string())

            for index, row in df_variant_merged.iterrows():
                try:
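Editor's note: the /wap/v2/product/detail hunk above decompresses the intercepted response body with brotli only when a content-encoding header is present. A small self-contained sketch of that decode path; the function name and the json.loads step are illustrative, since the class parses the payload elsewhere:

    import json
    from typing import Optional

    import brotli

    def decode_detail_body(body: bytes, content_encoding: Optional[str]) -> dict:
        # Mirror the diff's behaviour: the detail endpoint may arrive
        # brotli-compressed, so decompress only when an encoding header is set.
        raw = brotli.decompress(body) if content_encoding else body
        return json.loads(raw)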

View File

@@ -1,51 +0,0 @@
import time
import logging

import playwright
from fake_useragent import UserAgent
from playwright.sync_api import sync_playwright


def get_raw_product_data(url):
    retries = 2
    for _ in range(retries):
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(
                    headless=False,
                    args=[
                        "--disable-dev-shm-usage",
                        "--disable-blink-features=AutomationControlled",
                        "--disable-component-extensions-with-background-pages"
                    ]
                )

                ua = UserAgent()
                random_mobile_ua = ua.random
                logging.info("Using user agent: {}".format(random_mobile_ua))

                context = browser.new_context(user_agent=random_mobile_ua)
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()

                api_requests = {}
                try:
                    page.goto(url, timeout=5000)
                    time.sleep(1)
                    page.reload()
                    with page.expect_response("**/wap/v2/product/detail**") as response:
                        api_requests = response.value.json()
                except playwright._impl._errors.TimeoutError:
                    logging.info("Timeout occurred. Retrying.....")
                    continue  # Retry without closing the browser
                finally:
                    browser.close()

                return api_requests
        except Exception as e:
            logging.error(f"An error occurred: {str(e)}")
            logging.info("Retrying...")

    return None


print(get_raw_product_data("https://hasaki.vn/san-pham/mat-na-naruko-y-di-nhan-do-duong-sang-da-25ml-moi-92613.html"))
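Editor's note: the deleted helper above navigates and reloads first and only then enters expect_response, so it waits for a response that may already have been received. For reference, the conventional sync-Playwright pattern starts the wait before the navigation that fires the request; a minimal sketch under that assumption (the URL glob is taken from the deleted script, the function name is illustrative):

    from playwright.sync_api import sync_playwright

    def fetch_detail_json(url: str) -> dict:
        # Capture the product-detail API payload triggered by the page load itself.
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            # Start listening *before* the navigation that triggers the request.
            with page.expect_response("**/wap/v2/product/detail**") as response_info:
                page.goto(url)
            data = response_info.value.json()
            browser.close()
            return data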

View File

@@ -1,45 +0,0 @@
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
import brotli
import json


def get_raw_product(url):
    ua = UserAgent(platforms='mobile')
    random_mobile_ua = ua.random

    op = webdriver.ChromeOptions()
    op.add_argument(f"user-agent={random_mobile_ua}")
    op.add_experimental_option("useAutomationExtension", False)
    op.add_argument('--no-sandbox')
    op.add_argument('--disable-notifications')
    op.add_argument("--lang=en-GB")
    op.headless = False

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
    driver.get(url)

    iteminfo = ""
    for request in driver.requests:
        if request.response:
            if '/wap/v2/product/detail' in request.url:
                encoding = request.response.headers.get('content-encoding')
                # print(encoding)
                if encoding:
                    iteminfo = brotli.decompress(request.response.body)
                else:
                    iteminfo = request.response.body

    driver.quit()

    iteminfo_json = json.loads(iteminfo)
    return iteminfo_json


print(get_raw_product('https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html'))

View File

@@ -1,34 +0,0 @@
import asyncio
from playwright.async_api import async_playwright


async def capture_api_response(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1")
        page = await context.new_page()

        async def capture_and_retry():
            response = None
            retry_count = 0
            while not response and retry_count < 3:  # Retry up to 3 times
                try:
                    await page.goto(url)
                    response = await page.expect_response(lambda resp: "wap/v2/product/detail" in resp.url)
                    if not response:
                        print(f"No API response received. Retrying...")
                        retry_count += 1
                        await asyncio.sleep(5)  # Retry after 5 seconds
                except Exception as e:
                    print(f"Error occurred: {e}")
                    retry_count += 1
                    await asyncio.sleep(5)  # Retry after 5 seconds

            if response:
                print(f"API response captured: {await response.text()}")
                # Handle the API response here
            else:
                print("No API response received after multiple attempts.")

        await capture_and_retry()


asyncio.run(capture_api_response("https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html"))
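Editor's note: the deleted async helper awaits page.expect_response directly; in the async API it is normally used as an async context manager wrapping the navigation, with the response then awaited from .value. A minimal sketch of that pattern (the URL is the one from the deleted script; the function and variable names are illustrative):

    import asyncio
    from playwright.async_api import async_playwright

    async def capture_detail(url: str) -> str:
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            # The context manager starts listening before goto() fires the request.
            async with page.expect_response(
                lambda resp: "wap/v2/product/detail" in resp.url
            ) as response_info:
                await page.goto(url)
            response = await response_info.value
            body = await response.text()
            await browser.close()
            return body

    print(asyncio.run(capture_detail(
        "https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html")))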