added Hasaki crawler
parent 1e7dcaa894
commit 085cd08947
@@ -46,7 +46,7 @@ class HasakiCategories:
             logging.info(e)
 
     def __del__(self):
-        print("Closing connection.....")
+        logging.info("Closing connection.....")
         self.conn.close()
 
 
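Note: swapping print for logging.info throughout these classes assumes the logging module is configured once at the entry point; a minimal sketch of such a setup (the level and format here are assumptions, not part of this commit):

import logging

# Hypothetical one-time setup in the crawler's entry point; without it the
# root logger defaults to WARNING and these logging.info calls are dropped.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)

logging.info("Closing connection.....")  # now emitted with a timestamp and level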
@@ -85,42 +85,42 @@ class HasakiCategories:
     def crawl_and_track(self, parent, url_to_visit):
         self.master_category.append((0,"0", parent, url_to_visit))
 
-        print(self.master_category)
+        logging.info(self.master_category)
 
         cats = self.crawl_categories(parent, url_to_visit)
         time.sleep(10)
         if cats:
             for cat in cats:
                 self.master_category.append((1,)+(cat))
-                print((1,)+(cat))
+                logging.info((1,)+(cat))
 
                 sub_cats1 = self.crawl_categories(cat[1], cat[2])
                 time.sleep(3)
                 if sub_cats1:
                     for sub_cat1 in sub_cats1:
                         self.master_category.append((2,) + (sub_cat1))
-                        print((2,) + (sub_cat1))
+                        logging.info((2,) + (sub_cat1))
 
                         sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
                         time.sleep(3)
                         if sub_cats2:
                             for sub_cat2 in sub_cats2:
                                 self.master_category.append((3,) + (sub_cat2))
-                                print((3,) + (sub_cat2))
+                                logging.info((3,) + (sub_cat2))
 
                                 sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
                                 time.sleep(3)
                                 if sub_cats3:
                                     for sub_cat3 in sub_cats3:
                                         self.master_category.append((4,) + (sub_cat3))
-                                        print((4,) + (sub_cat3))
+                                        logging.info((4,) + (sub_cat3))
 
                                         sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
                                         time.sleep(3)
                                         if sub_cats4:
                                             for sub_cat4 in sub_cats4:
                                                 self.master_category.append((4,) + (sub_cat4))
-                                                print((5,) + (sub_cat4))
+                                                logging.info((5,) + (sub_cat4))
 
     def crawl_categories(self, parent, url_to_visit):
 
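The crawl_and_track hunk repeats the same block for five category levels; a minimal breadth-first sketch of the same traversal (illustrative only, assuming crawl_categories(parent, url) returns tuples whose second and third fields are the next parent and URL, as the indexing above implies):

import logging
import time

def crawl_tree(crawler, parent, url_to_visit, max_depth=5):
    # Seed the master list exactly as crawl_and_track does.
    crawler.master_category.append((0, "0", parent, url_to_visit))
    queue = [(1, parent, url_to_visit)]
    while queue:
        depth, cur_parent, cur_url = queue.pop(0)
        for cat in crawler.crawl_categories(cur_parent, cur_url) or []:
            crawler.master_category.append((depth,) + cat)
            logging.info((depth,) + cat)
            if depth < max_depth:
                queue.append((depth + 1, cat[1], cat[2]))
        time.sleep(3)  # keep the polite delay between category pages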
@@ -39,7 +39,7 @@ class HasakiCategoryProducts:
         self.display.start()
 
     def __del__(self):
-        print("Closing connection.....")
+        logging.info("Closing connection.....")
         self.conn.close()
 
     def start_processing(self):
@@ -80,7 +80,7 @@ class HasakiCategoryProducts:
                 for element in top_search_element:
                     url = element.query_selector(".top_big_search").query_selector('a').get_attribute('href').strip()
 
-                    print(url)
+                    logging.info(url)
                 browser.close()
 
 
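The chained query_selector calls above raise AttributeError when either selector misses; a defensive variant of that line (sketch only, using the same selectors and the element handle from the loop):

link = element.query_selector(".top_big_search a")
if link:
    href = link.get_attribute("href")
    if href:
        logging.info(href.strip())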
@@ -192,7 +192,7 @@ class HasakiCategoryProducts:
                             logging.info("Product already present. skipping.....")
 
                     except Exception as e:
-                        print(e)
+                        logging.info(e)
 
                     item_count += 1
 
@@ -202,7 +202,7 @@ class HasakiCategoryProducts:
 
             browser.close()
         except Exception as e:
-            print(e)
+            logging.info(e)
 
 
 
@@ -52,7 +52,7 @@ class HasakiProductInfo:
         self.display.start()
 
     def __del__(self):
-        print("Closing connection.....")
+        logging.info("Closing connection.....")
         self.conn.close()
 
 
@@ -96,7 +96,7 @@ class HasakiProductInfo:
 
             raw_data = self.get_raw_product_data(data[3])
 
-            print(raw_data)
+            logging.info(raw_data)
 
             if raw_data:
                 self.product_info(data, raw_data)
@@ -174,7 +174,7 @@ class HasakiProductInfo:
             if request.response:
                 if '/wap/v2/product/detail' in request.url:
                     encoding = request.response.headers.get('content-encoding')
-                    # print(encoding)
+                    # logging.info(encoding)
                     if encoding:
                         iteminfo = brotli.decompress(request.response.body)
                     else:
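This hunk decompresses the intercepted response only when a content-encoding header is present; a small sketch of that branch as a helper (decode_body is a hypothetical name, and checking for "br" specifically is an assumption; the diff only checks that the header exists):

import json
import brotli

def decode_body(response):
    # selenium-wire responses expose .body and a dict-like .headers
    encoding = response.headers.get("content-encoding")
    body = brotli.decompress(response.body) if encoding == "br" else response.body
    return json.loads(body)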
@@ -358,7 +358,7 @@ class HasakiProductInfo:
             'product_variant_stock': 'first'
         }).reset_index()
 
-        #print(df_variant_merged.to_string())
+        #logging.info(df_variant_merged.to_string())
 
         for index, row in df_variant_merged.iterrows():
             try:
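The aggregation above collapses duplicate variant rows before iterating; a toy reproduction of that groupby/agg/reset_index step (column names other than product_variant_stock and all sample values are made up for illustration):

import logging
import pandas as pd

df = pd.DataFrame({
    "product_variant_id": [1, 1, 2],
    "product_variant_price": [100, 100, 120],
    "product_variant_stock": [5, 5, 7],
})

df_variant_merged = df.groupby("product_variant_id").agg({
    "product_variant_price": "first",
    "product_variant_stock": "first",
}).reset_index()

for index, row in df_variant_merged.iterrows():
    logging.info(row.to_dict())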
@@ -1,51 +0,0 @@
-import time
-import logging
-
-import playwright
-from fake_useragent import UserAgent
-from playwright.sync_api import sync_playwright
-
-def get_raw_product_data(url):
-    retries = 2
-    for _ in range(retries):
-        try:
-            with sync_playwright() as p:
-                browser = p.chromium.launch(
-                    headless=False,
-                    args=[
-                        "--disable-dev-shm-usage",
-                        "--disable-blink-features=AutomationControlled",
-                        "--disable-component-extensions-with-background-pages"
-                    ]
-                )
-                ua = UserAgent()
-                random_mobile_ua = ua.random
-                logging.info("Using user agent: {}".format(random_mobile_ua))
-
-                context = browser.new_context(user_agent=random_mobile_ua)
-                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
-                page = context.new_page()
-
-                api_requests = {}
-
-                try:
-                    page.goto(url, timeout=5000)
-                    time.sleep(1)
-                    page.reload()
-                    with page.expect_response("**/wap/v2/product/detail**") as response:
-                        api_requests = response.value.json()
-                except playwright._impl._errors.TimeoutError:
-                    logging.info("Timeout occurred. Retrying.....")
-                    continue  # Retry without closing the browser
-                finally:
-                    browser.close()
-
-                return api_requests
-        except Exception as e:
-            logging.error(f"An error occurred: {str(e)}")
-            logging.info("Retrying...")
-
-    return None
-
-
-print(get_raw_product_data("https://hasaki.vn/san-pham/mat-na-naruko-y-di-nhan-do-duong-sang-da-25ml-moi-92613.html"))
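The removed script navigates first and only enters page.expect_response afterwards, so the detail XHR may already have fired by the time the waiter starts. A minimal sketch of the usual sync-API ordering (fetch_detail_json and headless=True are assumptions for illustration):

from playwright.sync_api import sync_playwright

def fetch_detail_json(url):
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            # Start waiting before triggering the navigation that causes the request.
            with page.expect_response("**/wap/v2/product/detail**") as response_info:
                page.goto(url)
            return response_info.value.json()
        finally:
            browser.close()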
@@ -1,45 +0,0 @@
-from seleniumwire import webdriver
-from selenium.webdriver.chrome.service import Service
-from webdriver_manager.chrome import ChromeDriverManager
-from fake_useragent import UserAgent
-import brotli
-import json
-
-
-def get_raw_product(url):
-    ua = UserAgent(platforms='mobile')
-    random_mobile_ua = ua.random
-
-    op = webdriver.ChromeOptions()
-    op.add_argument(f"user-agent={random_mobile_ua}")
-    op.add_experimental_option("useAutomationExtension", False)
-    op.add_argument('--no-sandbox')
-    op.add_argument('--disable-notifications')
-    op.add_argument("--lang=en-GB")
-    op.headless = False
-
-    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
-
-    driver.get(url)
-
-    iteminfo = ""
-
-    for request in driver.requests:
-        if request.response:
-            if '/wap/v2/product/detail' in request.url:
-                encoding = request.response.headers.get('content-encoding')
-                # print(encoding)
-                if encoding:
-                    iteminfo = brotli.decompress(request.response.body)
-                else:
-                    iteminfo = request.response.body
-
-
-
-    driver.quit()
-
-    iteminfo_json = json.loads(iteminfo)
-    return iteminfo_json
-
-
-print(get_raw_product('https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html'))
@@ -1,34 +0,0 @@
-import asyncio
-from playwright.async_api import async_playwright
-
-async def capture_api_response(url):
-    async with async_playwright() as p:
-        browser = await p.chromium.launch()
-        context = await browser.new_context(user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1")
-        page = await context.new_page()
-
-        async def capture_and_retry():
-            response = None
-            retry_count = 0
-            while not response and retry_count < 3:  # Retry up to 3 times
-                try:
-                    await page.goto(url)
-                    response = await page.expect_response(lambda resp: "wap/v2/product/detail" in resp.url)
-                    if not response:
-                        print(f"No API response received. Retrying...")
-                        retry_count += 1
-                        await asyncio.sleep(5)  # Retry after 5 seconds
-                except Exception as e:
-                    print(f"Error occurred: {e}")
-                    retry_count += 1
-                    await asyncio.sleep(5)  # Retry after 5 seconds
-
-            if response:
-                print(f"API response captured: {await response.text()}")
-                # Handle the API response here
-            else:
-                print("No API response received after multiple attempts.")
-
-        await capture_and_retry()
-
-asyncio.run(capture_api_response("https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html"))
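Similarly, the removed async script awaits page.expect_response directly, while the documented async-API usage is an async context manager whose .value is awaited after the triggering action. A short sketch of that pattern (fetch_detail_json is a hypothetical name):

import asyncio
from playwright.async_api import async_playwright

async def fetch_detail_json(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            # Register the waiter, then navigate so the matching response is captured.
            async with page.expect_response(
                lambda r: "wap/v2/product/detail" in r.url
            ) as response_info:
                await page.goto(url)
            response = await response_info.value
            return await response.json()
        finally:
            await browser.close()

asyncio.run(fetch_detail_json("https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html"))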