added Hasaki crawler

Shariar Imtiaz 2024-04-01 12:05:11 +04:00
parent 1e7dcaa894
commit 085cd08947
6 changed files with 15 additions and 145 deletions

View File

@@ -46,7 +46,7 @@ class HasakiCategories:
            logging.info(e)

    def __del__(self):
        print("Closing connection.....")
        logging.info("Closing connection.....")
        self.conn.close()
@@ -85,42 +85,42 @@ class HasakiCategories:
    def crawl_and_track(self, parent, url_to_visit):
        self.master_category.append((0,"0", parent, url_to_visit))
        print(self.master_category)
        logging.info(self.master_category)

        cats = self.crawl_categories(parent, url_to_visit)
        time.sleep(10)

        if cats:
            for cat in cats:
                self.master_category.append((1,)+(cat))
                print((1,)+(cat))
                logging.info((1,)+(cat))

                sub_cats1 = self.crawl_categories(cat[1], cat[2])
                time.sleep(3)

                if sub_cats1:
                    for sub_cat1 in sub_cats1:
                        self.master_category.append((2,) + (sub_cat1))
                        print((2,) + (sub_cat1))
                        logging.info((2,) + (sub_cat1))

                        sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
                        time.sleep(3)

                        if sub_cats2:
                            for sub_cat2 in sub_cats2:
                                self.master_category.append((3,) + (sub_cat2))
                                print((3,) + (sub_cat2))
                                logging.info((3,) + (sub_cat2))

                                sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
                                time.sleep(3)

                                if sub_cats3:
                                    for sub_cat3 in sub_cats3:
                                        self.master_category.append((4,) + (sub_cat3))
                                        print((4,) + (sub_cat3))
                                        logging.info((4,) + (sub_cat3))

                                        sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
                                        time.sleep(3)

                                        if sub_cats4:
                                            for sub_cat4 in sub_cats4:
                                                self.master_category.append((4,) + (sub_cat4))
                                                print((5,) + (sub_cat4))
                                                logging.info((5,) + (sub_cat4))

    def crawl_categories(self, parent, url_to_visit):
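Editor's note: the hunk above repeats the same append/log/sleep pattern at every category depth, and the deepest branch still appends a (4,) depth tag while its log line prints (5,). A minimal recursive sketch of the same traversal, written as a standalone helper rather than the class method; the crawl_categories callable and the (parent_code, name, url) tuple shape are assumptions read off the diff, not the repository's API:

    import logging
    import time
    from typing import Callable, List, Tuple

    def crawl_tree(crawl_categories: Callable[[str, str], list],
                   master_category: List[Tuple],
                   parent: str, url: str,
                   depth: int = 1, max_depth: int = 5) -> None:
        # Walk one category level, tag each row with its depth, then recurse.
        cats = crawl_categories(parent, url)
        time.sleep(3)  # polite delay between requests, as in the original loops
        if not cats or depth > max_depth:
            return
        for cat in cats:
            row = (depth,) + tuple(cat)
            master_category.append(row)
            logging.info(row)
            # cat[1] is assumed to be the category name and cat[2] its URL,
            # mirroring the cat[1]/cat[2] arguments used in the diff.
            crawl_tree(crawl_categories, master_category,
                       cat[1], cat[2], depth + 1, max_depth)

This keeps the depth tag and the traversal logic in one place instead of five nested copies.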

View File

@@ -39,7 +39,7 @@ class HasakiCategoryProducts:
        self.display.start()

    def __del__(self):
        print("Closing connection.....")
        logging.info("Closing connection.....")
        self.conn.close()

    def start_processing(self):
@@ -80,7 +80,7 @@ class HasakiCategoryProducts:
                for element in top_search_element:
                    url = element.query_selector(".top_big_search").query_selector('a').get_attribute('href').strip()
                    print(url)
                    logging.info(url)

                browser.close()
@@ -192,7 +192,7 @@ class HasakiCategoryProducts:
                        logging.info("Product already present. skipping.....")
                except Exception as e:
                    print(e)
                    logging.info(e)

                item_count += 1
@@ -202,7 +202,7 @@ class HasakiCategoryProducts:
                browser.close()
        except Exception as e:
            print(e)
            logging.info(e)
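Editor's note: the print-to-logging.info swap in these hunks only produces output if the logging module is configured at startup; the root logger defaults to WARNING, so bare logging.info calls are otherwise dropped. A minimal configuration sketch; the file name, level, and format are illustrative assumptions, not taken from the repository:

    import logging

    # Illustrative setup so the INFO-level calls above are actually emitted;
    # the log file name and format here are hypothetical.
    logging.basicConfig(
        filename="hasaki_crawler.log",
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )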

View File

@@ -52,7 +52,7 @@ class HasakiProductInfo:
        self.display.start()

    def __del__(self):
        print("Closing connection.....")
        logging.info("Closing connection.....")
        self.conn.close()
@@ -96,7 +96,7 @@ class HasakiProductInfo:
            raw_data = self.get_raw_product_data(data[3])
            print(raw_data)
            logging.info(raw_data)

            if raw_data:
                self.product_info(data, raw_data)
@@ -174,7 +174,7 @@ class HasakiProductInfo:
            if request.response:
                if '/wap/v2/product/detail' in request.url:
                    encoding = request.response.headers.get('content-encoding')
                    # print(encoding)
                    # logging.info(encoding)
                    if encoding:
                        iteminfo = brotli.decompress(request.response.body)
                    else:
@@ -358,7 +358,7 @@ class HasakiProductInfo:
                'product_variant_stock': 'first'
            }).reset_index()

            #print(df_variant_merged.to_string())
            #logging.info(df_variant_merged.to_string())

            for index, row in df_variant_merged.iterrows():
                try:
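Editor's note: the /wap/v2/product/detail hunk above decompresses the intercepted response body with brotli only when a content-encoding header is present. A small self-contained sketch of that decode path; the function name and the json.loads step are illustrative, since the class parses the payload elsewhere:

    import json
    from typing import Optional

    import brotli

    def decode_detail_body(body: bytes, content_encoding: Optional[str]) -> dict:
        # Mirror the diff's behaviour: the detail endpoint may arrive
        # brotli-compressed, so decompress only when an encoding header is set.
        raw = brotli.decompress(body) if content_encoding else body
        return json.loads(raw)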

View File

@@ -1,51 +0,0 @@
import time
import logging

import playwright
from fake_useragent import UserAgent
from playwright.sync_api import sync_playwright


def get_raw_product_data(url):
    retries = 2
    for _ in range(retries):
        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(
                    headless=False,
                    args=[
                        "--disable-dev-shm-usage",
                        "--disable-blink-features=AutomationControlled",
                        "--disable-component-extensions-with-background-pages"
                    ]
                )

                ua = UserAgent()
                random_mobile_ua = ua.random
                logging.info("Using user agent: {}".format(random_mobile_ua))

                context = browser.new_context(user_agent=random_mobile_ua)
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()

                api_requests = {}
                try:
                    page.goto(url, timeout=5000)
                    time.sleep(1)
                    page.reload()
                    with page.expect_response("**/wap/v2/product/detail**") as response:
                        api_requests = response.value.json()
                except playwright._impl._errors.TimeoutError:
                    logging.info("Timeout occurred. Retrying.....")
                    continue  # Retry without closing the browser
                finally:
                    browser.close()

                return api_requests
        except Exception as e:
            logging.error(f"An error occurred: {str(e)}")
            logging.info("Retrying...")

    return None


print(get_raw_product_data("https://hasaki.vn/san-pham/mat-na-naruko-y-di-nhan-do-duong-sang-da-25ml-moi-92613.html"))
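Editor's note: the deleted helper above navigates and reloads first and only then enters expect_response, so it waits for a response that may already have been received. For reference, the conventional sync-Playwright pattern starts the wait before the navigation that fires the request; a minimal sketch under that assumption (the URL glob is taken from the deleted script, the function name is illustrative):

    from playwright.sync_api import sync_playwright

    def fetch_detail_json(url: str) -> dict:
        # Capture the product-detail API payload triggered by the page load itself.
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            # Start listening *before* the navigation that triggers the request.
            with page.expect_response("**/wap/v2/product/detail**") as response_info:
                page.goto(url)
            data = response_info.value.json()
            browser.close()
            return data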

View File

@@ -1,45 +0,0 @@
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from fake_useragent import UserAgent
import brotli
import json


def get_raw_product(url):
    ua = UserAgent(platforms='mobile')
    random_mobile_ua = ua.random

    op = webdriver.ChromeOptions()
    op.add_argument(f"user-agent={random_mobile_ua}")
    op.add_experimental_option("useAutomationExtension", False)
    op.add_argument('--no-sandbox')
    op.add_argument('--disable-notifications')
    op.add_argument("--lang=en-GB")
    op.headless = False

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
    driver.get(url)

    iteminfo = ""
    for request in driver.requests:
        if request.response:
            if '/wap/v2/product/detail' in request.url:
                encoding = request.response.headers.get('content-encoding')
                # print(encoding)
                if encoding:
                    iteminfo = brotli.decompress(request.response.body)
                else:
                    iteminfo = request.response.body

    driver.quit()

    iteminfo_json = json.loads(iteminfo)
    return iteminfo_json


print(get_raw_product('https://hasaki.vn/san-pham/nuoc-tay-trang-bioderma-danh-cho-da-nhay-cam-500ml-9740.html'))

View File

@@ -1,34 +0,0 @@
import asyncio
from playwright.async_api import async_playwright


async def capture_api_response(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        context = await browser.new_context(user_agent="Mozilla/5.0 (iPhone; CPU iPhone OS 13_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1")
        page = await context.new_page()

        async def capture_and_retry():
            response = None
            retry_count = 0
            while not response and retry_count < 3:  # Retry up to 3 times
                try:
                    await page.goto(url)
                    response = await page.expect_response(lambda resp: "wap/v2/product/detail" in resp.url)
                    if not response:
                        print(f"No API response received. Retrying...")
                        retry_count += 1
                        await asyncio.sleep(5)  # Retry after 5 seconds
                except Exception as e:
                    print(f"Error occurred: {e}")
                    retry_count += 1
                    await asyncio.sleep(5)  # Retry after 5 seconds

            if response:
                print(f"API response captured: {await response.text()}")
                # Handle the API response here
            else:
                print("No API response received after multiple attempts.")

        await capture_and_retry()


asyncio.run(capture_api_response("https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html"))
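Editor's note: the deleted async helper awaits page.expect_response directly; in the async API it is normally used as an async context manager wrapping the navigation, with the response then awaited from .value. A minimal sketch of that pattern (the URL is the one from the deleted script; the function and variable names are illustrative):

    import asyncio
    from playwright.async_api import async_playwright

    async def capture_detail(url: str) -> str:
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            # The context manager starts listening before goto() fires the request.
            async with page.expect_response(
                lambda resp: "wap/v2/product/detail" in resp.url
            ) as response_info:
                await page.goto(url)
            response = await response_info.value
            body = await response.text()
            await browser.close()
            return body

    print(asyncio.run(capture_detail(
        "https://hasaki.vn/san-pham/son-duong-moi-khong-mau-dhc-ho-tro-giam-tham-moi-1-5g-6710.html")))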