From 6be836848252fce325cbf938e210cdf0921b8bdb Mon Sep 17 00:00:00 2001 From: "shariar@raenabeauty.com" Date: Fri, 15 Mar 2024 11:02:44 +0400 Subject: [PATCH] added Hasaki crawler --- hasaki_crawler_engine/hasaki_crawler.py | 2 +- hasaki_crawler_engine/hasaki_product_info.py | 1 + hasaki_crawler_engine/test.py | 68 +++++--------------- hasaki_crawler_engine/test2.py | 38 +++++++++++ 4 files changed, 56 insertions(+), 53 deletions(-) create mode 100644 hasaki_crawler_engine/test2.py diff --git a/hasaki_crawler_engine/hasaki_crawler.py b/hasaki_crawler_engine/hasaki_crawler.py index 4c00760..8fa98c3 100644 --- a/hasaki_crawler_engine/hasaki_crawler.py +++ b/hasaki_crawler_engine/hasaki_crawler.py @@ -35,7 +35,7 @@ def send_mail(msg): EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5" EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh" From = 'data_reporting@raenabeauty.com' - To = 'shariar@raenabeauty.com' + To = 'shariar@raenabeauty.com, data_reporting@raenabeauty.com' # To = 'shariar@raenabeauty.com' html = f''' diff --git a/hasaki_crawler_engine/hasaki_product_info.py b/hasaki_crawler_engine/hasaki_product_info.py index a4829ae..ca643c2 100644 --- a/hasaki_crawler_engine/hasaki_product_info.py +++ b/hasaki_crawler_engine/hasaki_product_info.py @@ -98,6 +98,7 @@ class HasakiProductInfo: page = context.new_page() page.goto(url) + page.reload() with page.expect_response("**/wap/v2/product/detail**") as response: api_requests = response.value.json() diff --git a/hasaki_crawler_engine/test.py b/hasaki_crawler_engine/test.py index 728385b..d9ddd95 100644 --- a/hasaki_crawler_engine/test.py +++ b/hasaki_crawler_engine/test.py @@ -1,63 +1,27 @@ -import time -from bs4 import BeautifulSoup -from playwright.sync_api import sync_playwright -import pandas as pd -# Launch the Playwright browser in mobile mode +from playwright.sync_api import sync_playwright +from fake_useragent import UserAgent +import logging + + + with sync_playwright() as p: browser = p.chromium.launch(headless=False) - context = browser.new_context(user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1") + ua = UserAgent(platforms='mobile') + random_mobile_ua = ua.random + logging.info("using user agent: {}".format(random_mobile_ua)) + + context = browser.new_context(user_agent=random_mobile_ua) page = context.new_page() - page.goto("https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi-mat-l-oreal-3-in-1-danh-cho-da-dau-da-hon-hop-400ml-19325.html") - page.wait_for_load_state('load') - #time.sleep(10) + page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html") - # Capture the underlying API request URL - #api_requests = page.evaluate('''() => window.fetch('https://hasaki.vn/wap/v2/product/detail').then(response => response.json())''') - #print(api_requests) + page.reload() with page.expect_response("**/wap/v2/product/detail**") as response: - data = response.value.json() + api_requests = response.value.json() - variant_items = data['attribute']['items'] - df = pd.DataFrame({}, columns=['product_variant_name','rce_source_variant_id','product_variant_price','product_variant_stock','product_variant_sku']) - - data_variant = {} - for variant in variant_items: - for item in variant['options']: - data_variant['product_variant_name'] = item['long_label'] - for product in item['products']: - data_variant['rce_source_variant_id'] = product['id'] - data_variant['rce_product_id'] = "" - data_variant['product_variant_price'] = product['price'] - data_variant['product_variant_price_before_discount'] = "" - data_variant['product_variant_stock'] = product['quantity'] - data_variant['product_variant_sku'] = product['sku'] - - #variants_arr.append(data_variant) - - tmp = pd.DataFrame([[data_variant['product_variant_name'],data_variant['rce_source_variant_id'],data_variant['product_variant_price'],data_variant['product_variant_stock'],data_variant['product_variant_sku']]], - columns=['product_variant_name', 'rce_source_variant_id', 'product_variant_price', - 'product_variant_stock', 'product_variant_sku']) - df = pd.concat([df, tmp]) - - print(data_variant) - - df = df.sort_values(by=['product_variant_sku']) - print(df.to_string()) - - print("======================================") - - merged_df = df.groupby('product_variant_sku').agg({ - 'product_variant_name': ' '.join, - 'rce_source_variant_id': 'first', - 'product_variant_price': 'first', - 'product_variant_stock': 'first' - }).reset_index() - - print(merged_df.to_string()) - - # Close the browser browser.close() + + print(api_requests) diff --git a/hasaki_crawler_engine/test2.py b/hasaki_crawler_engine/test2.py new file mode 100644 index 0000000..7ba1fe5 --- /dev/null +++ b/hasaki_crawler_engine/test2.py @@ -0,0 +1,38 @@ +from seleniumwire import webdriver +import random +from selenium.webdriver.chrome.service import Service +from webdriver_manager.chrome import ChromeDriverManager + +# Set mobile emulation options +mobile_emulation = { + "deviceName": "iPhone X" +} + +op = webdriver.ChromeOptions() +# hight = str(random.randint(640,1280)) +# width = str(random.randint(1024,1920)) +# op.add_argument("window-size="+width+","+hight+"") +op.add_experimental_option("useAutomationExtension", False) +op.add_argument('--no-sandbox') +op.add_argument('--disable-notifications') +op.add_argument("--lang=en-GB") +op.add_argument("--log-level=3") +op.headless = False + + + + +driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op) + +# Access a website +driver.get('https://hasaki.vn') + +# Get all requests made by the browser +for request in driver.requests: + if request.response: + if '/wap/v2/product/detail' in request.url: + iteminfo = request.response.body + print(iteminfo) + +# Quit the driver +driver.quit()