added Hasaki crawler
This commit is contained in:
parent
a1290c8772
commit
6be8368482
|
@ -35,7 +35,7 @@ def send_mail(msg):
|
||||||
EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
|
EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
|
||||||
EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
|
EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
|
||||||
From = 'data_reporting@raenabeauty.com'
|
From = 'data_reporting@raenabeauty.com'
|
||||||
To = 'shariar@raenabeauty.com'
|
To = 'shariar@raenabeauty.com, data_reporting@raenabeauty.com'
|
||||||
# To = 'shariar@raenabeauty.com'
|
# To = 'shariar@raenabeauty.com'
|
||||||
|
|
||||||
html = f'''
|
html = f'''
|
||||||
|
|
|
@ -98,6 +98,7 @@ class HasakiProductInfo:
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
|
|
||||||
page.goto(url)
|
page.goto(url)
|
||||||
|
page.reload()
|
||||||
|
|
||||||
with page.expect_response("**/wap/v2/product/detail**") as response:
|
with page.expect_response("**/wap/v2/product/detail**") as response:
|
||||||
api_requests = response.value.json()
|
api_requests = response.value.json()
|
||||||
|
|
|
@ -1,63 +1,27 @@
|
||||||
import time
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from playwright.sync_api import sync_playwright
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# Launch the Playwright browser in mobile mode
|
from playwright.sync_api import sync_playwright
|
||||||
|
from fake_useragent import UserAgent
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
with sync_playwright() as p:
|
with sync_playwright() as p:
|
||||||
browser = p.chromium.launch(headless=False)
|
browser = p.chromium.launch(headless=False)
|
||||||
context = browser.new_context(user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
|
ua = UserAgent(platforms='mobile')
|
||||||
|
random_mobile_ua = ua.random
|
||||||
|
logging.info("using user agent: {}".format(random_mobile_ua))
|
||||||
|
|
||||||
|
context = browser.new_context(user_agent=random_mobile_ua)
|
||||||
page = context.new_page()
|
page = context.new_page()
|
||||||
|
|
||||||
page.goto("https://hasaki.vn/san-pham/nuoc-tay-trang-tuoi-mat-l-oreal-3-in-1-danh-cho-da-dau-da-hon-hop-400ml-19325.html")
|
page.goto("https://hasaki.vn/san-pham/kem-duong-skin1004-lam-diu-da-chiet-xuat-rau-ma-75ml-89637.html")
|
||||||
page.wait_for_load_state('load')
|
|
||||||
#time.sleep(10)
|
|
||||||
|
|
||||||
# Capture the underlying API request URL
|
page.reload()
|
||||||
#api_requests = page.evaluate('''() => window.fetch('https://hasaki.vn/wap/v2/product/detail').then(response => response.json())''')
|
|
||||||
#print(api_requests)
|
|
||||||
|
|
||||||
with page.expect_response("**/wap/v2/product/detail**") as response:
|
with page.expect_response("**/wap/v2/product/detail**") as response:
|
||||||
data = response.value.json()
|
api_requests = response.value.json()
|
||||||
|
|
||||||
variant_items = data['attribute']['items']
|
|
||||||
|
|
||||||
df = pd.DataFrame({}, columns=['product_variant_name','rce_source_variant_id','product_variant_price','product_variant_stock','product_variant_sku'])
|
|
||||||
|
|
||||||
data_variant = {}
|
|
||||||
for variant in variant_items:
|
|
||||||
for item in variant['options']:
|
|
||||||
data_variant['product_variant_name'] = item['long_label']
|
|
||||||
for product in item['products']:
|
|
||||||
data_variant['rce_source_variant_id'] = product['id']
|
|
||||||
data_variant['rce_product_id'] = ""
|
|
||||||
data_variant['product_variant_price'] = product['price']
|
|
||||||
data_variant['product_variant_price_before_discount'] = ""
|
|
||||||
data_variant['product_variant_stock'] = product['quantity']
|
|
||||||
data_variant['product_variant_sku'] = product['sku']
|
|
||||||
|
|
||||||
#variants_arr.append(data_variant)
|
|
||||||
|
|
||||||
tmp = pd.DataFrame([[data_variant['product_variant_name'],data_variant['rce_source_variant_id'],data_variant['product_variant_price'],data_variant['product_variant_stock'],data_variant['product_variant_sku']]],
|
|
||||||
columns=['product_variant_name', 'rce_source_variant_id', 'product_variant_price',
|
|
||||||
'product_variant_stock', 'product_variant_sku'])
|
|
||||||
df = pd.concat([df, tmp])
|
|
||||||
|
|
||||||
print(data_variant)
|
|
||||||
|
|
||||||
df = df.sort_values(by=['product_variant_sku'])
|
|
||||||
print(df.to_string())
|
|
||||||
|
|
||||||
print("======================================")
|
|
||||||
|
|
||||||
merged_df = df.groupby('product_variant_sku').agg({
|
|
||||||
'product_variant_name': ' '.join,
|
|
||||||
'rce_source_variant_id': 'first',
|
|
||||||
'product_variant_price': 'first',
|
|
||||||
'product_variant_stock': 'first'
|
|
||||||
}).reset_index()
|
|
||||||
|
|
||||||
print(merged_df.to_string())
|
|
||||||
|
|
||||||
# Close the browser
|
|
||||||
browser.close()
|
browser.close()
|
||||||
|
|
||||||
|
print(api_requests)
|
||||||
|
|
|
@ -0,0 +1,38 @@
|
||||||
|
from seleniumwire import webdriver
|
||||||
|
import random
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
|
||||||
|
# Set mobile emulation options
|
||||||
|
mobile_emulation = {
|
||||||
|
"deviceName": "iPhone X"
|
||||||
|
}
|
||||||
|
|
||||||
|
op = webdriver.ChromeOptions()
|
||||||
|
# hight = str(random.randint(640,1280))
|
||||||
|
# width = str(random.randint(1024,1920))
|
||||||
|
# op.add_argument("window-size="+width+","+hight+"")
|
||||||
|
op.add_experimental_option("useAutomationExtension", False)
|
||||||
|
op.add_argument('--no-sandbox')
|
||||||
|
op.add_argument('--disable-notifications')
|
||||||
|
op.add_argument("--lang=en-GB")
|
||||||
|
op.add_argument("--log-level=3")
|
||||||
|
op.headless = False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
|
||||||
|
|
||||||
|
# Access a website
|
||||||
|
driver.get('https://hasaki.vn')
|
||||||
|
|
||||||
|
# Get all requests made by the browser
|
||||||
|
for request in driver.requests:
|
||||||
|
if request.response:
|
||||||
|
if '/wap/v2/product/detail' in request.url:
|
||||||
|
iteminfo = request.response.body
|
||||||
|
print(iteminfo)
|
||||||
|
|
||||||
|
# Quit the driver
|
||||||
|
driver.quit()
|
Loading…
Reference in New Issue