added Hasaki crawler

This commit is contained in: parent 45e6965679 · commit d0344122e2
@@ -58,6 +58,7 @@ def send_mail():
         smtp.send_message(msg)
     except Exception as e:
         logging.info("Error while sending mail: {}".format(e))


 def main():
     # start = datetime.now()
     # categories = amazon_categories(config)
@@ -1,83 +0,0 @@
-import hashlib
-import logging
-import sys
-import string
-import undetected_chromedriver as webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service
-import psycopg2
-import bs4
-from webdriver_manager.chrome import ChromeDriverManager
-import random
-from bs4 import BeautifulSoup
-import json
-import time
-import gzip
-import re
-import random
-from amazon_db_writer import amazon_db_writer
-
-import ssl
-ssl._create_default_https_context = ssl._create_unverified_context
-
-
-def reseller_info(store_url):
-
-    op = webdriver.ChromeOptions()
-    op.add_argument('--no-sandbox')
-    op.add_argument('--disable-notifications')
-    op.add_argument("--lang=en-GB")
-    #op.headless = True
-    driver = webdriver.Chrome(options=op)
-
-    driver.get(store_url)
-
-    driver.implicitly_wait(5)
-
-    try:
-        driver.get(store_url)
-        driver.implicitly_wait(5)
-
-        ##### reseller info
-
-        avg_rating = driver.find_element(By.CSS_SELECTOR, '#effective-timeperiod-rating-year-description.ratings-reviews').text
-
-        print(avg_rating)
-
-    except Exception as e:
-        print(e)
-
-config = {
-    "crawler_name": "raena_crawler_enginer_amazon",
-    "crawler_schema": "raena_spider_management",
-    "category_tab": "rce_category",
-    "tracker_tab": "crawler_tracker",
-    "product_tab": "rce_product",
-    "variant_tab": "rce_product_variant",
-    "brand_tab": "rce_brand",
-    "reseller_tab": "rce_reseller",
-    "reseller_store_tab": "rce_reseller_store",
-    "review_tab": "rce_ratings_reviews",
-    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
-    "review_producttags_tab": "rce_ratings_reviews_producttags",
-    "review_tags": "rce_tags",
-    "source_tab": "rce_source",
-    "product_per_category": "1000",
-    "source_category": "11043145",
-    "db_user": "postgres",
-    "db_pass": "postgres",
-    "database": "postgres",
-    "db_host": "localhost",
-    "db_port": "5444",
-    "crawler_main": "1",
-    "crawler_slave_no": ""
-}
-conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
-conn.autocommit = True
-cur = conn.cursor()
-db_writer = amazon_db_writer(config)
-
-
-reseller_info('https://www.amazon.ae/sp?ie=UTF8&seller=A3TFGX22P341AN&isAmazonFulfilled=0&asin=B09BR31PF9&ref_=olp_merch_name_1')
@@ -78,26 +78,33 @@ class HasakiCategories:
                 print((1,)+(cat))

                 sub_cats1 = self.crawl_categories(cat[1], cat[2])
-                time.sleep(10)
+                time.sleep(3)
                 if sub_cats1:
                     for sub_cat1 in sub_cats1:
                         self.master_category.append((2,) + (sub_cat1))
                         print((2,) + (sub_cat1))

                         sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
-                        time.sleep(10)
+                        time.sleep(3)
                         if sub_cats2:
                             for sub_cat2 in sub_cats2:
                                 self.master_category.append((3,) + (sub_cat2))
                                 print((3,) + (sub_cat2))

                                 sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
-                                time.sleep(10)
+                                time.sleep(3)
                                 if sub_cats3:
                                     for sub_cat3 in sub_cats3:
                                         self.master_category.append((4,) + (sub_cat3))
                                         print((4,) + (sub_cat3))
+
+                                        sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
+                                        time.sleep(3)
+                                        if sub_cats4:
+                                            for sub_cat4 in sub_cats4:
+                                                self.master_category.append((4,) + (sub_cat4))
+                                                print((5,) + (sub_cat4))

     def crawl_categories(self, parent, url_to_visit):

         with sync_playwright() as p:
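Note: the hunk above copies the same fetch/sleep/append pattern once per depth level, and the new fifth level appends with `(4,)` while printing `(5,)`. The repetition could be expressed once with a recursive helper; a minimal sketch, not part of the commit, assuming crawl_categories(parent, url) returns a list of tuples shaped like the `cat` tuples above:

    def walk_categories(self, cat, level=1, max_depth=5):
        # Record this category at its depth, mirroring (level,) + (cat) above.
        self.master_category.append((level,) + cat)
        print((level,) + cat)
        if level >= max_depth:
            return
        sub_cats = self.crawl_categories(cat[1], cat[2])
        time.sleep(3)
        for sub_cat in (sub_cats or []):
            self.walk_categories(sub_cat, level + 1, max_depth)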
@@ -55,6 +55,23 @@ class HasakiCategoryProducts:
         self.get_product_list(urls=pages, categoryId=category[0])

+
+    def find_top_search(self):
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=False)
+
+            page = browser.new_page()
+            page.goto("https://hasaki.vn/")
+
+            page.wait_for_load_state('load')
+
+            top_search_element = page.query_selector_all(".item_top_search")
+
+            for element in top_search_element:
+                url = element.query_selector(".top_big_search").query_selector('a').get_attribute('href').strip()
+
+                print(url)
+            browser.close()
+

     def get_pages(self, url):
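Note: Playwright's query_selector returns None when nothing matches, so the chained call in find_top_search raises AttributeError if a tile is missing the expected nodes. A guarded variant might look like this (sketch only, same selectors as the commit):

    for element in top_search_element:
        big = element.query_selector(".top_big_search")
        link = big.query_selector('a') if big else None
        href = link.get_attribute('href') if link else None
        if href:
            print(href.strip())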
@@ -64,7 +81,7 @@ class HasakiCategoryProducts:

         try:
             with sync_playwright() as p:
-                browser = p.chromium.launch(headless=True)
+                browser = p.chromium.launch(headless=False)

                 page = browser.new_page()
                 page.goto(url)
@@ -88,7 +105,7 @@ class HasakiCategoryProducts:

         try:
             with sync_playwright() as p:
-                browser = p.chromium.launch(headless=True)
+                browser = p.chromium.launch(headless=False)

                 page = browser.new_page()

@@ -109,6 +126,13 @@ class HasakiCategoryProducts:
                 for item_element in item_elements:
                     try:
                         product_section = "Base Product Page " + str(page_count)
+                        if url in ["https://hasaki.vn/danh-muc/chong-nang-da-mat-c11.html",
+                                   "https://hasaki.vn/danh-muc/trang-diem-moi-c24.html",
+                                   "https://hasaki.vn/danh-muc/sua-rua-mat-c19.html",
+                                   "https://hasaki.vn/danh-muc/kem-duong-dau-duong-c9.html"]:
+
+                            product_section = "Top Search - Base Product Page " + str(page_count)
+
                         product_name = translate_text_to_english(str(item_element.query_selector('.width_common.name_sp.space_bottom_5').text_content()).strip().replace("'",""))
                         product_url = str(item_element.query_selector('.v3_thumb_common_sp.relative').get_attribute('href')).strip()
                         product_brand = translate_text_to_english(str(item_element.query_selector('.width_common.txt_color_1.space_bottom_3').text_content()).strip().replace("'",""))
@@ -1,10 +1,12 @@
 import logging
 import json
 import time
+import smtplib

 from hasaki_categories import HasakiCategories
 from hasaki_category_products import HasakiCategoryProducts
 from hasaki_product_info import HasakiProductInfo
+from email.message import EmailMessage

 ##### Looger ######
 format = "%(asctime)s: %(message)s"
@@ -14,20 +16,61 @@ config = {}


 def main():
-    # hasaki_categories = HasakiCategories(config)
-    # hasaki_categories.start_processing()
-    #
-    # time.sleep(60)
-    #
-    # hasaki_category_products = HasakiCategoryProducts(config)
-    # hasaki_category_products.start_processing()
-    #
-    # time.sleep(60)
+    hasaki_categories = HasakiCategories(config)
+    hasaki_categories.start_processing()
+
+    time.sleep(60)
+
+    hasaki_category_products = HasakiCategoryProducts(config)
+    hasaki_category_products.start_processing()
+
+    time.sleep(60)
+
+    hasaki_products = HasakiProductInfo(config)
+    hasaki_products.start_processing()
+
+
+def send_mail(msg):
+    try:
+        EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
+        EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
+        From = 'data_reporting@raenabeauty.com'
+        To = 'shariar@raenabeauty.com'
+        # To = 'shariar@raenabeauty.com'
+
+        html = f'''
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <div style="background-color:#eee;padding:10px 20px;">
+                    <h2 style="font-family:Georgia, 'Times New Roman', Times, serif;color#454349;">Hasaki Crawler Status</h2>
+                </div>
+                <div style="padding:20px 0px">
+                    <div style="height: 800px;width:800px">
+                        {msg}
+                        <div style="text-align:Left;">
+                            <p>This is system generated mail. Please do not reply</p>
+                        </div>
+                    </div>
+                </div>
+            </body>
+        </html>
+        '''
+
+        msg = EmailMessage()
+        msg['Subject'] = 'Hasaki Crawler Status'
+        msg['From'] = From
+        msg['To'] = To
+        msg.set_content(html, subtype='html')
+
+        with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp:
+            smtp.ehlo()
+            smtp.starttls()
+            smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
+            smtp.send_message(msg)
+    except Exception as e:
+        logging.info("Error while sending mail: {}".format(e))


 if __name__ == "__main__":
     logging.info("Starting Hasaki Crawler.......")
@@ -39,9 +82,10 @@ if __name__ == "__main__":
         print(config)

         main()
+        send_mail("Hasaki crawler run complete.")

     except Exception as e:
         logging.info("Error: ".format(e))
-        #logging.info("Cannot load config file. Please check. Exiting......")
-        #send_mail()
+        logging.info("Cannot load config file. Please check. Exiting......")
+        send_mail("Error occurred. Please check Hasaki Pipeline.")
         exit(1)
@@ -11,6 +11,7 @@ from hasaki_db_writer import hasaki_db_writer
 import pandas as pd
 from bs4 import BeautifulSoup
 from Util import translate_text_to_english
+from fake_useragent import UserAgent

 class HasakiProductInfo:
     def __init__(self, config):
@@ -57,13 +58,15 @@ class HasakiProductInfo:

             try:
                 self.get_product_info(row)
+                #time.sleep(random.randint(23,57))
             except:
                 pass

             sql = f"""
                     update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1
-                    where categoryid={row[9]}, product_section='{row[1]}', product_rank={row[8]}, product_url='{row[3]}'
+                    where categoryid={row[9]} and product_section='{row[1]}' and product_rank={row[8]} and product_url='{row[3]}'
                     """
             logging.info(sql)
             self.cur.execute(sql)

             cnt += 1
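Note: the fix above replaces the invalid comma-separated WHERE clause with `and`, but the f-string still interpolates row values directly, so quoting stays the caller's problem. psycopg2's placeholder form would sidestep that; a minimal sketch of the same update, as a hypothetical refactor rather than what the commit does:

    sql = f"""
        update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1
        where categoryid = %s and product_section = %s and product_rank = %s and product_url = %s
        """
    # psycopg2 binds, quotes, and escapes the values itself.
    self.cur.execute(sql, (row[9], row[1], row[8], row[3]))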
@@ -82,12 +85,17 @@ class HasakiProductInfo:
         self.seo_info(raw_data)


     def get_raw_product_data(self, url):
+        retries = 2
+        for _ in range(retries):
             try:
                 with sync_playwright() as p:
-                    browser = p.chromium.launch(headless=True)
-                    context = browser.new_context(
-                        user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
+                    browser = p.chromium.launch(headless=False)
+                    ua = UserAgent(platforms='mobile')
+                    random_mobile_ua = ua.random
+                    logging.info("using user agent: {}".format(random_mobile_ua))
+
+                    context = browser.new_context(user_agent=random_mobile_ua)
                     page = context.new_page()

                     page.goto(url)
@@ -98,6 +106,11 @@ class HasakiProductInfo:
                     browser.close()

                     return api_requests
+            except Exception as e:
+                logging.error(f"An error occurred: {str(e)}")
+                logging.info("Retrying...")
+
+        return None

     def product_info(self, data, raw_data):
|
|||
data_product['product_price_min_before_discount'] = 0
|
||||
data_product['product_price_max_before_discount'] = 0
|
||||
try:
|
||||
data_product['product_price_min_before_discount'] = raw_data['price']
|
||||
data_product['product_price_max_before_discount'] = raw_data['price']
|
||||
market_price = raw_data['market_price']
|
||||
market_price = re.sub(r'\D', '', market_price)
|
||||
data_product['product_price_min_before_discount'] = market_price
|
||||
data_product['product_price_max_before_discount'] = market_price
|
||||
except:
|
||||
pass
|
||||
|
||||
|
|
|
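Note: re.sub(r'\D', '', ...) deletes every non-digit character, which normalises a formatted VND price string to a plain number; for example (illustrative input, not from the commit):

    import re

    print(re.sub(r'\D', '', '1.250.000 ₫'))  # -> '1250000'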
@@ -1,25 +0,0 @@
-import asyncio
-from playwright.async_api import async_playwright
-
-async def main():
-    async with async_playwright() as p:
-        browser = await p.chromium.launch()
-        context = await browser.new_context()
-
-        page = await context.new_page()
-
-        # Enable request interception
-        await page.route('https://hasaki.vn/wap/v2/product/detail', lambda route: route.continue_())
-
-        # Navigate to the website URL
-        await page.goto('https://hasaki.vn/san-pham/nuoc-hoa-hong-khong-mui-klairs-danh-cho-da-nhay-cam-180ml-65994.html')
-
-        # Wait for the API request to be made
-        response = await page.wait_for_event('request', predicate=lambda req: 'v2/product/detail' in req.url)
-        json_response = await response.response.json()
-
-        print(json_response)
-
-    await browser.close()
-
-asyncio.run(main())