added Hasaki crawler

Shariar Imtiaz 2024-03-14 13:32:49 +04:00
parent 45e6965679
commit d0344122e2
7 changed files with 121 additions and 138 deletions

View File

@@ -58,6 +58,7 @@ def send_mail():
             smtp.send_message(msg)
     except Exception as e:
         logging.info("Error while sending mail: {}".format(e))

 def main():
     # start = datetime.now()
     # categories = amazon_categories(config)

View File

@@ -1,83 +0,0 @@
-import hashlib
-import logging
-import sys
-import string
-import undetected_chromedriver as webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service
-import psycopg2
-import bs4
-from webdriver_manager.chrome import ChromeDriverManager
-import random
-from bs4 import BeautifulSoup
-import json
-import time
-import gzip
-import re
-import random
-from amazon_db_writer import amazon_db_writer
-import ssl
-
-ssl._create_default_https_context = ssl._create_unverified_context
-
-def reseller_info(store_url):
-    op = webdriver.ChromeOptions()
-    op.add_argument('--no-sandbox')
-    op.add_argument('--disable-notifications')
-    op.add_argument("--lang=en-GB")
-    #op.headless = True
-    driver = webdriver.Chrome(options=op)
-    driver.get(store_url)
-    driver.implicitly_wait(5)
-    try:
-        driver.get(store_url)
-        driver.implicitly_wait(5)
-
-        ##### reseller info
-        avg_rating = driver.find_element(By.CSS_SELECTOR, '#effective-timeperiod-rating-year-description.ratings-reviews').text
-        print(avg_rating)
-    except Exception as e:
-        print(e)
-
-config = {
-    "crawler_name": "raena_crawler_enginer_amazon",
-    "crawler_schema": "raena_spider_management",
-    "category_tab": "rce_category",
-    "tracker_tab": "crawler_tracker",
-    "product_tab": "rce_product",
-    "variant_tab": "rce_product_variant",
-    "brand_tab": "rce_brand",
-    "reseller_tab": "rce_reseller",
-    "reseller_store_tab": "rce_reseller_store",
-    "review_tab": "rce_ratings_reviews",
-    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
-    "review_producttags_tab": "rce_ratings_reviews_producttags",
-    "review_tags": "rce_tags",
-    "source_tab": "rce_source",
-    "product_per_category": "1000",
-    "source_category": "11043145",
-    "db_user": "postgres",
-    "db_pass": "postgres",
-    "database": "postgres",
-    "db_host": "localhost",
-    "db_port": "5444",
-    "crawler_main": "1",
-    "crawler_slave_no": ""
-}
-
-conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
-conn.autocommit = True
-cur = conn.cursor()
-db_writer = amazon_db_writer(config)
-
-reseller_info('https://www.amazon.ae/sp?ie=UTF8&seller=A3TFGX22P341AN&isAmazonFulfilled=0&asin=B09BR31PF9&ref_=olp_merch_name_1')

View File

@@ -78,26 +78,33 @@ class HasakiCategories:
                 print((1,)+(cat))
                 sub_cats1 = self.crawl_categories(cat[1], cat[2])
-                time.sleep(10)
+                time.sleep(3)
                 if sub_cats1:
                     for sub_cat1 in sub_cats1:
                         self.master_category.append((2,) + (sub_cat1))
                         print((2,) + (sub_cat1))
                         sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
-                        time.sleep(10)
+                        time.sleep(3)
                         if sub_cats2:
                             for sub_cat2 in sub_cats2:
                                 self.master_category.append((3,) + (sub_cat2))
                                 print((3,) + (sub_cat2))
                                 sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
-                                time.sleep(10)
+                                time.sleep(3)
                                 if sub_cats3:
                                     for sub_cat3 in sub_cats3:
                                         self.master_category.append((4,) + (sub_cat3))
                                         print((4,) + (sub_cat3))
+                                        sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
+                                        time.sleep(3)
+                                        if sub_cats4:
+                                            for sub_cat4 in sub_cats4:
+                                                self.master_category.append((5,) + (sub_cat4))
+                                                print((5,) + (sub_cat4))

     def crawl_categories(self, parent, url_to_visit):
         with sync_playwright() as p:
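Review note on the hunk above: each new category level is another copy of the loop body, and the level tag in the added block originally disagreed with its own print. A depth-parameterized helper would keep the traversal in one place; a minimal sketch assuming the same crawl_categories, master_category, and 3-second delay from the diff (crawl_tree is a hypothetical name):

    def crawl_tree(self, parent, url, level=1, max_level=5):
        # Same depth-first walk as the unrolled loops, with the level tag
        # carried as a parameter instead of hard-coded per nesting depth.
        for cat in self.crawl_categories(parent, url) or []:
            self.master_category.append((level,) + cat)
            print((level,) + cat)
            time.sleep(3)
            if level < max_level:
                self.crawl_tree(cat[1], cat[2], level + 1, max_level)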

View File

@@ -55,6 +55,23 @@ class HasakiCategoryProducts:
                 self.get_product_list(urls = pages, categoryId = category[0])

+    def find_top_search(self):
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=False)
+            page = browser.new_page()
+            page.goto("https://hasaki.vn/")
+            page.wait_for_load_state('load')
+
+            top_search_element = page.query_selector_all(".item_top_search")
+
+            for element in top_search_element:
+                url = element.query_selector(".top_big_search").query_selector('a').get_attribute('href').strip()
+                print(url)
+
+            browser.close()
+
     def get_pages(self, url):
@@ -64,7 +81,7 @@ class HasakiCategoryProducts:
         try:
             with sync_playwright() as p:
-                browser = p.chromium.launch(headless=True)
+                browser = p.chromium.launch(headless=False)
                 page = browser.new_page()
                 page.goto(url)
@@ -88,7 +105,7 @@ class HasakiCategoryProducts:
         try:
             with sync_playwright() as p:
-                browser = p.chromium.launch(headless=True)
+                browser = p.chromium.launch(headless=False)
                 page = browser.new_page()
@@ -109,6 +126,13 @@ class HasakiCategoryProducts:
                 for item_element in item_elements:
                     try:
                         product_section = "Base Product Page " + str(page_count)
+                        if url in ["https://hasaki.vn/danh-muc/chong-nang-da-mat-c11.html",
+                                   "https://hasaki.vn/danh-muc/trang-diem-moi-c24.html",
+                                   "https://hasaki.vn/danh-muc/sua-rua-mat-c19.html",
+                                   "https://hasaki.vn/danh-muc/kem-duong-dau-duong-c9.html"]:
+                            product_section = "Top Search - Base Product Page " + str(page_count)
+
                         product_name = translate_text_to_english(str(item_element.query_selector('.width_common.name_sp.space_bottom_5').text_content()).strip().replace("'",""))
                         product_url = str(item_element.query_selector('.v3_thumb_common_sp.relative').get_attribute('href')).strip()
                         product_brand = translate_text_to_english(str(item_element.query_selector('.width_common.txt_color_1.space_bottom_3').text_content()).strip().replace("'",""))
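Review note on the hunks above: find_top_search() discovers the top-search category URLs but only prints them, while the product-section logic matches four hard-coded literals. A sketch of having the method return what it finds so the literals can go away (same selectors as the diff; the return value and self.top_search_urls are assumptions):

    def find_top_search(self):
        top_urls = []
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto("https://hasaki.vn/")
            page.wait_for_load_state('load')
            for element in page.query_selector_all(".item_top_search"):
                # Same selectors as the diff; collect instead of print.
                link = element.query_selector(".top_big_search").query_selector('a')
                top_urls.append(link.get_attribute('href').strip())
            browser.close()
        return top_urls

The membership test then becomes `if url in self.top_search_urls:`, and newly promoted top-search categories are picked up without a code change.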

View File

@ -1,10 +1,12 @@
import logging import logging
import json import json
import time import time
import smtplib
from hasaki_categories import HasakiCategories from hasaki_categories import HasakiCategories
from hasaki_category_products import HasakiCategoryProducts from hasaki_category_products import HasakiCategoryProducts
from hasaki_product_info import HasakiProductInfo from hasaki_product_info import HasakiProductInfo
from email.message import EmailMessage
##### Looger ###### ##### Looger ######
format = "%(asctime)s: %(message)s" format = "%(asctime)s: %(message)s"
@@ -14,20 +16,61 @@ config = {}
 def main():
-    # hasaki_categories = HasakiCategories(config)
-    # hasaki_categories.start_processing()
-    #
-    # time.sleep(60)
-    #
-    # hasaki_category_products = HasakiCategoryProducts(config)
-    # hasaki_category_products.start_processing()
-    #
-    # time.sleep(60)
+    hasaki_categories = HasakiCategories(config)
+    hasaki_categories.start_processing()
+
+    time.sleep(60)
+
+    hasaki_category_products = HasakiCategoryProducts(config)
+    hasaki_category_products.start_processing()
+
+    time.sleep(60)

     hasaki_products = HasakiProductInfo(config)
     hasaki_products.start_processing()

+def send_mail(msg):
+    try:
+        EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
+        EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
+        From = 'data_reporting@raenabeauty.com'
+        To = 'shariar@raenabeauty.com'
+
+        html = f'''
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <div style="background-color:#eee;padding:10px 20px;">
+                    <h2 style="font-family:Georgia, 'Times New Roman', Times, serif;color:#454349;">Hasaki Crawler Status</h2>
+                </div>
+                <div style="padding:20px 0px">
+                    <div style="height: 800px;width:800px">
+                        {msg}
+                        <div style="text-align:Left;">
+                            <p>This is a system generated mail. Please do not reply</p>
+                        </div>
+                    </div>
+                </div>
+            </body>
+        </html>
+        '''
+
+        msg = EmailMessage()
+        msg['Subject'] = 'Hasaki Crawler Status'
+        msg['From'] = From
+        msg['To'] = To
+        msg.set_content(html, subtype='html')
+
+        with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp:
+            smtp.ehlo()
+            smtp.starttls()
+            smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
+            smtp.send_message(msg)
+    except Exception as e:
+        logging.info("Error while sending mail: {}".format(e))

 if __name__ == "__main__":
     logging.info("Starting Hasaki Crawler.......")
@ -39,9 +82,10 @@ if __name__ == "__main__":
print(config) print(config)
main() main()
send_mail("Hasaki crawler run complete.")
except Exception as e: except Exception as e:
logging.info("Error: ".format(e)) logging.info("Error: ".format(e))
#logging.info("Cannot load config file. Please check. Exiting......") logging.info("Cannot load config file. Please check. Exiting......")
#send_mail() send_mail("Error occurred. Please check Hasaki Pipeline.")
exit(1) exit(1)
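Review note on the send_mail() addition: the SES SMTP credentials are committed in plain text, and a key that has appeared in a repository generally needs to be rotated. A common alternative is to read them from the environment; a minimal sketch (the SES_SMTP_USER / SES_SMTP_PASSWORD variable names are assumptions):

    import os

    # Credentials come from the environment instead of the source tree.
    EMAIL_ADDRESS = os.environ["SES_SMTP_USER"]
    EMAIL_PASSWORD = os.environ["SES_SMTP_PASSWORD"]

with the rest of send_mail() unchanged.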

View File

@@ -11,6 +11,7 @@ from hasaki_db_writer import hasaki_db_writer
 import pandas as pd
 from bs4 import BeautifulSoup
 from Util import translate_text_to_english
+from fake_useragent import UserAgent

 class HasakiProductInfo:
     def __init__(self, config):
@@ -57,13 +58,15 @@ class HasakiProductInfo:
                 try:
                     self.get_product_info(row)
+                    #time.sleep(random.randint(23,57))
                 except:
                     pass

                 sql = f"""
                        update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1
-                       where categoryid={row[9]}, product_section='{row[1]}', product_rank={row[8]}, product_url='{row[3]}'
+                       where categoryid={row[9]} and product_section='{row[1]}' and product_rank={row[8]} and product_url='{row[3]}'
                        """
+                logging.info(sql)
                 self.cur.execute(sql)

                 cnt += 1
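Review note on the SQL fix above: swapping the commas for `and` makes the statement valid, but the f-string still interpolates product_url and product_section directly, so a single quote in either value breaks the query. psycopg2 can bind the values instead (identifiers such as the schema and table still have to be interpolated); a sketch:

    sql = f"""
        update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1
        where categoryid = %s and product_section = %s and product_rank = %s and product_url = %s
    """
    # Values are bound by the driver, so quoting in URLs is handled safely.
    self.cur.execute(sql, (row[9], row[1], row[8], row[3]))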
@@ -82,22 +85,32 @@ class HasakiProductInfo:
         self.seo_info(raw_data)

     def get_raw_product_data(self, url):
-        with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
-            context = browser.new_context(
-                user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
-            page = context.new_page()
-
-            page.goto(url)
-
-            with page.expect_response("**/wap/v2/product/detail**") as response:
-                api_requests = response.value.json()
-            browser.close()
-
-            return api_requests
+        retries = 2
+        for _ in range(retries):
+            try:
+                with sync_playwright() as p:
+                    browser = p.chromium.launch(headless=False)
+                    ua = UserAgent(platforms='mobile')
+                    random_mobile_ua = ua.random
+                    logging.info("using user agent: {}".format(random_mobile_ua))
+
+                    context = browser.new_context(user_agent=random_mobile_ua)
+                    page = context.new_page()
+
+                    page.goto(url)
+
+                    with page.expect_response("**/wap/v2/product/detail**") as response:
+                        api_requests = response.value.json()
+                    browser.close()
+
+                    return api_requests
+            except Exception as e:
+                logging.error(f"An error occurred: {str(e)}")
+                logging.info("Retrying...")
+
+        return None

     def product_info(self, data, raw_data):
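Review note on the retry rewrite above: the loop retries immediately and logs "Retrying..." even after the final attempt. A small wrapper with a pause between attempts is one option; a sketch under the assumption that the Playwright block is factored into a callable (fetch_with_backoff and fetch are hypothetical names):

    import logging
    import time

    def fetch_with_backoff(fetch, url, retries=2):
        # fetch(url) wraps the Playwright block from the diff; this only
        # adds spacing between attempts and quieter final-failure handling.
        for attempt in range(retries):
            try:
                return fetch(url)
            except Exception as e:
                logging.error(f"Attempt {attempt + 1} failed: {e}")
                if attempt + 1 < retries:
                    time.sleep(2 ** attempt * 5)  # pause before the next try
        return None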
@@ -212,8 +225,10 @@ class HasakiProductInfo:
             data_product['product_price_min_before_discount'] = 0
             data_product['product_price_max_before_discount'] = 0
             try:
-                data_product['product_price_min_before_discount'] = raw_data['price']
-                data_product['product_price_max_before_discount'] = raw_data['price']
+                market_price = raw_data['market_price']
+                market_price = re.sub(r'\D', '', market_price)
+                data_product['product_price_min_before_discount'] = market_price
+                data_product['product_price_max_before_discount'] = market_price
             except:
                 pass
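Review note on the price fix above: re.sub(r'\D', '', ...) strips the currency symbol and thousands separators from market_price, but the result is still a string. If the target columns are numeric, an explicit cast is safer; a quick illustration with a made-up price:

    import re

    market_price = re.sub(r'\D', '', '329.000 ₫')   # -> '329000'
    price = int(market_price) if market_price else 0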

View File

@@ -1,25 +0,0 @@
-import asyncio
-from playwright.async_api import async_playwright
-
-async def main():
-    async with async_playwright() as p:
-        browser = await p.chromium.launch()
-        context = await browser.new_context()
-        page = await context.new_page()
-
-        # Enable request interception
-        await page.route('https://hasaki.vn/wap/v2/product/detail', lambda route: route.continue_())
-
-        # Navigate to the website URL
-        await page.goto('https://hasaki.vn/san-pham/nuoc-hoa-hong-khong-mui-klairs-danh-cho-da-nhay-cam-180ml-65994.html')
-
-        # Wait for the API request to be made
-        response = await page.wait_for_event('request', predicate=lambda req: 'v2/product/detail' in req.url)
-        json_response = await response.response.json()
-
-        print(json_response)
-
-        await browser.close()
-
-asyncio.run(main())