diff --git a/amazon_crawler_engine/amazon_crawler.py b/amazon_crawler_engine/amazon_crawler.py
index 2554a72..70dde7b 100644
--- a/amazon_crawler_engine/amazon_crawler.py
+++ b/amazon_crawler_engine/amazon_crawler.py
@@ -58,6 +58,7 @@ def send_mail():
             smtp.send_message(msg)
     except Exception as e:
         logging.info("Error while sending mail: {}".format(e))
+
 def main():
     # start = datetime.now()
     # categories = amazon_categories(config)
diff --git a/amazon_crawler_engine/test1.py b/amazon_crawler_engine/test1.py
index b1eb402..e69de29 100644
--- a/amazon_crawler_engine/test1.py
+++ b/amazon_crawler_engine/test1.py
@@ -1,83 +0,0 @@
-import hashlib
-import logging
-import sys
-import string
-import undetected_chromedriver as webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service
-import psycopg2
-import bs4
-from webdriver_manager.chrome import ChromeDriverManager
-import random
-from bs4 import BeautifulSoup
-import json
-import time
-import gzip
-import re
-import random
-from amazon_db_writer import amazon_db_writer
-
-import ssl
-ssl._create_default_https_context = ssl._create_unverified_context
-
-
-def reseller_info(store_url):
-
-    op = webdriver.ChromeOptions()
-    op.add_argument('--no-sandbox')
-    op.add_argument('--disable-notifications')
-    op.add_argument("--lang=en-GB")
-    #op.headless = True
-    driver=webdriver.Chrome( options=op)
-
-    driver.get(store_url)
-
-    driver.implicitly_wait(5)
-
-    try:
-        driver.get(store_url)
-        driver.implicitly_wait(5)
-
-        ##### reseller info
-
-        avg_rating = driver.find_element(By.CSS_SELECTOR,'#effective-timeperiod-rating-year-description.ratings-reviews').text
-
-        print(avg_rating)
-
-
-
-    except Exception as e:
-        print(e)
-
-config = {
-    "crawler_name": "raena_crawler_enginer_amazon",
-    "crawler_schema": "raena_spider_management",
-    "category_tab": "rce_category",
-    "tracker_tab": "crawler_tracker",
-    "product_tab": "rce_product",
-    "variant_tab": "rce_product_variant",
-    "brand_tab": "rce_brand",
-    "reseller_tab": "rce_reseller",
-    "reseller_store_tab": "rce_reseller_store",
-    "review_tab": "rce_ratings_reviews",
-    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
-    "review_producttags_tab": "rce_ratings_reviews_producttags",
-    "review_tags": "rce_tags",
-    "source_tab": "rce_source",
-    "product_per_category": "1000",
-    "source_category": "11043145",
-    "db_user": "postgres",
-    "db_pass": "postgres",
-    "database": "postgres",
-    "db_host": "localhost",
-    "db_port": "5444",
-    "crawler_main": "1",
-    "crawler_slave_no": ""
-}
-conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
-conn.autocommit = True
-cur = conn.cursor()
-db_writer = amazon_db_writer(config)
-
-
-reseller_info('https://www.amazon.ae/sp?ie=UTF8&seller=A3TFGX22P341AN&isAmazonFulfilled=0&asin=B09BR31PF9&ref_=olp_merch_name_1')
\ No newline at end of file
diff --git a/hasaki_crawler_engine/hasaki_categories.py b/hasaki_crawler_engine/hasaki_categories.py
index d312bc9..c752ff3 100644
--- a/hasaki_crawler_engine/hasaki_categories.py
+++ b/hasaki_crawler_engine/hasaki_categories.py
@@ -78,26 +78,33 @@ class HasakiCategories:
                 print((1,)+(cat))
 
                 sub_cats1 = self.crawl_categories(cat[1], cat[2])
-                time.sleep(10)
+                time.sleep(3)
                 if sub_cats1:
                     for sub_cat1 in sub_cats1:
                         self.master_category.append((2,) + (sub_cat1))
                         print((2,) + (sub_cat1))
 
                         sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
-                        time.sleep(10)
+                        time.sleep(3)
                         if sub_cats2:
                             for sub_cat2 in sub_cats2:
                                 self.master_category.append((3,) + (sub_cat2))
                                 print((3,) + (sub_cat2))
 
                                 sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
-                                time.sleep(10)
+                                time.sleep(3)
                                 if sub_cats3:
                                     for sub_cat3 in sub_cats3:
                                         self.master_category.append((4,) + (sub_cat3))
                                         print((4,) + (sub_cat3))
 
+                                        sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
+                                        time.sleep(3)
+                                        if sub_cats4:
+                                            for sub_cat4 in sub_cats4:
+                                                self.master_category.append((4,) + (sub_cat4))
+                                                print((5,) + (sub_cat4))
+
     def crawl_categories(self, parent, url_to_visit):
 
         with sync_playwright() as p:
diff --git a/hasaki_crawler_engine/hasaki_category_products.py b/hasaki_crawler_engine/hasaki_category_products.py
index 26e6e94..b17af67 100644
--- a/hasaki_crawler_engine/hasaki_category_products.py
+++ b/hasaki_crawler_engine/hasaki_category_products.py
@@ -55,6 +55,23 @@ class HasakiCategoryProducts:
 
             self.get_product_list(urls = pages, categoryId = category[0])
 
+    def find_top_search(self):
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=False)
+
+            page = browser.new_page()
+            page.goto("https://hasaki.vn/")
+
+            page.wait_for_load_state('load')
+
+            top_search_element = page.query_selector_all(".item_top_search")
+
+            for element in top_search_element:
+                url = element.query_selector(".top_big_search").query_selector('a').get_attribute('href').strip()
+
+                print(url)
+            browser.close()
+
     def get_pages(self, url):
 
@@ -64,7 +81,7 @@ class HasakiCategoryProducts:
 
         try:
             with sync_playwright() as p:
-                browser = p.chromium.launch(headless=True)
+                browser = p.chromium.launch(headless=False)
 
                 page = browser.new_page()
                 page.goto(url)
@@ -88,7 +105,7 @@ class HasakiCategoryProducts:
 
         try:
             with sync_playwright() as p:
-                browser = p.chromium.launch(headless=True)
+                browser = p.chromium.launch(headless=False)
 
                 page = browser.new_page()
 
@@ -109,6 +126,13 @@ class HasakiCategoryProducts:
                 for item_element in item_elements:
                     try:
                         product_section = "Base Product Page " + str(page_count)
+                        if url in ["https://hasaki.vn/danh-muc/chong-nang-da-mat-c11.html",
+                                   "https://hasaki.vn/danh-muc/trang-diem-moi-c24.html",
+                                   "https://hasaki.vn/danh-muc/sua-rua-mat-c19.html",
+                                   "https://hasaki.vn/danh-muc/kem-duong-dau-duong-c9.html"]:
+
+                            product_section = "Top Search - Base Product Page " + str(page_count)
+
                         product_name = translate_text_to_english(str(item_element.query_selector('.width_common.name_sp.space_bottom_5').text_content()).strip().replace("'",""))
                         product_url = str(item_element.query_selector('.v3_thumb_common_sp.relative').get_attribute('href')).strip()
                         product_brand = translate_text_to_english(str(item_element.query_selector('.width_common.txt_color_1.space_bottom_3').text_content()).strip().replace("'",""))
diff --git a/hasaki_crawler_engine/hasaki_crawler.py b/hasaki_crawler_engine/hasaki_crawler.py
index 4f7bf56..4c00760 100644
--- a/hasaki_crawler_engine/hasaki_crawler.py
+++ b/hasaki_crawler_engine/hasaki_crawler.py
@@ -1,10 +1,12 @@
 import logging
 import json
 import time
+import smtplib
 
 from hasaki_categories import HasakiCategories
 from hasaki_category_products import HasakiCategoryProducts
 from hasaki_product_info import HasakiProductInfo
+from email.message import EmailMessage
 
 ##### Looger ######
 format = "%(asctime)s: %(message)s"
@@ -14,20 +16,61 @@ config = {}
 
 def main():
 
-    # hasaki_categories = HasakiCategories(config)
-    # hasaki_categories.start_processing()
-    #
-    # time.sleep(60)
-    #
-    # hasaki_category_products = HasakiCategoryProducts(config)
-    # hasaki_category_products.start_processing()
-    #
-    # time.sleep(60)
+    hasaki_categories = HasakiCategories(config)
+    hasaki_categories.start_processing()
+
+    time.sleep(60)
+
+    hasaki_category_products = HasakiCategoryProducts(config)
+    hasaki_category_products.start_processing()
+
+    time.sleep(60)
 
     hasaki_products = HasakiProductInfo(config)
     hasaki_products.start_processing()
 
+def send_mail(msg):
+    try:
+        EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
+        EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
+        From = 'data_reporting@raenabeauty.com'
+        To = 'shariar@raenabeauty.com'
+        # To = 'shariar@raenabeauty.com'
+
+        html = f'''
+        <!DOCTYPE html>
+        <html>
+            <body>
+                <div>
+                    <h2>Hasaki Crawler Status</h2>
+                </div>
+                <div>
+                    <div>
+                        {msg}
+                        <div>
+                            <p>This is system generated mail. Please do not reply</p>
+                        </div>
+                    </div>
+                </div>
+            </body>
+        </html>
+        '''
+
+        msg = EmailMessage()
+        msg['Subject'] = 'Hasaki Crawler Status'
+        msg['From'] = From
+        msg['To'] = To
+        msg.set_content(html, subtype='html')
+
+        with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp:
+            smtp.ehlo()
+            smtp.starttls()
+            smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
+            smtp.send_message(msg)
+    except Exception as e:
+        logging.info("Error while sending mail: {}".format(e))
+
 
 
 if __name__ == "__main__":
 
     logging.info("Starting Hasaki Crawler.......")
@@ -39,9 +82,10 @@ if __name__ == "__main__":
 
         print(config)
         main()
+        send_mail("Hasaki crawler run complete.")
     except Exception as e:
         logging.info("Error: ".format(e))
-        #logging.info("Cannot load config file. Please check. Exiting......")
-        #send_mail()
+        logging.info("Cannot load config file. Please check. Exiting......")
+        send_mail("Error occurred. Please check Hasaki Pipeline.")
         exit(1)
 
diff --git a/hasaki_crawler_engine/hasaki_product_info.py b/hasaki_crawler_engine/hasaki_product_info.py
index 7c5584d..49aaf0e 100644
--- a/hasaki_crawler_engine/hasaki_product_info.py
+++ b/hasaki_crawler_engine/hasaki_product_info.py
@@ -11,6 +11,7 @@ from hasaki_db_writer import hasaki_db_writer
 import pandas as pd
 from bs4 import BeautifulSoup
 from Util import translate_text_to_english
+from fake_useragent import UserAgent
 
 class HasakiProductInfo:
     def __init__(self, config):
@@ -57,13 +58,15 @@ class HasakiProductInfo:
 
                 try:
                     self.get_product_info(row)
+                    #time.sleep(random.randint(23,57))
                 except:
                     pass
 
                 sql = f"""
                         update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1
-                        where categoryid={row[9]}, product_section='{row[1]}', product_rank={row[8]}, product_url='{row[3]}'
+                        where categoryid={row[9]} and product_section='{row[1]}' and product_rank={row[8]} and product_url='{row[3]}'
                         """
+                logging.info(sql)
                 self.cur.execute(sql)
 
                 cnt += 1
@@ -82,22 +85,32 @@ class HasakiProductInfo:
 
         self.seo_info(raw_data)
 
-
     def get_raw_product_data(self, url):
-        with sync_playwright() as p:
-            browser = p.chromium.launch(headless=True)
-            context = browser.new_context(
-                user_agent="Mozilla/5.0 (iPhone X; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1")
-            page = context.new_page()
+        retries = 2
+        for _ in range(retries):
+            try:
+                with sync_playwright() as p:
+                    browser = p.chromium.launch(headless=False)
+                    ua = UserAgent(platforms='mobile')
+                    random_mobile_ua = ua.random
+                    logging.info("using user agent: {}".format(random_mobile_ua))
 
-            page.goto(url)
+                    context = browser.new_context(user_agent=random_mobile_ua)
+                    page = context.new_page()
 
-            with page.expect_response("**/wap/v2/product/detail**") as response:
-                api_requests = response.value.json()
+                    page.goto(url)
 
-            browser.close()
+                    with page.expect_response("**/wap/v2/product/detail**") as response:
+                        api_requests = response.value.json()
 
-            return api_requests
+                    browser.close()
+
+                    return api_requests
+            except Exception as e:
+                logging.error(f"An error occurred: {str(e)}")
+                logging.info("Retrying...")
+
+        return None
 
     def product_info(self, data, raw_data):
 
@@ -212,8 +225,10 @@ class HasakiProductInfo:
         data_product['product_price_min_before_discount'] = 0
         data_product['product_price_max_before_discount'] = 0
         try:
-            data_product['product_price_min_before_discount'] = raw_data['price']
-            data_product['product_price_max_before_discount'] = raw_data['price']
+            market_price = raw_data['market_price']
+            market_price = re.sub(r'\D', '', market_price)
+            data_product['product_price_min_before_discount'] = market_price
+            data_product['product_price_max_before_discount'] = market_price
         except:
             pass
 
diff --git a/hasaki_crawler_engine/test2.py b/hasaki_crawler_engine/test2.py
deleted file mode 100644
index bdef230..0000000
--- a/hasaki_crawler_engine/test2.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import asyncio
-from playwright.async_api import async_playwright
-
-async def main():
-    async with async_playwright() as p:
-        browser = await p.chromium.launch()
-        context = await browser.new_context()
-
-        page = await context.new_page()
-
-        # Enable request interception
-        await page.route('https://hasaki.vn/wap/v2/product/detail', lambda route: route.continue_())
-
-        # Navigate to the website URL
-        await page.goto('https://hasaki.vn/san-pham/nuoc-hoa-hong-khong-mui-klairs-danh-cho-da-nhay-cam-180ml-65994.html')
-
-        # Wait for the API request to be made
-        response = await page.wait_for_event('request', predicate=lambda req: 'v2/product/detail' in req.url)
-        json_response = await response.response.json()
-
-        print(json_response)
-
-        await browser.close()
-
-asyncio.run(main())