diff --git a/amazon_crawler_engine/amazon_crawler.py b/amazon_crawler_engine/amazon_crawler.py
index 2554a72..70dde7b 100644
--- a/amazon_crawler_engine/amazon_crawler.py
+++ b/amazon_crawler_engine/amazon_crawler.py
@@ -58,6 +58,7 @@ def send_mail():
             smtp.send_message(msg)
     except Exception as e:
         logging.info("Error while sending mail: {}".format(e))
+
 def main():
     # start = datetime.now()
     # categories = amazon_categories(config)
diff --git a/amazon_crawler_engine/test1.py b/amazon_crawler_engine/test1.py
index b1eb402..e69de29 100644
--- a/amazon_crawler_engine/test1.py
+++ b/amazon_crawler_engine/test1.py
@@ -1,83 +0,0 @@
-import hashlib
-import logging
-import sys
-import string
-import undetected_chromedriver as webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.chrome.service import Service
-import psycopg2
-import bs4
-from webdriver_manager.chrome import ChromeDriverManager
-import random
-from bs4 import BeautifulSoup
-import json
-import time
-import gzip
-import re
-import random
-from amazon_db_writer import amazon_db_writer
-
-import ssl
-ssl._create_default_https_context = ssl._create_unverified_context
-
-
-def reseller_info(store_url):
-
-    op = webdriver.ChromeOptions()
-    op.add_argument('--no-sandbox')
-    op.add_argument('--disable-notifications')
-    op.add_argument("--lang=en-GB")
-    #op.headless = True
-    driver=webdriver.Chrome( options=op)
-
-    driver.get(store_url)
-
-    driver.implicitly_wait(5)
-
-    try:
-        driver.get(store_url)
-        driver.implicitly_wait(5)
-
-        ##### reseller info
-
-        avg_rating = driver.find_element(By.CSS_SELECTOR,'#effective-timeperiod-rating-year-description.ratings-reviews').text
-
-        print(avg_rating)
-
-
-
-    except Exception as e:
-        print(e)
-
-config = {
-    "crawler_name": "raena_crawler_enginer_amazon",
-    "crawler_schema": "raena_spider_management",
-    "category_tab": "rce_category",
-    "tracker_tab": "crawler_tracker",
-    "product_tab": "rce_product",
-    "variant_tab": "rce_product_variant",
-    "brand_tab": "rce_brand",
-    "reseller_tab": "rce_reseller",
-    "reseller_store_tab": "rce_reseller_store",
-    "review_tab": "rce_ratings_reviews",
-    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
-    "review_producttags_tab": "rce_ratings_reviews_producttags",
-    "review_tags": "rce_tags",
-    "source_tab": "rce_source",
-    "product_per_category": "1000",
-    "source_category": "11043145",
-    "db_user": "postgres",
-    "db_pass": "postgres",
-    "database": "postgres",
-    "db_host": "localhost",
-    "db_port": "5444",
-    "crawler_main": "1",
-    "crawler_slave_no": ""
-}
-conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
-conn.autocommit = True
-cur = conn.cursor()
-db_writer = amazon_db_writer(config)
-
-
-reseller_info('https://www.amazon.ae/sp?ie=UTF8&seller=A3TFGX22P341AN&isAmazonFulfilled=0&asin=B09BR31PF9&ref_=olp_merch_name_1')
\ No newline at end of file
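
Review note on the test1.py removal: the deleted scratch script opened a Chrome session but never called driver.quit(), and it disabled SSL verification process-wide. If this reseller lookup is ever needed again, a minimal sketch along these lines (reusing the removed file's selector and undetected_chromedriver setup, neither re-verified against Amazon's current markup) would at least release the browser:

    # Sketch only; the CSS selector is copied from the deleted file and may
    # no longer match Amazon's markup.
    import undetected_chromedriver as webdriver
    from selenium.webdriver.common.by import By

    def reseller_info(store_url):
        op = webdriver.ChromeOptions()
        op.add_argument('--no-sandbox')
        op.add_argument('--disable-notifications')
        op.add_argument('--lang=en-GB')
        driver = webdriver.Chrome(options=op)
        try:
            driver.get(store_url)
            driver.implicitly_wait(5)
            return driver.find_element(
                By.CSS_SELECTOR,
                '#effective-timeperiod-rating-year-description.ratings-reviews').text
        except Exception as e:
            print(e)
        finally:
            driver.quit()  # the deleted script leaked the browser process
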
diff --git a/hasaki_crawler_engine/hasaki_categories.py b/hasaki_crawler_engine/hasaki_categories.py
index d312bc9..c752ff3 100644
--- a/hasaki_crawler_engine/hasaki_categories.py
+++ b/hasaki_crawler_engine/hasaki_categories.py
@@ -78,26 +78,33 @@ class HasakiCategories:
                 print((1,)+(cat))
 
                 sub_cats1 = self.crawl_categories(cat[1], cat[2])
-                time.sleep(10)
+                time.sleep(3)
                 if sub_cats1:
                     for sub_cat1 in sub_cats1:
                         self.master_category.append((2,) + (sub_cat1))
                         print((2,) + (sub_cat1))
 
                         sub_cats2 = self.crawl_categories(sub_cat1[1], sub_cat1[2])
-                        time.sleep(10)
+                        time.sleep(3)
                         if sub_cats2:
                             for sub_cat2 in sub_cats2:
                                 self.master_category.append((3,) + (sub_cat2))
                                 print((3,) + (sub_cat2))
 
                                 sub_cats3 = self.crawl_categories(sub_cat2[1], sub_cat2[2])
-                                time.sleep(10)
+                                time.sleep(3)
                                 if sub_cats3:
                                     for sub_cat3 in sub_cats3:
                                         self.master_category.append((4,) + (sub_cat3))
                                         print((4,) + (sub_cat3))
+                                        sub_cats4 = self.crawl_categories(sub_cat3[1], sub_cat3[2])
+                                        time.sleep(3)
+                                        if sub_cats4:
+                                            for sub_cat4 in sub_cats4:
+                                                self.master_category.append((5,) + (sub_cat4))
+                                                print((5,) + (sub_cat4))
+
 
     def crawl_categories(self, parent, url_to_visit):
         with sync_playwright() as p:
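
The new level-5 block above is the fourth copy of the same crawl/sleep/loop pattern in start_processing(). If the tree ever goes deeper, a depth-bounded recursive walk is one option; the sketch below assumes crawl_categories() keeps its current signature and keeps returning tuples whose [1] and [2] items feed the next call, and reuses the module's existing time import:

    def walk_categories(self, cat, level=1, max_level=5):
        # Sketch: one method instead of five hand-unrolled nesting levels.
        self.master_category.append((level,) + cat)
        print((level,) + cat)
        if level >= max_level:
            return
        sub_cats = self.crawl_categories(cat[1], cat[2])
        time.sleep(3)  # same politeness delay as the unrolled version
        for sub_cat in (sub_cats or []):
            self.walk_categories(sub_cat, level + 1, max_level)

Calling self.walk_categories(cat) once per top-level category would then cover all five levels.
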
diff --git a/hasaki_crawler_engine/hasaki_category_products.py b/hasaki_crawler_engine/hasaki_category_products.py
index 26e6e94..b17af67 100644
--- a/hasaki_crawler_engine/hasaki_category_products.py
+++ b/hasaki_crawler_engine/hasaki_category_products.py
@@ -55,6 +55,23 @@ class HasakiCategoryProducts:
 
                 self.get_product_list(urls = pages, categoryId = category[0])
 
+    def find_top_search(self):
+        with sync_playwright() as p:
+            browser = p.chromium.launch(headless=False)
+
+            page = browser.new_page()
+            page.goto("https://hasaki.vn/")
+
+            page.wait_for_load_state('load')
+
+            top_search_element = page.query_selector_all(".item_top_search")
+
+            for element in top_search_element:
+                url = element.query_selector(".top_big_search").query_selector('a').get_attribute('href').strip()
+
+                print(url)
+            browser.close()
+
     def get_pages(self, url):
 
 
@@ -64,7 +81,7 @@ class HasakiCategoryProducts:
 
         try:
             with sync_playwright() as p:
-                browser = p.chromium.launch(headless=True)
+                browser = p.chromium.launch(headless=False)
 
                 page = browser.new_page()
                 page.goto(url)
@@ -88,7 +105,7 @@ class HasakiCategoryProducts:
 
         try:
             with sync_playwright() as p:
-                browser = p.chromium.launch(headless=True)
+                browser = p.chromium.launch(headless=False)
 
                 page = browser.new_page()
 
@@ -109,6 +126,13 @@ class HasakiCategoryProducts:
                 for item_element in item_elements:
                     try:
                         product_section = "Base Product Page " + str(page_count)
+                        if url in ["https://hasaki.vn/danh-muc/chong-nang-da-mat-c11.html",
+                                   "https://hasaki.vn/danh-muc/trang-diem-moi-c24.html",
+                                   "https://hasaki.vn/danh-muc/sua-rua-mat-c19.html",
+                                   "https://hasaki.vn/danh-muc/kem-duong-dau-duong-c9.html"]:
+
+                            product_section = "Top Search - Base Product Page " + str(page_count)
+
                         product_name = translate_text_to_english(str(item_element.query_selector('.width_common.name_sp.space_bottom_5').text_content()).strip().replace("'",""))
                         product_url = str(item_element.query_selector('.v3_thumb_common_sp.relative').get_attribute('href')).strip()
                         product_brand = translate_text_to_english(str(item_element.query_selector('.width_common.txt_color_1.space_bottom_3').text_content()).strip().replace("'",""))
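
find_top_search() above only prints the scraped URLs, while the product_section branch in get_product_list() keys off four hardcoded category URLs. If the intent is to tag top-search categories dynamically, a variant that returns the URLs (same selectors as above, not re-verified against hasaki.vn) could feed that check:

    # Sketch: collect the top-search URLs instead of printing them.
    def find_top_search(self):
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto("https://hasaki.vn/")
            page.wait_for_load_state('load')
            urls = set()
            for element in page.query_selector_all(".item_top_search"):
                link = element.query_selector(".top_big_search a")
                if link:
                    urls.add(str(link.get_attribute('href')).strip())
            browser.close()
            return urls

get_product_list() could then test `if url in top_search_urls:` against a set fetched once per run, instead of maintaining the literal list.
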
diff --git a/hasaki_crawler_engine/hasaki_crawler.py b/hasaki_crawler_engine/hasaki_crawler.py
index 4f7bf56..4c00760 100644
--- a/hasaki_crawler_engine/hasaki_crawler.py
+++ b/hasaki_crawler_engine/hasaki_crawler.py
@@ -1,10 +1,12 @@
 import logging
 import json
 import time
+import smtplib
 
 from hasaki_categories import HasakiCategories
 from hasaki_category_products import HasakiCategoryProducts
 from hasaki_product_info import HasakiProductInfo
+from email.message import EmailMessage
 
 ##### Looger ######
 format = "%(asctime)s: %(message)s"
@@ -14,20 +16,61 @@ config = {}
 
 def main():
-    # hasaki_categories = HasakiCategories(config)
-    # hasaki_categories.start_processing()
-    #
-    # time.sleep(60)
-    #
-    # hasaki_category_products = HasakiCategoryProducts(config)
-    # hasaki_category_products.start_processing()
-    #
-    # time.sleep(60)
+    hasaki_categories = HasakiCategories(config)
+    hasaki_categories.start_processing()
+
+    time.sleep(60)
+
+    hasaki_category_products = HasakiCategoryProducts(config)
+    hasaki_category_products.start_processing()
+
+    time.sleep(60)
 
     hasaki_products = HasakiProductInfo(config)
     hasaki_products.start_processing()
 
+def send_mail(msg):
+    try:
+        EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
+        EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
+        From = 'data_reporting@raenabeauty.com'
+        To = 'shariar@raenabeauty.com'
+
+        html = f'''
+