From d2391297391f080ff21716c57aa788f284542184 Mon Sep 17 00:00:00 2001 From: "shariar@raenabeauty.com" Date: Mon, 1 Apr 2024 11:31:33 +0400 Subject: [PATCH] added Hasaki crawler --- hasaki_crawler_engine/hasaki_categories.py | 15 +++++++++++++++ .../hasaki_category_products.py | 14 ++++++++++++++ hasaki_crawler_engine/hasaki_crawler.py | 16 +++++++++++++--- hasaki_crawler_engine/hasaki_db_writer.py | 8 ++++++-- hasaki_crawler_engine/hasaki_product_info.py | 13 +++++++++++++ 5 files changed, 61 insertions(+), 5 deletions(-) diff --git a/hasaki_crawler_engine/hasaki_categories.py b/hasaki_crawler_engine/hasaki_categories.py index c752ff3..d8557c8 100644 --- a/hasaki_crawler_engine/hasaki_categories.py +++ b/hasaki_crawler_engine/hasaki_categories.py @@ -3,11 +3,22 @@ import logging import time import psycopg2 import pandas as pd +from pyvirtualdisplay import Display from playwright.sync_api import sync_playwright from hasaki_db_writer import hasaki_db_writer from Util import translate_text_to_english +###### Looger ###### +logname = '/home/ubuntu/logs/hasaki_crawler.log' +logging.basicConfig(filename=logname, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + + + class HasakiCategories: @@ -31,9 +42,13 @@ class HasakiCategories: self.db_writer = hasaki_db_writer(config) + self.display = Display(visible=0, size=(800, 600)) + self.display.start() + def __del__(self): print("Closing connection.....") self.conn.close() + self.display.stop() def start_processing(self): diff --git a/hasaki_crawler_engine/hasaki_category_products.py b/hasaki_crawler_engine/hasaki_category_products.py index 67a143d..20dbceb 100644 --- a/hasaki_crawler_engine/hasaki_category_products.py +++ b/hasaki_crawler_engine/hasaki_category_products.py @@ -2,9 +2,19 @@ import logging import random import time import psycopg2 +from pyvirtualdisplay import Display from playwright.sync_api import sync_playwright from 
hasaki_db_writer import hasaki_db_writer from Util import translate_text_to_english + +###### Looger ###### +logname = '/home/ubuntu/logs/hasaki_crawler.log' +logging.basicConfig(filename=logname, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + class HasakiCategoryProducts: def __init__(self, config): logging.info("Initializing HasakiCategoryProducts........") @@ -26,9 +36,13 @@ class HasakiCategoryProducts: self.db_writer = hasaki_db_writer(config) + self.display = Display(visible=0, size=(800, 600)) + self.display.start() + def __del__(self): print("Closing connection.....") self.conn.close() + self.display.stop() def start_processing(self): diff --git a/hasaki_crawler_engine/hasaki_crawler.py b/hasaki_crawler_engine/hasaki_crawler.py index 7fc4315..0da5bfc 100644 --- a/hasaki_crawler_engine/hasaki_crawler.py +++ b/hasaki_crawler_engine/hasaki_crawler.py @@ -10,9 +10,13 @@ from hasaki_category_products import HasakiCategoryProducts from hasaki_product_info import HasakiProductInfo from email.message import EmailMessage -##### Looger ###### -format = "%(asctime)s: %(message)s" -logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") +###### Looger ###### +logname = '/home/ubuntu/logs/hasaki_crawler.log' +logging.basicConfig(filename=logname, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) config = {} @@ -26,6 +30,8 @@ def main(cur): hasaki_categories.start_processing() cur.execute(f"""update process_tracker set flag = 1 where process = 'category'""") + logging.info("Category collection completed........ 
Moving to collecting products list of the categories.......") + #time.sleep(60) cur.execute(f"""select flag from process_tracker where process = 'category_product'""") @@ -35,6 +41,8 @@ def main(cur): hasaki_category_products.start_processing() cur.execute(f"""update process_tracker set flag = 1 where process = 'category_product'""") + logging.info("Category products collection completed........ Moving to collecting product info.......") + #time.sleep(60) cur.execute(f"""select flag from process_tracker where process = 'product_info'""") @@ -48,6 +56,8 @@ def main(cur): cur.execute(f"""update process_tracker set flag = 0 where process = 'category_product'""") cur.execute(f"""update process_tracker set flag = 0 where process = 'product_info'""") + logging.info("Product info collection done. Stopping........") + diff --git a/hasaki_crawler_engine/hasaki_db_writer.py b/hasaki_crawler_engine/hasaki_db_writer.py index bf3b999..a34da84 100755 --- a/hasaki_crawler_engine/hasaki_db_writer.py +++ b/hasaki_crawler_engine/hasaki_db_writer.py @@ -2,8 +2,12 @@ import logging import psycopg2 ###### Looger ###### -format = "%(asctime)s: %(message)s" -logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S") +logname = '/home/ubuntu/logs/hasaki_crawler.log' +logging.basicConfig(filename=logname, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) class hasaki_db_writer: def __init__(self, config): diff --git a/hasaki_crawler_engine/hasaki_product_info.py b/hasaki_crawler_engine/hasaki_product_info.py index d2d9693..08daf40 100644 --- a/hasaki_crawler_engine/hasaki_product_info.py +++ b/hasaki_crawler_engine/hasaki_product_info.py @@ -12,12 +12,21 @@ from Util import translate_text_to_english from fake_useragent import UserAgent import time import random +from pyvirtualdisplay import Display from seleniumwire import webdriver from selenium.webdriver.chrome.service import 
Service from webdriver_manager.chrome import ChromeDriverManager import brotli import json +###### Looger ###### +logname = '/home/ubuntu/logs/hasaki_crawler.log' +logging.basicConfig(filename=logname, + filemode='a', + format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO) + class HasakiProductInfo: def __init__(self, config): logging.info("Initializing HasakiProductInfo") @@ -40,9 +49,13 @@ class HasakiProductInfo: self.db_writer = hasaki_db_writer(config) + self.display = Display(visible=0, size=(800, 600)) + self.display.start() + def __del__(self): print("Closing connection.....") self.conn.close() + self.display.stop() def start_processing(self): logging.info("Starting to collect product info from Hasaki........")