From 959fd9a03e29c8a3782f70a900658848eac79668 Mon Sep 17 00:00:00 2001 From: "shariar@raenabeauty.com" Date: Mon, 1 Apr 2024 11:43:38 +0400 Subject: [PATCH] added Hasaki crawler --- hasaki_crawler_engine/hasaki_categories.py | 6 ++- .../hasaki_category_products.py | 4 +- hasaki_crawler_engine/hasaki_crawler.log | 44 +++++++++++++++++++ hasaki_crawler_engine/hasaki_crawler.py | 17 ++++--- hasaki_crawler_engine/hasaki_db_writer.py | 1 + hasaki_crawler_engine/hasaki_product_info.py | 5 ++- 6 files changed, 69 insertions(+), 8 deletions(-) create mode 100644 hasaki_crawler_engine/hasaki_crawler.log diff --git a/hasaki_crawler_engine/hasaki_categories.py b/hasaki_crawler_engine/hasaki_categories.py index d8557c8..6dcbadf 100644 --- a/hasaki_crawler_engine/hasaki_categories.py +++ b/hasaki_crawler_engine/hasaki_categories.py @@ -9,8 +9,10 @@ from playwright.sync_api import sync_playwright from hasaki_db_writer import hasaki_db_writer from Util import translate_text_to_english + ###### Looger ###### logname = '/home/ubuntu/logs/hasaki_crawler.log' +#logname = 'hasaki_crawler.log' logging.basicConfig(filename=logname, filemode='a', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', @@ -48,7 +50,7 @@ class HasakiCategories: def __del__(self): print("Closing connection.....") self.conn.close() - self.display.stop() + def start_processing(self): @@ -62,6 +64,8 @@ class HasakiCategories: self.process_category(df) + self.display.stop() + def process_category(self, category): diff --git a/hasaki_crawler_engine/hasaki_category_products.py b/hasaki_crawler_engine/hasaki_category_products.py index 20dbceb..cadf1f2 100644 --- a/hasaki_crawler_engine/hasaki_category_products.py +++ b/hasaki_crawler_engine/hasaki_category_products.py @@ -9,6 +9,7 @@ from Util import translate_text_to_english ###### Looger ###### logname = '/home/ubuntu/logs/hasaki_crawler.log' +#logname = 'hasaki_crawler.log' logging.basicConfig(filename=logname, filemode='a', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', @@ -42,7 +43,6 @@ class HasakiCategoryProducts: def __del__(self): print("Closing connection.....") self.conn.close() - self.display.stop() def start_processing(self): @@ -65,6 +65,8 @@ class HasakiCategoryProducts: self.get_product_list(urls = pages, categoryId = category[0]) + self.display.stop() + def find_top_search(self): with sync_playwright() as p: diff --git a/hasaki_crawler_engine/hasaki_crawler.log b/hasaki_crawler_engine/hasaki_crawler.log new file mode 100644 index 0000000..e0f6f84 --- /dev/null +++ b/hasaki_crawler_engine/hasaki_crawler.log @@ -0,0 +1,44 @@ +2024-04-01 11:35:40,147 root INFO: Starting Hasaki Crawler....... +2024-04-01 11:35:40,147 root INFO: Loading config file....... +2024-04-01 11:35:40,147 root INFO: Config file loaded....... +2024-04-01 11:35:40,147 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''} +2024-04-01 11:35:40,151 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++ +2024-04-01 11:35:40,151 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)] +2024-04-01 11:35:40,151 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories....... +2024-04-01 11:35:40,151 root INFO: Category products collection completed........ Moving to collecting product info....... +2024-04-01 11:35:40,151 root INFO: Initializing HasakiProductInfo +2024-04-01 11:35:43,844 root INFO: Error: +2024-04-01 11:35:43,844 root INFO: Error occurred. Please check config file or the internal SQLLITE DB. Exiting...... +2024-04-01 11:35:47,656 root INFO: Closing connection..... +2024-04-01 11:37:18,979 root INFO: Starting Hasaki Crawler....... +2024-04-01 11:37:18,979 root INFO: Loading config file....... +2024-04-01 11:37:18,979 root INFO: Config file loaded....... +2024-04-01 11:37:18,979 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''} +2024-04-01 11:37:18,983 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++ +2024-04-01 11:37:18,983 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)] +2024-04-01 11:37:18,983 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories....... +2024-04-01 11:37:18,983 root INFO: Category products collection completed........ Moving to collecting product info....... +2024-04-01 11:37:18,983 root INFO: Initializing HasakiProductInfo +2024-04-01 11:37:21,796 root INFO: Closing connection..... +2024-04-01 11:37:37,443 root INFO: Starting Hasaki Crawler....... +2024-04-01 11:37:37,443 root INFO: Loading config file....... +2024-04-01 11:37:37,444 root INFO: Config file loaded....... +2024-04-01 11:37:37,444 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''} +2024-04-01 11:37:37,447 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++ +2024-04-01 11:37:37,447 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)] +2024-04-01 11:37:37,447 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories....... +2024-04-01 11:37:37,447 root INFO: Category products collection completed........ Moving to collecting product info....... +2024-04-01 11:37:37,447 root INFO: Initializing HasakiProductInfo +2024-04-01 11:37:40,69 root ERROR: [Errno 2] No such file or directory: 'Xvfb' +2024-04-01 11:37:40,72 root INFO: Closing connection..... +2024-04-01 11:39:24,935 root INFO: Starting Hasaki Crawler....... +2024-04-01 11:39:24,935 root INFO: Loading config file....... +2024-04-01 11:39:24,935 root INFO: Config file loaded....... +2024-04-01 11:39:24,935 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''} +2024-04-01 11:39:24,939 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++ +2024-04-01 11:39:24,939 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)] +2024-04-01 11:39:24,939 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories....... +2024-04-01 11:39:24,939 root INFO: Category products collection completed........ Moving to collecting product info....... +2024-04-01 11:39:24,939 root INFO: Initializing HasakiProductInfo +2024-04-01 11:39:27,536 root ERROR: [Errno 2] No such file or directory: 'Xvfb' +2024-04-01 11:39:27,538 root INFO: Closing connection..... diff --git a/hasaki_crawler_engine/hasaki_crawler.py b/hasaki_crawler_engine/hasaki_crawler.py index 0da5bfc..b728b41 100644 --- a/hasaki_crawler_engine/hasaki_crawler.py +++ b/hasaki_crawler_engine/hasaki_crawler.py @@ -10,15 +10,19 @@ from hasaki_category_products import HasakiCategoryProducts from hasaki_product_info import HasakiProductInfo from email.message import EmailMessage +config = {} + + ###### Looger ###### logname = '/home/ubuntu/logs/hasaki_crawler.log' +#logname = 'hasaki_crawler.log' logging.basicConfig(filename=logname, filemode='a', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) -config = {} + def main(cur): @@ -30,7 +34,7 @@ def main(cur): hasaki_categories.start_processing() cur.execute(f"""update process_tracker set flag = 1 where process = 'category'""") - logging.info("Category collection completed........ Moving to collecting products ;ist of the categories.......") + logging.info("Category collection completed........ Moving to collecting products list of the categories.......") #time.sleep(60) @@ -66,8 +70,8 @@ def send_mail(msg): EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5" EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh" From = 'data_reporting@raenabeauty.com' - To = 'shariar@raenabeauty.com, data_reporting@raenabeauty.com' - #To = 'shariar@raenabeauty.com' + #To = 'shariar@raenabeauty.com, data_reporting@raenabeauty.com' + To = 'shariar@raenabeauty.com' html = f''' @@ -188,7 +192,10 @@ if __name__ == "__main__": init_tracker_tab(cur) - main(cur) + try: + main(cur) + except Exception as e: + logging.error(e) cur.close() conn.close() diff --git a/hasaki_crawler_engine/hasaki_db_writer.py b/hasaki_crawler_engine/hasaki_db_writer.py index a34da84..f9b3d32 100755 --- a/hasaki_crawler_engine/hasaki_db_writer.py +++ b/hasaki_crawler_engine/hasaki_db_writer.py @@ -3,6 +3,7 @@ import psycopg2 ###### Looger ###### logname = '/home/ubuntu/logs/hasaki_crawler.log' +#logname = 'hasaki_crawler.log' logging.basicConfig(filename=logname, filemode='a', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', diff --git a/hasaki_crawler_engine/hasaki_product_info.py b/hasaki_crawler_engine/hasaki_product_info.py index 08daf40..a418e6d 100644 --- a/hasaki_crawler_engine/hasaki_product_info.py +++ b/hasaki_crawler_engine/hasaki_product_info.py @@ -21,6 +21,7 @@ import json ###### Looger ###### logname = '/home/ubuntu/logs/hasaki_crawler.log' +#logname = 'hasaki_crawler.log' logging.basicConfig(filename=logname, filemode='a', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', @@ -55,7 +56,7 @@ class HasakiProductInfo: def __del__(self): print("Closing connection.....") self.conn.close() - self.display.stop() + def start_processing(self): logging.info("Starting to collect product info from Hasaki........") @@ -90,6 +91,8 @@ class HasakiProductInfo: #time.sleep(random.randint(7, 23)) + self.display.stop() + def get_product_info(self, data):