added Hasaki crawler

Shariar Imtiaz 2024-04-01 11:43:38 +04:00
parent d239129739
commit 959fd9a03e
6 changed files with 69 additions and 8 deletions

View File

@@ -9,8 +9,10 @@ from playwright.sync_api import sync_playwright
 from hasaki_db_writer import hasaki_db_writer
 from Util import translate_text_to_english

 ###### Looger ######
 logname = '/home/ubuntu/logs/hasaki_crawler.log'
+#logname = 'hasaki_crawler.log'
 logging.basicConfig(filename=logname,
                     filemode='a',
                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
@@ -48,7 +50,7 @@ class HasakiCategories:
     def __del__(self):
         print("Closing connection.....")
         self.conn.close()
-        self.display.stop()

     def start_processing(self):
@@ -62,6 +64,8 @@ class HasakiCategories:
         self.process_category(df)

+        self.display.stop()
+
     def process_category(self, category):
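The pattern in these hunks recurs in the other crawler classes below: self.display.stop() moves out of __del__ and to the end of start_processing(). Stopping the virtual display in a destructor is fragile, since __del__ may run late, during interpreter shutdown, or not at all. A minimal sketch of the revised lifecycle, assuming self.display is a pyvirtualdisplay Display (the Xvfb errors in the log below point that way); the try/finally is an extra safeguard beyond the commit's unconditional stop() call:

from pyvirtualdisplay import Display

class CrawlerStage:
    # Hypothetical stand-in for HasakiCategories / HasakiCategoryProducts /
    # HasakiProductInfo; illustrates only the display lifecycle.
    def __init__(self):
        self.display = Display(visible=0, size=(1920, 1080))
        self.display.start()

    def __del__(self):
        # The destructor now only reports; the display is stopped
        # deterministically while the object is still fully alive.
        print("Closing connection.....")

    def start_processing(self):
        try:
            pass  # crawling work goes here
        finally:
            self.display.stop()  # stop Xvfb at the end of processing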

View File

@@ -9,6 +9,7 @@ from Util import translate_text_to_english
 ###### Looger ######
 logname = '/home/ubuntu/logs/hasaki_crawler.log'
+#logname = 'hasaki_crawler.log'
 logging.basicConfig(filename=logname,
                     filemode='a',
                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
@@ -42,7 +43,6 @@ class HasakiCategoryProducts:
     def __del__(self):
         print("Closing connection.....")
         self.conn.close()
-        self.display.stop()

     def start_processing(self):
@@ -65,6 +65,8 @@ class HasakiCategoryProducts:
         self.get_product_list(urls = pages, categoryId = category[0])

+        self.display.stop()
+
     def find_top_search(self):
         with sync_playwright() as p:

View File

@@ -0,0 +1,44 @@
+2024-04-01 11:35:40,147 root INFO: Starting Hasaki Crawler.......
+2024-04-01 11:35:40,147 root INFO: Loading config file.......
+2024-04-01 11:35:40,147 root INFO: Config file loaded.......
+2024-04-01 11:35:40,147 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''}
+2024-04-01 11:35:40,151 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++
+2024-04-01 11:35:40,151 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)]
+2024-04-01 11:35:40,151 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories.......
+2024-04-01 11:35:40,151 root INFO: Category products collection completed........ Moving to collecting product info.......
+2024-04-01 11:35:40,151 root INFO: Initializing HasakiProductInfo
+2024-04-01 11:35:43,844 root INFO: Error:
+2024-04-01 11:35:43,844 root INFO: Error occurred. Please check config file or the internal SQLLITE DB. Exiting......
+2024-04-01 11:35:47,656 root INFO: Closing connection.....
+2024-04-01 11:37:18,979 root INFO: Starting Hasaki Crawler.......
+2024-04-01 11:37:18,979 root INFO: Loading config file.......
+2024-04-01 11:37:18,979 root INFO: Config file loaded.......
+2024-04-01 11:37:18,979 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''}
+2024-04-01 11:37:18,983 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++
+2024-04-01 11:37:18,983 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)]
+2024-04-01 11:37:18,983 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories.......
+2024-04-01 11:37:18,983 root INFO: Category products collection completed........ Moving to collecting product info.......
+2024-04-01 11:37:18,983 root INFO: Initializing HasakiProductInfo
+2024-04-01 11:37:21,796 root INFO: Closing connection.....
+2024-04-01 11:37:37,443 root INFO: Starting Hasaki Crawler.......
+2024-04-01 11:37:37,443 root INFO: Loading config file.......
+2024-04-01 11:37:37,444 root INFO: Config file loaded.......
+2024-04-01 11:37:37,444 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''}
+2024-04-01 11:37:37,447 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++
+2024-04-01 11:37:37,447 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)]
+2024-04-01 11:37:37,447 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories.......
+2024-04-01 11:37:37,447 root INFO: Category products collection completed........ Moving to collecting product info.......
+2024-04-01 11:37:37,447 root INFO: Initializing HasakiProductInfo
+2024-04-01 11:37:40,69 root ERROR: [Errno 2] No such file or directory: 'Xvfb'
+2024-04-01 11:37:40,72 root INFO: Closing connection.....
+2024-04-01 11:39:24,935 root INFO: Starting Hasaki Crawler.......
+2024-04-01 11:39:24,935 root INFO: Loading config file.......
+2024-04-01 11:39:24,935 root INFO: Config file loaded.......
+2024-04-01 11:39:24,935 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''}
+2024-04-01 11:39:24,939 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++
+2024-04-01 11:39:24,939 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)]
+2024-04-01 11:39:24,939 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories.......
+2024-04-01 11:39:24,939 root INFO: Category products collection completed........ Moving to collecting product info.......
+2024-04-01 11:39:24,939 root INFO: Initializing HasakiProductInfo
+2024-04-01 11:39:27,536 root ERROR: [Errno 2] No such file or directory: 'Xvfb'
+2024-04-01 11:39:27,538 root INFO: Closing connection.....
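The repeated [Errno 2] No such file or directory: 'Xvfb' entries mean pyvirtualdisplay could not find the Xvfb binary on the host (on Ubuntu it comes from the xvfb package). A small guard, not part of this commit, would turn that mid-run traceback into a clear startup failure:

import logging
import shutil
import sys

def ensure_xvfb():
    # pyvirtualdisplay shells out to the Xvfb executable; check for it on
    # PATH before any crawler stage tries to start a virtual display.
    if shutil.which("Xvfb") is None:
        logging.error("Xvfb not found. Install it first, e.g. 'sudo apt-get install xvfb'.")
        sys.exit(1)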

View File

@@ -10,15 +10,19 @@ from hasaki_category_products import HasakiCategoryProducts
 from hasaki_product_info import HasakiProductInfo
 from email.message import EmailMessage

+config = {}
+
 ###### Looger ######
 logname = '/home/ubuntu/logs/hasaki_crawler.log'
+#logname = 'hasaki_crawler.log'
 logging.basicConfig(filename=logname,
                     filemode='a',
                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
                     datefmt="%Y-%m-%d %H:%M:%S",
                     level=logging.INFO)

-config = {}
 def main(cur):
@@ -30,7 +34,7 @@ def main(cur):
     hasaki_categories.start_processing()
     cur.execute(f"""update process_tracker set flag = 1 where process = 'category'""")
-    logging.info("Category collection completed........ Moving to collecting products ;ist of the categories.......")
+    logging.info("Category collection completed........ Moving to collecting products list of the categories.......")

     #time.sleep(60)
@@ -66,8 +70,8 @@ def send_mail(msg):
     EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
     EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
     From = 'data_reporting@raenabeauty.com'
-    To = 'shariar@raenabeauty.com, data_reporting@raenabeauty.com'
-    #To = 'shariar@raenabeauty.com'
+    #To = 'shariar@raenabeauty.com, data_reporting@raenabeauty.com'
+    To = 'shariar@raenabeauty.com'

     html = f'''
     <!DOCTYPE html>
@@ -188,7 +192,10 @@ if __name__ == "__main__":
     init_tracker_tab(cur)

-    main(cur)
+    try:
+        main(cur)
+    except Exception as e:
+        logging.error(e)

     cur.close()
     conn.close()
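The new try/except only logs the exception; since this module already defines send_mail(), a natural extension (a sketch, not what the commit does) is to push the failure into that alert path while still closing the connection:

import logging
import traceback

def run_safely(cur, conn):
    # Hypothetical wrapper around this module's main(); extends the commit's
    # bare logging.error(e) with a full traceback and the existing mail alert.
    try:
        main(cur)
    except Exception as exc:
        logging.error(traceback.format_exc())
        send_mail(f"Hasaki crawler failed: {exc}")
    finally:
        cur.close()
        conn.close()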

View File

@@ -3,6 +3,7 @@ import psycopg2
 ###### Looger ######
 logname = '/home/ubuntu/logs/hasaki_crawler.log'
+#logname = 'hasaki_crawler.log'
 logging.basicConfig(filename=logname,
                     filemode='a',
                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
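The same commented-out local logname toggle is duplicated in every module touched by this commit; an environment-variable fallback (an alternative sketch, not what the commit does, with HASAKI_LOG as a hypothetical variable) would keep a single code path:

import logging
import os

# Use the server log path from HASAKI_LOG when set, else a local file.
logname = os.environ.get("HASAKI_LOG", "hasaki_crawler.log")
logging.basicConfig(filename=logname,
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)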

View File

@@ -21,6 +21,7 @@ import json
 ###### Looger ######
 logname = '/home/ubuntu/logs/hasaki_crawler.log'
+#logname = 'hasaki_crawler.log'
 logging.basicConfig(filename=logname,
                     filemode='a',
                     format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
@@ -55,7 +56,7 @@ class HasakiProductInfo:
     def __del__(self):
         print("Closing connection.....")
         self.conn.close()
-        self.display.stop()

     def start_processing(self):
         logging.info("Starting to collect product info from Hasaki........")
@@ -90,6 +91,8 @@ class HasakiProductInfo:
             #time.sleep(random.randint(7, 23))

+        self.display.stop()
+
     def get_product_info(self, data):