added Hasaki crawler
This commit is contained in:
parent
d239129739
commit
959fd9a03e
|
@ -9,8 +9,10 @@ from playwright.sync_api import sync_playwright
|
||||||
from hasaki_db_writer import hasaki_db_writer
|
from hasaki_db_writer import hasaki_db_writer
|
||||||
from Util import translate_text_to_english
|
from Util import translate_text_to_english
|
||||||
|
|
||||||
|
|
||||||
###### Logger ######
|
###### Logger ######
|
||||||
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
||||||
|
#logname = 'hasaki_crawler.log'
|
||||||
logging.basicConfig(filename=logname,
|
logging.basicConfig(filename=logname,
|
||||||
filemode='a',
|
filemode='a',
|
||||||
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
@ -48,7 +50,7 @@ class HasakiCategories:
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
print("Closing connection.....")
|
print("Closing connection.....")
|
||||||
self.conn.close()
|
self.conn.close()
|
||||||
self.display.stop()
|
|
||||||
|
|
||||||
def start_processing(self):
|
def start_processing(self):
|
||||||
|
|
||||||
|
@ -62,6 +64,8 @@ class HasakiCategories:
|
||||||
|
|
||||||
self.process_category(df)
|
self.process_category(df)
|
||||||
|
|
||||||
|
self.display.stop()
|
||||||
|
|
||||||
|
|
||||||
def process_category(self, category):
|
def process_category(self, category):
|
||||||
|
|
||||||
|
|
|
@ -9,6 +9,7 @@ from Util import translate_text_to_english
|
||||||
|
|
||||||
###### Logger ######
|
###### Logger ######
|
||||||
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
||||||
|
#logname = 'hasaki_crawler.log'
|
||||||
logging.basicConfig(filename=logname,
|
logging.basicConfig(filename=logname,
|
||||||
filemode='a',
|
filemode='a',
|
||||||
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
@ -42,7 +43,6 @@ class HasakiCategoryProducts:
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
print("Closing connection.....")
|
print("Closing connection.....")
|
||||||
self.conn.close()
|
self.conn.close()
|
||||||
self.display.stop()
|
|
||||||
|
|
||||||
def start_processing(self):
|
def start_processing(self):
|
||||||
|
|
||||||
|
@ -65,6 +65,8 @@ class HasakiCategoryProducts:
|
||||||
|
|
||||||
self.get_product_list(urls = pages, categoryId = category[0])
|
self.get_product_list(urls = pages, categoryId = category[0])
|
||||||
|
|
||||||
|
self.display.stop()
|
||||||
|
|
||||||
|
|
||||||
def find_top_search(self):
|
def find_top_search(self):
|
||||||
with sync_playwright() as p:
|
with sync_playwright() as p:
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
2024-04-01 11:35:40,147 root INFO: Starting Hasaki Crawler.......
|
||||||
|
2024-04-01 11:35:40,147 root INFO: Loading config file.......
|
||||||
|
2024-04-01 11:35:40,147 root INFO: Config file loaded.......
|
||||||
|
2024-04-01 11:35:40,147 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''}
|
||||||
|
2024-04-01 11:35:40,151 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++
|
||||||
|
2024-04-01 11:35:40,151 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)]
|
||||||
|
2024-04-01 11:35:40,151 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories.......
|
||||||
|
2024-04-01 11:35:40,151 root INFO: Category products collection completed........ Moving to collecting product info.......
|
||||||
|
2024-04-01 11:35:40,151 root INFO: Initializing HasakiProductInfo
|
||||||
|
2024-04-01 11:35:43,844 root INFO: Error:
|
||||||
|
2024-04-01 11:35:43,844 root INFO: Error occurred. Please check config file or the internal SQLLITE DB. Exiting......
|
||||||
|
2024-04-01 11:35:47,656 root INFO: Closing connection.....
|
||||||
|
2024-04-01 11:37:18,979 root INFO: Starting Hasaki Crawler.......
|
||||||
|
2024-04-01 11:37:18,979 root INFO: Loading config file.......
|
||||||
|
2024-04-01 11:37:18,979 root INFO: Config file loaded.......
|
||||||
|
2024-04-01 11:37:18,979 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''}
|
||||||
|
2024-04-01 11:37:18,983 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++
|
||||||
|
2024-04-01 11:37:18,983 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)]
|
||||||
|
2024-04-01 11:37:18,983 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories.......
|
||||||
|
2024-04-01 11:37:18,983 root INFO: Category products collection completed........ Moving to collecting product info.......
|
||||||
|
2024-04-01 11:37:18,983 root INFO: Initializing HasakiProductInfo
|
||||||
|
2024-04-01 11:37:21,796 root INFO: Closing connection.....
|
||||||
|
2024-04-01 11:37:37,443 root INFO: Starting Hasaki Crawler.......
|
||||||
|
2024-04-01 11:37:37,443 root INFO: Loading config file.......
|
||||||
|
2024-04-01 11:37:37,444 root INFO: Config file loaded.......
|
||||||
|
2024-04-01 11:37:37,444 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''}
|
||||||
|
2024-04-01 11:37:37,447 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++
|
||||||
|
2024-04-01 11:37:37,447 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)]
|
||||||
|
2024-04-01 11:37:37,447 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories.......
|
||||||
|
2024-04-01 11:37:37,447 root INFO: Category products collection completed........ Moving to collecting product info.......
|
||||||
|
2024-04-01 11:37:37,447 root INFO: Initializing HasakiProductInfo
|
||||||
|
2024-04-01 11:37:40,69 root ERROR: [Errno 2] No such file or directory: 'Xvfb'
|
||||||
|
2024-04-01 11:37:40,72 root INFO: Closing connection.....
|
||||||
|
2024-04-01 11:39:24,935 root INFO: Starting Hasaki Crawler.......
|
||||||
|
2024-04-01 11:39:24,935 root INFO: Loading config file.......
|
||||||
|
2024-04-01 11:39:24,935 root INFO: Config file loaded.......
|
||||||
|
2024-04-01 11:39:24,935 root INFO: {'crawler_name': 'raena_crawler_engine_hasaki', 'crawler_schema': 'raena_spider_management', 'category_tab': 'rce_category', 'tracker_tab': 'crawler_tracker_hasaki', 'product_tab': 'rce_product', 'variant_tab': 'rce_product_variant', 'brand_tab': 'rce_brand', 'reseller_tab': 'rce_reseller', 'reseller_store_tab': 'rce_reseller_store', 'review_tab': 'rce_ratings_reviews', 'review_productmodels_tab': 'rce_ratings_reviews_productmodels', 'review_producttags_tab': 'rce_ratings_reviews_producttags', 'review_tags': 'rce_tags', 'source_tab': 'rce_source', 'seo_tab': 'rce_seo', 'product_per_category': '1000', 'source_category': '11043145', 'db_user': 'dbadmin', 'db_pass': '5qCif6eyY3Kmg4z', 'database': 'analytics', 'db_host': 'redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com', 'db_port': '5439', 'crawler_main': '1', 'crawler_slave_no': ''}
|
||||||
|
2024-04-01 11:39:24,939 root INFO: ++++++++++++++++ process tracker tab status ++++++++++++++++++++++
|
||||||
|
2024-04-01 11:39:24,939 root INFO: [('category', 1), ('category_product', 1), ('product_info', 0)]
|
||||||
|
2024-04-01 11:39:24,939 root INFO: Category collection completed........ Moving to collecting products ;ist of the categories.......
|
||||||
|
2024-04-01 11:39:24,939 root INFO: Category products collection completed........ Moving to collecting product info.......
|
||||||
|
2024-04-01 11:39:24,939 root INFO: Initializing HasakiProductInfo
|
||||||
|
2024-04-01 11:39:27,536 root ERROR: [Errno 2] No such file or directory: 'Xvfb'
|
||||||
|
2024-04-01 11:39:27,538 root INFO: Closing connection.....
|
|
@ -10,15 +10,19 @@ from hasaki_category_products import HasakiCategoryProducts
|
||||||
from hasaki_product_info import HasakiProductInfo
|
from hasaki_product_info import HasakiProductInfo
|
||||||
from email.message import EmailMessage
|
from email.message import EmailMessage
|
||||||
|
|
||||||
|
config = {}
|
||||||
|
|
||||||
|
|
||||||
###### Logger ######
|
###### Logger ######
|
||||||
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
||||||
|
#logname = 'hasaki_crawler.log'
|
||||||
logging.basicConfig(filename=logname,
|
logging.basicConfig(filename=logname,
|
||||||
filemode='a',
|
filemode='a',
|
||||||
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
datefmt="%Y-%m-%d %H:%M:%S",
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
level=logging.INFO)
|
level=logging.INFO)
|
||||||
|
|
||||||
config = {}
|
|
||||||
|
|
||||||
|
|
||||||
def main(cur):
|
def main(cur):
|
||||||
|
@ -30,7 +34,7 @@ def main(cur):
|
||||||
hasaki_categories.start_processing()
|
hasaki_categories.start_processing()
|
||||||
cur.execute(f"""update process_tracker set flag = 1 where process = 'category'""")
|
cur.execute(f"""update process_tracker set flag = 1 where process = 'category'""")
|
||||||
|
|
||||||
logging.info("Category collection completed........ Moving to collecting products ;ist of the categories.......")
|
logging.info("Category collection completed........ Moving to collecting products list of the categories.......")
|
||||||
|
|
||||||
#time.sleep(60)
|
#time.sleep(60)
|
||||||
|
|
||||||
|
@ -66,8 +70,8 @@ def send_mail(msg):
|
||||||
EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
|
EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
|
||||||
EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
|
EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
|
||||||
From = 'data_reporting@raenabeauty.com'
|
From = 'data_reporting@raenabeauty.com'
|
||||||
To = 'shariar@raenabeauty.com, data_reporting@raenabeauty.com'
|
#To = 'shariar@raenabeauty.com, data_reporting@raenabeauty.com'
|
||||||
#To = 'shariar@raenabeauty.com'
|
To = 'shariar@raenabeauty.com'
|
||||||
|
|
||||||
html = f'''
|
html = f'''
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
|
@ -188,7 +192,10 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
init_tracker_tab(cur)
|
init_tracker_tab(cur)
|
||||||
|
|
||||||
|
try:
|
||||||
main(cur)
|
main(cur)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(e)
|
||||||
|
|
||||||
cur.close()
|
cur.close()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
|
@ -3,6 +3,7 @@ import psycopg2
|
||||||
|
|
||||||
###### Logger ######
|
###### Logger ######
|
||||||
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
||||||
|
#logname = 'hasaki_crawler.log'
|
||||||
logging.basicConfig(filename=logname,
|
logging.basicConfig(filename=logname,
|
||||||
filemode='a',
|
filemode='a',
|
||||||
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
|
|
@ -21,6 +21,7 @@ import json
|
||||||
|
|
||||||
###### Logger ######
|
###### Logger ######
|
||||||
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
||||||
|
#logname = 'hasaki_crawler.log'
|
||||||
logging.basicConfig(filename=logname,
|
logging.basicConfig(filename=logname,
|
||||||
filemode='a',
|
filemode='a',
|
||||||
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
@ -55,7 +56,7 @@ class HasakiProductInfo:
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
print("Closing connection.....")
|
print("Closing connection.....")
|
||||||
self.conn.close()
|
self.conn.close()
|
||||||
self.display.stop()
|
|
||||||
|
|
||||||
def start_processing(self):
|
def start_processing(self):
|
||||||
logging.info("Starting to collect product info from Hasaki........")
|
logging.info("Starting to collect product info from Hasaki........")
|
||||||
|
@ -90,6 +91,8 @@ class HasakiProductInfo:
|
||||||
|
|
||||||
#time.sleep(random.randint(7, 23))
|
#time.sleep(random.randint(7, 23))
|
||||||
|
|
||||||
|
self.display.stop()
|
||||||
|
|
||||||
|
|
||||||
def get_product_info(self, data):
|
def get_product_info(self, data):
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue