"""Entry point for the Hasaki crawler.

Runs the three crawl stages (categories, category products, product info) in
order. Per-stage completion flags are kept in a local SQLite table
(``process_tracker``) so that a stage already flagged as done is skipped on
the next invocation. After the stages have been attempted, a status summary
is read from Postgres and sent by e-mail.
"""
import json
import logging
import smtplib
import sqlite3
import time
from email.message import EmailMessage

import psycopg2

from hasaki_categories import HasakiCategories
from hasaki_category_products import HasakiCategoryProducts
from hasaki_product_info import HasakiProductInfo

# Populated from conf.json by the script entry point; read by the crawler
# classes and by get_status().
config = {}

###### Logger ######
logging.basicConfig(filename="/home/ubuntu/logs/hasaki_crawler.log",
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
                    datefmt="%Y-%m-%d %H:%M:%S",
                    level=logging.INFO)

# Stage names as stored in the ``process`` column of process_tracker.
_PROCESSES = ("category", "category_product", "product_info")


def _is_pending(cur, process):
    """Return True when the tracker flag for *process* is 0 (stage not yet run)."""
    cur.execute("select flag from process_tracker where process = ?", (process,))
    return cur.fetchone()[0] == 0


def _set_flag(cur, process, flag):
    """Store *flag* as the tracker flag of *process*."""
    cur.execute("update process_tracker set flag = ? where process = ?", (flag, process))


def main(cur):
    """Run every crawl stage whose tracker flag is still 0.

    Each stage's flag is set to 1 once its ``start_processing()`` returns.
    When the product-info stage is already flagged as done, all three flags
    are reset to 0 instead, so that the following invocation starts a fresh
    cycle.

    Args:
        cur: Cursor on the SQLite database holding ``process_tracker``
            (the table must already contain a row per stage, see
            ``init_tracker_tab``).
    """
    if _is_pending(cur, "category"):
        HasakiCategories(config).start_processing()
        _set_flag(cur, "category", 1)
        logging.info("Category collection completed........ Moving to collecting products list of the categories.......")

    if _is_pending(cur, "category_product"):
        HasakiCategoryProducts(config).start_processing()
        _set_flag(cur, "category_product", 1)
        logging.info("Category products collection completed........ Moving to collecting product info.......")

    if _is_pending(cur, "product_info"):
        HasakiProductInfo(config).start_processing()
        _set_flag(cur, "product_info", 1)
    else:
        # The whole cycle had already finished: re-arm every stage.
        for process in _PROCESSES:
            _set_flag(cur, process, 0)

    logging.info("Product info collection done. Stopping........")


def send_mail(msg):
    """E-mail *msg* as the body of a 'Hasaki Crawler Status' message.

    Sending is best-effort: any failure is logged (with traceback) and
    swallowed, so the caller never sees an exception from here.

    Args:
        msg: Text/HTML fragment embedded in the mail body.
    """
    try:
        # SECURITY NOTE(review): SMTP credentials are hard-coded in source.
        # They should be rotated and loaded from configuration or the
        # environment; kept unchanged here to preserve current behaviour.
        EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
        EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"

        From = 'data_reporting@raenabeauty.com'
        To = 'shariar@raenabeauty.com'

        html = f'''

Hasaki Crawler Status

{msg}

This is system generated mail. Please do not reply.

'''

        # Separate name so the ``msg`` argument is not shadowed.
        mail = EmailMessage()
        mail['Subject'] = 'Hasaki Crawler Status'
        mail['From'] = From
        mail['To'] = To
        mail.set_content(html, subtype='html')

        with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp:
            smtp.ehlo()
            smtp.starttls()
            smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
            smtp.send_message(mail)
    except Exception as e:
        logging.exception("Error while sending mail: %s", e)


def init_tracker_tab(cur):
    """Create ``process_tracker`` if needed and make sure every stage has a row.

    Missing stage rows are inserted with flag 0; existing rows are left
    untouched. The resulting table content is written to the log.

    Args:
        cur: Cursor on the SQLite tracker database.
    """
    cur.execute("""CREATE TABLE IF NOT EXISTS process_tracker (
                        process TEXT,
                        flag int
                    )""")

    for process in _PROCESSES:
        cur.execute("select * from process_tracker where process = ?", (process,))
        if cur.fetchone() is None:
            cur.execute("insert into process_tracker (process, flag) values(?, 0)", (process,))

    logging.info("++++++++++++++++ process tracker tab status ++++++++++++++++++++++")
    cur.execute("select * from process_tracker")
    logging.info(cur.fetchall())


def _fetch_count(cur, query):
    """Execute *query* and return the first column of its first row."""
    cur.execute(query)
    return cur.fetchone()[0]


def get_status():
    """Build the completion message with crawl counts read from Postgres.

    Connection parameters come from the module-level ``config``. The
    connection and cursor are always closed, even when a query fails.

    Returns:
        str: HTML fragment with the category count and the total / flag = 1 /
        flag = 0 row counts of ``crawler_tracker_hasaki``.

    Raises:
        psycopg2.Error: if connecting or querying fails.
    """
    conn = psycopg2.connect(database=config.get('database'),
                            user=config.get('db_user'),
                            password=config.get('db_pass'),
                            host=config.get('db_host'),
                            port=config.get('db_port'))
    try:
        conn.autocommit = True
        cur = conn.cursor()
        try:
            cat_count = _fetch_count(
                cur,
                """select count(1) from raena_spider_management.rce_category
                   where rce_source_id = (select id from raena_spider_management.rce_source
                                          where source_name = 'Hasaki')""")
            product_total = _fetch_count(
                cur, "select count(1) from raena_spider_management.crawler_tracker_hasaki")
            product_successful = _fetch_count(
                cur, "select count(1) from raena_spider_management.crawler_tracker_hasaki where flag = 1")
            product_failed = _fetch_count(
                cur, "select count(1) from raena_spider_management.crawler_tracker_hasaki where flag = 0")
        finally:
            cur.close()
    finally:
        conn.close()

    # The counts were previously computed but never reported; include them so
    # the "status below" the message announces is actually present.
    return f"""

Hasaki Crawler run is completed. Please check the status below,

<table border="1" cellpadding="4">
<tr><td>Categories</td><td>{cat_count}</td></tr>
<tr><td>Products total</td><td>{product_total}</td></tr>
<tr><td>Products successful</td><td>{product_successful}</td></tr>
<tr><td>Products failed</td><td>{product_failed}</td></tr>
</table>

"""


if __name__ == "__main__":
    logging.info("Starting Hasaki Crawler.......")

    try:
        logging.info("Loading config file.......")
        with open("conf.json", "r") as jsonfile:
            config = json.load(jsonfile)
        logging.info("Config file loaded.......")
        # Log only the key names: the values include database credentials.
        logging.info("Config keys: %s", list(config))

        conn = sqlite3.connect('process_tracker.db')
        conn.isolation_level = None  # autocommit: every flag update is persisted immediately
        cur = conn.cursor()
        try:
            init_tracker_tab(cur)
            try:
                main(cur)
            except Exception as e:
                # A failing stage must not prevent the status mail below.
                logging.exception("Crawler stage failed: %s", e)
        finally:
            cur.close()
            conn.close()

        send_mail(get_status())
    except Exception as e:
        logging.exception("Error: %s", e)
        logging.info("Error occurred. Please check config file or the internal SQLLITE DB. Exiting......")
        send_mail("Error occurred. Please check config file or the internal SQLLITE DB.")
        raise SystemExit(1)