raena-crawler-engine/hasaki_crawler_engine/hasaki_crawler.py

205 lines
7.8 KiB
Python
Raw Normal View History

2024-03-14 05:16:59 +00:00
import logging
import json
import time
2024-03-14 09:32:49 +00:00
import smtplib
2024-03-27 07:01:53 +00:00
import sqlite3
2024-04-01 06:46:40 +00:00
import psycopg2
2024-03-14 05:16:59 +00:00
from hasaki_categories import HasakiCategories
from hasaki_category_products import HasakiCategoryProducts
from hasaki_product_info import HasakiProductInfo
2024-03-14 09:32:49 +00:00
from email.message import EmailMessage
2024-03-14 05:16:59 +00:00
2024-04-01 07:31:33 +00:00
###### Logger ######
# All crawler log records are appended (filemode 'a') to a single file.
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(
    level=logging.INFO,
    filename=logname,
    filemode='a',
    datefmt="%Y-%m-%d %H:%M:%S",
    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
)

# Crawler settings shared by the functions below; the script entry point
# rebinds this to the parsed contents of conf.json.
config = {}
2024-03-27 07:01:53 +00:00
def _stage_pending(cur, process):
    """Return True when the tracker flag for ``process`` is 0 (stage not yet done)."""
    cur.execute("select flag from process_tracker where process = ?", (process,))
    row = cur.fetchone()
    if row is None:
        # Without this guard the caller would fail with an opaque
        # "'NoneType' object is not subscriptable".
        raise LookupError(
            f"process_tracker has no row for {process!r}; run init_tracker_tab() first"
        )
    return row[0] == 0


def _set_stage_flag(cur, process, flag):
    """Store ``flag`` for ``process`` in the process_tracker table."""
    cur.execute("update process_tracker set flag = ? where process = ?", (flag, process))


def main(cur):
    """Run the three crawl stages in order, skipping stages already flagged as done.

    Each stage (category, category_product, product_info) is executed only
    when its flag in ``process_tracker`` is 0, and its flag is set to 1 once
    ``start_processing()`` returns. If the product_info stage is already
    flagged when it is reached, all three flags are reset to 0 instead.

    Args:
        cur: sqlite3 cursor on the tracker database (the queries use ``?``
            placeholders). The table must already hold one row per stage.

    Raises:
        LookupError: If a stage has no row in ``process_tracker``.
    """
    if _stage_pending(cur, 'category'):
        HasakiCategories(config).start_processing()
        _set_stage_flag(cur, 'category', 1)

    logging.info("Category collection completed........ Moving to collecting products ;ist of the categories.......")

    if _stage_pending(cur, 'category_product'):
        HasakiCategoryProducts(config).start_processing()
        _set_stage_flag(cur, 'category_product', 1)

    logging.info("Category products collection completed........ Moving to collecting product info.......")

    if _stage_pending(cur, 'product_info'):
        HasakiProductInfo(config).start_processing()
        _set_stage_flag(cur, 'product_info', 1)
    else:
        # NOTE(review): after a fully successful run all flags stay at 1, so
        # the next invocation only performs this reset and crawls nothing.
        # Confirm that the "every other run" behaviour is intended.
        for process in ('category', 'category_product', 'product_info'):
            _set_stage_flag(cur, process, 0)

    logging.info("Product info collection done. Stopping........")
2024-03-14 05:16:59 +00:00
2024-03-14 09:32:49 +00:00
def send_mail(msg):
    """Send the "Hasaki Crawler Status" HTML mail over SMTP.

    Best effort: any exception raised while building or sending the message
    is logged (with traceback) and swallowed, so this function never raises.

    Args:
        msg: HTML fragment (or plain text) placed inside the mail template.
    """
    try:
        # NOTE(review): these SMTP credentials are committed to source control.
        # They are kept only as a fallback so existing deployments keep
        # sending mail; rotate them and provide "smtp_user" / "smtp_password"
        # in conf.json instead.
        settings = config if isinstance(config, dict) else {}
        smtp_user = settings.get('smtp_user', "AKIAR2YL57QC6NITTJN5")
        smtp_password = settings.get('smtp_password', "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh")
        sender = 'data_reporting@raenabeauty.com'
        recipients = 'shariar@raenabeauty.com, data_reporting@raenabeauty.com'

        # The heading style previously read "color#454349" (missing colon),
        # which mail clients discard as an invalid declaration.
        html = f'''
        <!DOCTYPE html>
        <html>
        <body>
        <div style="background-color:#eee;padding:10px 20px;">
        <h2 style="font-family:Georgia, 'Times New Roman', Times, serif;color:#454349;">Hasaki Crawler Status</h2>
        </div>
        <div style="padding:20px 0px">
        <div style="height: 800px;width:800px">
        {msg}
        <div style="text-align:Left;">
        <p>This is system generated mail. Please do not reply.</p>
        </div>
        </div>
        </div>
        </body>
        </html>
        '''

        # Separate name so the ``msg`` argument is not shadowed by the message object.
        message = EmailMessage()
        message['Subject'] = 'Hasaki Crawler Status'
        message['From'] = sender
        message['To'] = recipients
        message.set_content(html, subtype='html')

        with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp:
            smtp.ehlo()
            smtp.starttls()
            smtp.login(smtp_user, smtp_password)
            smtp.send_message(message)
    except Exception as e:
        logging.exception("Error while sending mail: %s", e)
2024-03-27 07:01:53 +00:00
def init_tracker_tab(cur):
    """Create the process_tracker table if needed and seed one row per crawl stage.

    A row with flag 0 is inserted for each of 'category', 'category_product'
    and 'product_info' only when no row for that stage exists, so flags left
    by an earlier run are preserved. The table contents are logged afterwards.

    Args:
        cur: sqlite3 cursor on the tracker database (the queries use ``?``
            placeholders).
    """
    cur.execute("""CREATE TABLE IF NOT EXISTS process_tracker (
        process TEXT,
        flag int
        )""")

    for process in ('category', 'category_product', 'product_info'):
        cur.execute("select 1 from process_tracker where process = ?", (process,))
        if cur.fetchone() is None:
            cur.execute("insert into process_tracker (process, flag) values(?, 0)", (process,))

    logging.info("++++++++++++++++ process tracker tab status ++++++++++++++++++++++")
    cur.execute("select * from process_tracker")
    logging.info(cur.fetchall())
2024-04-01 06:46:40 +00:00
def _fetch_count(cur, sql):
    """Execute ``sql`` and return the first column of its first row."""
    cur.execute(sql)
    return cur.fetchone()[0]


def get_status():
    """Build the HTML status summary for the finished crawl.

    Connects to the PostgreSQL database described by ``config`` and counts the
    Hasaki categories plus the total, flag = 1 and flag = 0 rows of
    ``crawler_tracker_hasaki``. The cursor and connection are closed even when
    a query fails.

    Returns:
        str: HTML fragment listing the four counts; the success and failure
        figures are coloured green or red.
    """
    conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'),
                            password=config.get('db_pass'), host=config.get('db_host'),
                            port=config.get('db_port'))
    try:
        conn.autocommit = True
        cur = conn.cursor()
        try:
            cat_count = _fetch_count(
                cur,
                """select count(1) from raena_spider_management.rce_category where rce_source_id = (select id from raena_spider_management.rce_source where source_name = 'Hasaki')""",
            )
            product_total = _fetch_count(
                cur, """select count(1) from raena_spider_management.crawler_tracker_hasaki""")
            product_successful = _fetch_count(
                cur, """select count(1) from raena_spider_management.crawler_tracker_hasaki where flag = 1""")
            product_failed = _fetch_count(
                cur, """select count(1) from raena_spider_management.crawler_tracker_hasaki where flag = 0""")
        finally:
            cur.close()
    finally:
        conn.close()

    # Green only when every tracked product succeeded / nothing failed.
    success_style = 'style="color: green;"' if product_successful == product_total else 'style="color: red;"'
    failed_style = 'style="color: red;"' if product_failed > 0 else 'style="color: green;"'

    msg = f"""
    <p><b>Hasaki Crawler run is completed. Please check the status below,</b></p>
    <br>
    <ul style="list-style-type:disc">
    <li>Total Collected categories: <b>{cat_count}</b></li>
    <li>Total Collected products for categories: <b>{product_total}</b></li>
    <li>Total successfully collected products: <b {success_style}>{product_successful}</b></li>
    <li>Total failed to collect products: <b {failed_style}>{product_failed}</b></li>
    </ul>
    """
    return msg
2024-03-27 07:01:53 +00:00
2024-03-14 05:16:59 +00:00
# Script entry point: load conf.json, run the crawl stages against the local
# sqlite tracker database, then mail the status report. Any failure is logged,
# reported by mail and turned into exit status 1.
if __name__ == "__main__":
    logging.info("Starting Hasaki Crawler.......")
    try:
        logging.info("Loading config file.......")
        with open("conf.json", "r") as jsonfile:
            config = json.load(jsonfile)
            logging.info("Config file loaded.......")

        # The config carries the database password (read as 'db_pass' by
        # get_status), so password-like values are masked before logging.
        logging.info({key: ('***' if 'pass' in key.lower() else value)
                      for key, value in config.items()})

        conn = sqlite3.connect('process_tracker.db')
        try:
            # isolation_level = None puts sqlite3 in autocommit mode, so each
            # stage flag update is persisted as soon as it is executed.
            conn.isolation_level = None
            cur = conn.cursor()
            init_tracker_tab(cur)
            main(cur)
            cur.close()
        finally:
            # Release the tracker database even when a stage raises.
            conn.close()

        msg = get_status()
        send_mail(msg)
    except Exception as e:
        # The previous "Error: ".format(e) had no placeholder, so the actual
        # exception never reached the log.
        logging.exception("Error: %s", e)
        logging.info("Error occurred. Please check config file or the internal SQLLITE DB. Exiting......")
        send_mail("Error occurred. Please check config file or the internal SQLLITE DB.")
        # Same effect as exit(1) without depending on the site-provided helper.
        raise SystemExit(1)