added Hasaki crawler

This commit is contained in:
Shariar Imtiaz 2024-04-01 11:55:06 +04:00
parent 959fd9a03e
commit 6ed8a649ae
6 changed files with 32 additions and 46 deletions

View File

@ -21,6 +21,5 @@
"database": "analytics", "database": "analytics",
"db_host": "redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com", "db_host": "redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com",
"db_port": "5439", "db_port": "5439",
"crawler_main": "1", "log_loc": "/home/ubuntu/logs/hasaki_crawler.log"
"crawler_slave_no": ""
} }

View File

@ -10,21 +10,17 @@ from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english from Util import translate_text_to_english
###### Looger ######
logname = '/home/ubuntu/logs/hasaki_crawler.log'
#logname = 'hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
class HasakiCategories: class HasakiCategories:
def __init__(self, config): def __init__(self, config):
###### Logger ######
logging.basicConfig(filename=config.get("log_loc"),
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
logging.info("Initializing HasakiSubCategories") logging.info("Initializing HasakiSubCategories")
self.master_category = [] self.master_category = []
self.config = config self.config = config

View File

@ -7,17 +7,15 @@ from playwright.sync_api import sync_playwright
from hasaki_db_writer import hasaki_db_writer from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english from Util import translate_text_to_english
class HasakiCategoryProducts:
def __init__(self, config):
###### Logger ###### ###### Logger ######
logname = '/home/ubuntu/logs/hasaki_crawler.log' logging.basicConfig(filename=config.get("log_loc"),
#logname = 'hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a', filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S", datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO) level=logging.INFO)
class HasakiCategoryProducts:
def __init__(self, config):
logging.info("Initializing HasakiCategoryProducts........") logging.info("Initializing HasakiCategoryProducts........")
self.config = config self.config = config
self.crawler_name = self.config.get("crawler_name") self.crawler_name = self.config.get("crawler_name")

View File

@ -13,16 +13,6 @@ from email.message import EmailMessage
config = {} config = {}
###### Looger ######
logname = '/home/ubuntu/logs/hasaki_crawler.log'
#logname = 'hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
def main(cur): def main(cur):
@ -181,6 +171,13 @@ if __name__ == "__main__":
logging.info("Config file loaded.......") logging.info("Config file loaded.......")
logging.info(config) logging.info(config)
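###### Logger ######
# NOTE(review): the logging.info() calls just above run before this basicConfig(); a module-level logging call implicitly configures the root logger when it has no handlers, so this basicConfig() may be a no-op unless it is moved earlier or force=True is passed -- confirm.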
logging.basicConfig(filename=config.get("log_loc"),
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
conn = sqlite3.connect('process_tracker.db') conn = sqlite3.connect('process_tracker.db')
conn.isolation_level = None conn.isolation_level = None

View File

@ -1,17 +1,15 @@
import logging import logging
import psycopg2 import psycopg2
class hasaki_db_writer:
def __init__(self, config):
###### Logger ###### ###### Logger ######
logname = '/home/ubuntu/logs/hasaki_crawler.log' logging.basicConfig(filename=config.get("log_loc"),
#logname = 'hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a', filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S", datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO) level=logging.INFO)
class hasaki_db_writer:
def __init__(self, config):
self.config = config self.config = config
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True self.conn.autocommit = True

View File

@ -19,17 +19,15 @@ from webdriver_manager.chrome import ChromeDriverManager
import brotli import brotli
import json import json
class HasakiProductInfo:
def __init__(self, config):
###### Logger ###### ###### Logger ######
logname = '/home/ubuntu/logs/hasaki_crawler.log' logging.basicConfig(filename=config.get("log_loc"),
#logname = 'hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a', filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S", datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO) level=logging.INFO)
class HasakiProductInfo:
def __init__(self, config):
logging.info("Initializing HasakiProductInfo") logging.info("Initializing HasakiProductInfo")
self.pattern = r'[' + string.punctuation + ']' self.pattern = r'[' + string.punctuation + ']'
self.config = config self.config = config