added Hasaki crawler
This commit is contained in:
parent
6ed8a649ae
commit
82a3d9d9b8
|
@ -20,6 +20,5 @@
|
||||||
"db_pass": "5qCif6eyY3Kmg4z",
|
"db_pass": "5qCif6eyY3Kmg4z",
|
||||||
"database": "analytics",
|
"database": "analytics",
|
||||||
"db_host": "redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com",
|
"db_host": "redshift-cluster-1.cdqj58hfx4p7.ap-southeast-1.redshift.amazonaws.com",
|
||||||
"db_port": "5439",
|
"db_port": "5439"
|
||||||
"log_loc": "/home/ubuntu/logs/hasaki_crawler.log"
|
|
||||||
}
|
}
|
|
@ -9,18 +9,17 @@ from playwright.sync_api import sync_playwright
|
||||||
from hasaki_db_writer import hasaki_db_writer
|
from hasaki_db_writer import hasaki_db_writer
|
||||||
from Util import translate_text_to_english
|
from Util import translate_text_to_english
|
||||||
|
|
||||||
|
###### Looger ######
|
||||||
|
logging.basicConfig(filename="/home/ubuntu/logs/hasaki_crawler.log",
|
||||||
|
filemode='a',
|
||||||
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class HasakiCategories:
|
class HasakiCategories:
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
###### Looger ######
|
|
||||||
logging.basicConfig(filename=config.get("log_loc"),
|
|
||||||
filemode='a',
|
|
||||||
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
|
||||||
datefmt="%Y-%m-%d %H:%M:%S",
|
|
||||||
level=logging.INFO)
|
|
||||||
logging.info("Initializing HasakiSubCategories")
|
logging.info("Initializing HasakiSubCategories")
|
||||||
self.master_category = []
|
self.master_category = []
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
|
@ -7,15 +7,15 @@ from playwright.sync_api import sync_playwright
|
||||||
from hasaki_db_writer import hasaki_db_writer
|
from hasaki_db_writer import hasaki_db_writer
|
||||||
from Util import translate_text_to_english
|
from Util import translate_text_to_english
|
||||||
|
|
||||||
|
|
||||||
class HasakiCategoryProducts:
|
|
||||||
def __init__(self, config):
|
|
||||||
###### Looger ######
|
###### Looger ######
|
||||||
logging.basicConfig(filename=config.get("log_loc"),
|
logging.basicConfig(filename="/home/ubuntu/logs/hasaki_crawler.log",
|
||||||
filemode='a',
|
filemode='a',
|
||||||
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
datefmt="%Y-%m-%d %H:%M:%S",
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
level=logging.INFO)
|
level=logging.INFO)
|
||||||
|
|
||||||
|
class HasakiCategoryProducts:
|
||||||
|
def __init__(self, config):
|
||||||
logging.info("Initializing HasakiCategoryProducts........")
|
logging.info("Initializing HasakiCategoryProducts........")
|
||||||
self.config = config
|
self.config = config
|
||||||
self.crawler_name = self.config.get("crawler_name")
|
self.crawler_name = self.config.get("crawler_name")
|
||||||
|
|
|
@ -12,6 +12,12 @@ from email.message import EmailMessage
|
||||||
|
|
||||||
config = {}
|
config = {}
|
||||||
|
|
||||||
|
###### Looger ######
|
||||||
|
logging.basicConfig(filename="/home/ubuntu/logs/hasaki_crawler.log",
|
||||||
|
filemode='a',
|
||||||
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -171,13 +177,6 @@ if __name__ == "__main__":
|
||||||
logging.info("Config file loaded.......")
|
logging.info("Config file loaded.......")
|
||||||
logging.info(config)
|
logging.info(config)
|
||||||
|
|
||||||
###### Looger ######
|
|
||||||
logging.basicConfig(filename=config.get("log_loc"),
|
|
||||||
filemode='a',
|
|
||||||
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
|
||||||
datefmt="%Y-%m-%d %H:%M:%S",
|
|
||||||
level=logging.INFO)
|
|
||||||
|
|
||||||
conn = sqlite3.connect('process_tracker.db')
|
conn = sqlite3.connect('process_tracker.db')
|
||||||
conn.isolation_level = None
|
conn.isolation_level = None
|
||||||
|
|
||||||
|
|
|
@ -1,15 +1,15 @@
|
||||||
import logging
|
import logging
|
||||||
import psycopg2
|
import psycopg2
|
||||||
|
|
||||||
|
|
||||||
class hasaki_db_writer:
|
|
||||||
def __init__(self, config):
|
|
||||||
###### Looger ######
|
###### Looger ######
|
||||||
logging.basicConfig(filename=config.get("log_loc"),
|
logging.basicConfig(filename="/home/ubuntu/logs/hasaki_crawler.log",
|
||||||
filemode='a',
|
filemode='a',
|
||||||
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
datefmt="%Y-%m-%d %H:%M:%S",
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
level=logging.INFO)
|
level=logging.INFO)
|
||||||
|
|
||||||
|
class hasaki_db_writer:
|
||||||
|
def __init__(self, config):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
self.conn.autocommit = True
|
self.conn.autocommit = True
|
||||||
|
|
|
@ -19,15 +19,15 @@ from webdriver_manager.chrome import ChromeDriverManager
|
||||||
import brotli
|
import brotli
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
class HasakiProductInfo:
|
|
||||||
def __init__(self, config):
|
|
||||||
###### Looger ######
|
###### Looger ######
|
||||||
logging.basicConfig(filename=config.get("log_loc"),
|
logging.basicConfig(filename="/home/ubuntu/logs/hasaki_crawler.log",
|
||||||
filemode='a',
|
filemode='a',
|
||||||
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
datefmt="%Y-%m-%d %H:%M:%S",
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
level=logging.INFO)
|
level=logging.INFO)
|
||||||
|
|
||||||
|
class HasakiProductInfo:
|
||||||
|
def __init__(self, config):
|
||||||
logging.info("Initializing HasakiProductInfo")
|
logging.info("Initializing HasakiProductInfo")
|
||||||
self.pattern = r'[' + string.punctuation + ']'
|
self.pattern = r'[' + string.punctuation + ']'
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
Loading…
Reference in New Issue