added Hasaki crawler

Shariar Imtiaz 2024-04-01 11:31:33 +04:00
parent 54d49ea78f
commit d239129739
5 changed files with 61 additions and 5 deletions

View File

@@ -3,11 +3,22 @@ import logging
import time
import psycopg2
import pandas as pd
from pyvirtualdisplay import Display
from playwright.sync_api import sync_playwright
from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english
###### Logger ######
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
class HasakiCategories:
@@ -31,9 +42,13 @@ class HasakiCategories:
self.db_writer = hasaki_db_writer(config)
self.display = Display(visible=0, size=(800, 600))
self.display.start()
def __del__(self):
print("Closing connection.....")
self.conn.close()
self.display.stop()
def start_processing(self):
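The new Display(visible=0, size=(800, 600)) calls wrap the Playwright session in a virtual X display, so the crawler can drive a non-headless browser on a server with no screen, and __del__ stops the display alongside closing the database connection. A minimal sketch of the same pattern outside the class (the target URL is a placeholder, not taken from this commit):

    from pyvirtualdisplay import Display
    from playwright.sync_api import sync_playwright

    # Start a virtual X server so a headed browser has somewhere to render.
    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        with sync_playwright() as p:
            # headless=False is viable because the virtual display acts as the screen
            browser = p.chromium.launch(headless=False)
            page = browser.new_page()
            page.goto("https://hasaki.vn")  # placeholder URL for the sketch
            print(page.title())
            browser.close()
    finally:
        # mirrors self.display.stop() in __del__: always tear the display down
        display.stop()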

View File

@@ -2,9 +2,19 @@ import logging
import random
import time
import psycopg2
from pyvirtualdisplay import Display
from playwright.sync_api import sync_playwright
from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english
###### Logger ######
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
class HasakiCategoryProducts:
def __init__(self, config):
logging.info("Initializing HasakiCategoryProducts........")
@@ -26,9 +36,13 @@ class HasakiCategoryProducts:
self.db_writer = hasaki_db_writer(config)
self.display = Display(visible=0, size=(800, 600))
self.display.start()
def __del__(self):
print("Closing connection.....")
self.conn.close()
self.display.stop()
def start_processing(self):
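Each module now points logging.basicConfig at the same file, /home/ubuntu/logs/hasaki_crawler.log, appending records with a timestamped format. A short sketch of what that configuration produces (the /tmp path is only a local stand-in for trying it out):

    import logging

    logname = '/tmp/hasaki_crawler.log'  # local stand-in for /home/ubuntu/logs/hasaki_crawler.log
    logging.basicConfig(filename=logname,
                        filemode='a',
                        format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
                        datefmt="%Y-%m-%d %H:%M:%S",
                        level=logging.INFO)

    logging.info("Initializing HasakiCategoryProducts........")
    # appends a record like:
    # 2024-04-01 11:31:33,457 root INFO: Initializing HasakiCategoryProducts........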

View File

@@ -10,9 +10,13 @@ from hasaki_category_products import HasakiCategoryProducts
from hasaki_product_info import HasakiProductInfo
from email.message import EmailMessage
##### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
###### Logger ######
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
config = {}
@@ -26,6 +30,8 @@ def main(cur):
hasaki_categories.start_processing()
cur.execute(f"""update process_tracker set flag = 1 where process = 'category'""")
logging.info("Category collection completed........ Moving to collecting products ;ist of the categories.......")
#time.sleep(60)
cur.execute(f"""select flag from process_tracker where process = 'category_product'""")
@@ -35,6 +41,8 @@ def main(cur):
hasaki_category_products.start_processing()
cur.execute(f"""update process_tracker set flag = 1 where process = 'category_product'""")
logging.info("Category products collection completed........ Moving to collecting product info.......")
#time.sleep(60)
cur.execute(f"""select flag from process_tracker where process = 'product_info'""")
@@ -48,6 +56,8 @@ def main(cur):
cur.execute(f"""update process_tracker set flag = 0 where process = 'category_product'""")
cur.execute(f"""update process_tracker set flag = 0 where process = 'product_info'""")
logging.info("Product info collection done. Stopping........")

View File

@@ -2,8 +2,12 @@ import logging
import psycopg2
###### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
class hasaki_db_writer:
def __init__(self, config):
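Since hasaki_db_writer and the other modules all call logging.basicConfig with identical settings, it is worth noting that only the first call to reach the root logger takes effect; later calls are silent no-ops unless force=True is passed (Python 3.8+). A tiny illustration with throwaway paths:

    import logging

    logging.basicConfig(filename="/tmp/first.log", level=logging.INFO)   # first call configures the root logger
    logging.basicConfig(filename="/tmp/second.log", level=logging.DEBUG) # no-op: a handler already exists

    logging.info("written to /tmp/first.log only")

    # to genuinely reconfigure, pass force=True (Python 3.8+):
    # logging.basicConfig(filename="/tmp/second.log", level=logging.DEBUG, force=True)

Because every module here uses the same file, format, and level, the repeated basicConfig calls are harmless.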

View File

@@ -12,12 +12,21 @@ from Util import translate_text_to_english
from fake_useragent import UserAgent
import time
import random
from pyvirtualdisplay import Display
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import brotli
import json
###### Logger ######
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
class HasakiProductInfo:
def __init__(self, config):
logging.info("Initializing HasakiProductInfo")
@@ -40,9 +49,13 @@ class HasakiProductInfo:
self.db_writer = hasaki_db_writer(config)
self.display = Display(visible=0, size=(800, 600))
self.display.start()
def __del__(self):
print("Closing connection.....")
self.conn.close()
self.display.stop()
def start_processing(self):
logging.info("Starting to collect product info from Hasaki........")