added Hasaki crawler

This commit is contained in:
Shariar Imtiaz 2024-04-01 11:31:33 +04:00
parent 54d49ea78f
commit d239129739
5 changed files with 61 additions and 5 deletions

View File

@ -3,11 +3,22 @@ import logging
import time
import psycopg2
import pandas as pd
from pyvirtualdisplay import Display
from playwright.sync_api import sync_playwright
from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english
###### Logger ######
# Send log records at INFO level and above to a fixed log file, opened in
# append mode ('a').
# NOTE: per the logging documentation, basicConfig() does nothing when the root
# logger already has handlers, so only the first such call in a process applies.
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
class HasakiCategories:
@ -31,9 +42,13 @@ class HasakiCategories:
self.db_writer = hasaki_db_writer(config)
self.display = Display(visible=0, size=(800, 600))
self.display.start()
def __del__(self):
    """Close ``self.conn`` and stop the virtual display held in ``self.display``.

    Runs when the instance is finalized. Either attribute may be missing if
    ``__init__`` failed part-way, so each is looked up defensively, and the
    display is stopped even when closing the connection raises.
    (``self.conn`` is assigned outside this view; presumably the psycopg2
    connection -- confirm.)
    """
    print("Closing connection.....")
    conn = getattr(self, "conn", None)
    display = getattr(self, "display", None)
    try:
        if conn is not None:
            conn.close()
    finally:
        # Always stop the display so a failing close() cannot leave it running.
        if display is not None:
            display.stop()
def start_processing(self):

View File

@ -2,9 +2,19 @@ import logging
import random
import time
import psycopg2
from pyvirtualdisplay import Display
from playwright.sync_api import sync_playwright
from hasaki_db_writer import hasaki_db_writer
from Util import translate_text_to_english
###### Logger ######
# Send log records at INFO level and above to a fixed log file, opened in
# append mode ('a').
# NOTE: per the logging documentation, basicConfig() does nothing when the root
# logger already has handlers, so only the first such call in a process applies.
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
class HasakiCategoryProducts:
def __init__(self, config):
logging.info("Initializing HasakiCategoryProducts........")
@ -26,9 +36,13 @@ class HasakiCategoryProducts:
self.db_writer = hasaki_db_writer(config)
self.display = Display(visible=0, size=(800, 600))
self.display.start()
def __del__(self):
    """Close ``self.conn`` and stop the virtual display held in ``self.display``.

    Runs when the instance is finalized. Either attribute may be missing if
    ``__init__`` failed part-way, so each is looked up defensively, and the
    display is stopped even when closing the connection raises.
    (``self.conn`` is assigned outside this view; presumably the psycopg2
    connection -- confirm.)
    """
    print("Closing connection.....")
    conn = getattr(self, "conn", None)
    display = getattr(self, "display", None)
    try:
        if conn is not None:
            conn.close()
    finally:
        # Always stop the display so a failing close() cannot leave it running.
        if display is not None:
            display.stop()
def start_processing(self):

View File

@ -10,9 +10,13 @@ from hasaki_category_products import HasakiCategoryProducts
from hasaki_product_info import HasakiProductInfo
from email.message import EmailMessage
##### Logger ######
# NOTE(review): this view is a rendered diff, so the two configurations below
# are presumably the previous (no filename, i.e. stream output) and the new
# (file) versions -- confirm which one is live in the actual module.
# The module-level name `format` shadows the built-in format().
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
###### Logger ######
# Send log records at INFO level and above to a fixed log file, opened in
# append mode ('a').
# NOTE: per the logging documentation, basicConfig() does nothing when the root
# logger already has handlers, so only the first such call in a process applies.
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
config = {}
@ -26,6 +30,8 @@ def main(cur):
hasaki_categories.start_processing()
cur.execute(f"""update process_tracker set flag = 1 where process = 'category'""")
logging.info("Category collection completed........ Moving to collecting products ;ist of the categories.......")
#time.sleep(60)
cur.execute(f"""select flag from process_tracker where process = 'category_product'""")
@ -35,6 +41,8 @@ def main(cur):
hasaki_category_products.start_processing()
cur.execute(f"""update process_tracker set flag = 1 where process = 'category_product'""")
logging.info("Category products collection completed........ Moving to collecting product info.......")
#time.sleep(60)
cur.execute(f"""select flag from process_tracker where process = 'product_info'""")
@ -48,6 +56,8 @@ def main(cur):
cur.execute(f"""update process_tracker set flag = 0 where process = 'category_product'""")
cur.execute(f"""update process_tracker set flag = 0 where process = 'product_info'""")
logging.info("Product info collection done. Stopping........")

View File

@ -2,8 +2,12 @@ import logging
import psycopg2
###### Logger ######
# NOTE(review): this view is a rendered diff, so the two configurations below
# are presumably the previous (no filename, i.e. stream output) and the new
# (file) versions -- confirm which one is live in the actual module.
# The module-level name `format` shadows the built-in format().
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
# Send log records at INFO level and above to a fixed log file, opened in
# append mode ('a').
# NOTE: per the logging documentation, basicConfig() does nothing when the root
# logger already has handlers, so only the first such call in a process applies.
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
class hasaki_db_writer:
def __init__(self, config):

View File

@ -12,12 +12,21 @@ from Util import translate_text_to_english
from fake_useragent import UserAgent
import time
import random
from pyvirtualdisplay import Display
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import brotli
import json
###### Logger ######
# Send log records at INFO level and above to a fixed log file, opened in
# append mode ('a').
# NOTE: per the logging documentation, basicConfig() does nothing when the root
# logger already has handlers, so only the first such call in a process applies.
logname = '/home/ubuntu/logs/hasaki_crawler.log'
logging.basicConfig(filename=logname,
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
class HasakiProductInfo:
def __init__(self, config):
logging.info("Initializing HasakiProductInfo")
@ -40,9 +49,13 @@ class HasakiProductInfo:
self.db_writer = hasaki_db_writer(config)
self.display = Display(visible=0, size=(800, 600))
self.display.start()
def __del__(self):
    """Close ``self.conn`` and stop the virtual display held in ``self.display``.

    Runs when the instance is finalized. Either attribute may be missing if
    ``__init__`` failed part-way, so each is looked up defensively, and the
    display is stopped even when closing the connection raises.
    (``self.conn`` is assigned outside this view; presumably the psycopg2
    connection -- confirm.)
    """
    print("Closing connection.....")
    conn = getattr(self, "conn", None)
    display = getattr(self, "display", None)
    try:
        if conn is not None:
            conn.close()
    finally:
        # Always stop the display so a failing close() cannot leave it running.
        if display is not None:
            display.stop()
def start_processing(self):
logging.info("Starting to collect product info from Hasaki........")