added Hasaki crawler
This commit is contained in:
parent
54d49ea78f
commit
d239129739
|
@ -3,11 +3,22 @@ import logging
|
||||||
import time
|
import time
|
||||||
import psycopg2
|
import psycopg2
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
|
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
from hasaki_db_writer import hasaki_db_writer
|
from hasaki_db_writer import hasaki_db_writer
|
||||||
from Util import translate_text_to_english
|
from Util import translate_text_to_english
|
||||||
|
|
||||||
|
###### Looger ######
|
||||||
|
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
||||||
|
logging.basicConfig(filename=logname,
|
||||||
|
filemode='a',
|
||||||
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class HasakiCategories:
|
class HasakiCategories:
|
||||||
|
@ -31,9 +42,13 @@ class HasakiCategories:
|
||||||
|
|
||||||
self.db_writer = hasaki_db_writer(config)
|
self.db_writer = hasaki_db_writer(config)
|
||||||
|
|
||||||
|
self.display = Display(visible=0, size=(800, 600))
|
||||||
|
self.display.start()
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
print("Closing connection.....")
|
print("Closing connection.....")
|
||||||
self.conn.close()
|
self.conn.close()
|
||||||
|
self.display.stop()
|
||||||
|
|
||||||
def start_processing(self):
|
def start_processing(self):
|
||||||
|
|
||||||
|
|
|
@ -2,9 +2,19 @@ import logging
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
import psycopg2
|
import psycopg2
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright
|
||||||
from hasaki_db_writer import hasaki_db_writer
|
from hasaki_db_writer import hasaki_db_writer
|
||||||
from Util import translate_text_to_english
|
from Util import translate_text_to_english
|
||||||
|
|
||||||
|
###### Looger ######
|
||||||
|
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
||||||
|
logging.basicConfig(filename=logname,
|
||||||
|
filemode='a',
|
||||||
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
level=logging.INFO)
|
||||||
|
|
||||||
class HasakiCategoryProducts:
|
class HasakiCategoryProducts:
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
logging.info("Initializing HasakiCategoryProducts........")
|
logging.info("Initializing HasakiCategoryProducts........")
|
||||||
|
@ -26,9 +36,13 @@ class HasakiCategoryProducts:
|
||||||
|
|
||||||
self.db_writer = hasaki_db_writer(config)
|
self.db_writer = hasaki_db_writer(config)
|
||||||
|
|
||||||
|
self.display = Display(visible=0, size=(800, 600))
|
||||||
|
self.display.start()
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
print("Closing connection.....")
|
print("Closing connection.....")
|
||||||
self.conn.close()
|
self.conn.close()
|
||||||
|
self.display.stop()
|
||||||
|
|
||||||
def start_processing(self):
|
def start_processing(self):
|
||||||
|
|
||||||
|
|
|
@ -10,9 +10,13 @@ from hasaki_category_products import HasakiCategoryProducts
|
||||||
from hasaki_product_info import HasakiProductInfo
|
from hasaki_product_info import HasakiProductInfo
|
||||||
from email.message import EmailMessage
|
from email.message import EmailMessage
|
||||||
|
|
||||||
##### Looger ######
|
###### Looger ######
|
||||||
format = "%(asctime)s: %(message)s"
|
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
||||||
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
|
logging.basicConfig(filename=logname,
|
||||||
|
filemode='a',
|
||||||
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
level=logging.INFO)
|
||||||
|
|
||||||
config = {}
|
config = {}
|
||||||
|
|
||||||
|
@ -26,6 +30,8 @@ def main(cur):
|
||||||
hasaki_categories.start_processing()
|
hasaki_categories.start_processing()
|
||||||
cur.execute(f"""update process_tracker set flag = 1 where process = 'category'""")
|
cur.execute(f"""update process_tracker set flag = 1 where process = 'category'""")
|
||||||
|
|
||||||
|
logging.info("Category collection completed........ Moving to collecting products ;ist of the categories.......")
|
||||||
|
|
||||||
#time.sleep(60)
|
#time.sleep(60)
|
||||||
|
|
||||||
cur.execute(f"""select flag from process_tracker where process = 'category_product'""")
|
cur.execute(f"""select flag from process_tracker where process = 'category_product'""")
|
||||||
|
@ -35,6 +41,8 @@ def main(cur):
|
||||||
hasaki_category_products.start_processing()
|
hasaki_category_products.start_processing()
|
||||||
cur.execute(f"""update process_tracker set flag = 1 where process = 'category_product'""")
|
cur.execute(f"""update process_tracker set flag = 1 where process = 'category_product'""")
|
||||||
|
|
||||||
|
logging.info("Category products collection completed........ Moving to collecting product info.......")
|
||||||
|
|
||||||
#time.sleep(60)
|
#time.sleep(60)
|
||||||
|
|
||||||
cur.execute(f"""select flag from process_tracker where process = 'product_info'""")
|
cur.execute(f"""select flag from process_tracker where process = 'product_info'""")
|
||||||
|
@ -48,6 +56,8 @@ def main(cur):
|
||||||
cur.execute(f"""update process_tracker set flag = 0 where process = 'category_product'""")
|
cur.execute(f"""update process_tracker set flag = 0 where process = 'category_product'""")
|
||||||
cur.execute(f"""update process_tracker set flag = 0 where process = 'product_info'""")
|
cur.execute(f"""update process_tracker set flag = 0 where process = 'product_info'""")
|
||||||
|
|
||||||
|
logging.info("Product info collection done. Stopping........")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,8 +2,12 @@ import logging
|
||||||
import psycopg2
|
import psycopg2
|
||||||
|
|
||||||
###### Looger ######
|
###### Looger ######
|
||||||
format = "%(asctime)s: %(message)s"
|
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
||||||
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
|
logging.basicConfig(filename=logname,
|
||||||
|
filemode='a',
|
||||||
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
level=logging.INFO)
|
||||||
|
|
||||||
class hasaki_db_writer:
|
class hasaki_db_writer:
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
|
|
|
@ -12,12 +12,21 @@ from Util import translate_text_to_english
|
||||||
from fake_useragent import UserAgent
|
from fake_useragent import UserAgent
|
||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
from seleniumwire import webdriver
|
from seleniumwire import webdriver
|
||||||
from selenium.webdriver.chrome.service import Service
|
from selenium.webdriver.chrome.service import Service
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
import brotli
|
import brotli
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
###### Looger ######
|
||||||
|
logname = '/home/ubuntu/logs/hasaki_crawler.log'
|
||||||
|
logging.basicConfig(filename=logname,
|
||||||
|
filemode='a',
|
||||||
|
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
|
||||||
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
|
level=logging.INFO)
|
||||||
|
|
||||||
class HasakiProductInfo:
|
class HasakiProductInfo:
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
logging.info("Initializing HasakiProductInfo")
|
logging.info("Initializing HasakiProductInfo")
|
||||||
|
@ -40,9 +49,13 @@ class HasakiProductInfo:
|
||||||
|
|
||||||
self.db_writer = hasaki_db_writer(config)
|
self.db_writer = hasaki_db_writer(config)
|
||||||
|
|
||||||
|
self.display = Display(visible=0, size=(800, 600))
|
||||||
|
self.display.start()
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
print("Closing connection.....")
|
print("Closing connection.....")
|
||||||
self.conn.close()
|
self.conn.close()
|
||||||
|
self.display.stop()
|
||||||
|
|
||||||
def start_processing(self):
|
def start_processing(self):
|
||||||
logging.info("Starting to collect product info from Hasaki........")
|
logging.info("Starting to collect product info from Hasaki........")
|
||||||
|
|
Loading…
Reference in New Issue