raena-crawler-engine/amazon_crawler_engine/amazon_product_adhoc.py

174 lines
7.2 KiB
Python
Raw Permalink Normal View History

2024-01-24 13:05:07 +00:00
import hashlib
import logging
import os
import random
import re
import ssl
import string
import sys
import time
from datetime import datetime

import psycopg2
import undetected_chromedriver as webdriver
from pyvirtualdisplay import Display
from selenium.webdriver.common.by import By
#from selenium import webdriver

from amazon_db_writer import amazon_db_writer

# Swap the standard library's default HTTPS context factory for the
# unverified one: from here on, HTTPS requests that rely on the default
# context skip certificate verification for the whole process.
# NOTE(review): presumably a workaround for certificate errors on the crawl
# host (e.g. while the chromedriver binary is fetched) - confirm it is still
# required, since it weakens TLS for everything in this process.
ssl._create_default_https_context = ssl._create_unverified_context
class amazon_products_adhoc:
    """Ad-hoc re-crawl of Amazon product pages whose stored minimum price is empty.

    Constructing an instance opens a PostgreSQL connection and loads every row
    of the configured product table that has ``rce_source_id=66`` and an empty
    ``product_price_min``. ``start_processing`` then opens each product page in
    Chrome, scrapes the prices, hands the refreshed record to
    ``amazon_db_writer`` and flags the page in the tracker table.
    """

    def __init__(self, config):
        """Connect to the database and load the products that need a price.

        Args:
            config: Mapping with the connection settings (``database``,
                ``db_user``, ``db_pass``, ``db_host``, ``db_port``) and the
                schema/table names (``crawler_schema``, ``product_tab``,
                ``tracker_tab``). It is also passed on to ``amazon_db_writer``.
        """
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.pattern = r'[' + string.punctuation + ']'

        self.conn = psycopg2.connect(
            database=self.config.get('database'),
            user=self.config.get('db_user'),
            password=self.config.get('db_pass'),
            host=self.config.get('db_host'),
            port=self.config.get('db_port'),
        )
        # Every statement is committed immediately; no explicit commit calls.
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

        # Schema and table names come from the crawler's own configuration,
        # not from user input, so they are interpolated as identifiers.
        sql = f"""select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where rce_source_id=66 and product_price_min= '' order by id desc"""
        self.cur.execute(sql)
        # Rows are plain tuples in ``select *`` column order; the positional
        # indexes used in ``product_info`` depend on that order.
        self.items = self.cur.fetchall()

        self.db_writer = amazon_db_writer(config)

    def __del__(self):
        """Close the database connection, if one was ever opened."""
        # ``__init__`` may have failed before ``self.conn`` was assigned (for
        # example when the connection attempt itself raised), so the attribute
        # cannot be assumed to exist here.
        conn = getattr(self, "conn", None)
        if conn is None:
            return
        print("Closing connection.....")
        conn.close()

    def start_processing(self):
        """Visit every loaded product page, refresh its prices and flag it.

        A failure on one product is logged and the loop moves on to the next
        one. The browser is always shut down at the end, even when the loop
        is interrupted.
        """
        op = webdriver.ChromeOptions()
        op.add_argument('--no-sandbox')
        op.add_argument('--disable-notifications')
        op.add_argument("--lang=en-GB")
        op.add_argument('--user-data-dir=/home/ec2-user/chrome_cache/')
        driver = webdriver.Chrome(options=op)

        # The URL hash is sent as a bound parameter instead of being spliced
        # into the statement text; only the trusted identifiers are formatted.
        tracker_sql = (
            f"update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} "
            "set flag = 1 where product_page_url_hash = %s"
        )
        total = len(self.items)

        try:
            for count, item in enumerate(self.items, start=1):
                try:
                    logging.info(
                        "============== Getting info for %s/%s: %s ================",
                        count, total, item,
                    )
                    start = datetime.now()

                    driver.get(item[3])
                    self.product_info(driver, item)
                    self.cur.execute(tracker_sql, (item[4],))

                    end = datetime.now()
                    logging.info('Total time taken to fetch the product: %s', end - start)
                    # Pause between successful page loads.
                    time.sleep(5)
                except Exception:
                    # Best effort: one bad page must not stop the whole run,
                    # but keep the traceback so the failure can be diagnosed.
                    logging.exception("Failed to process product page %s", item[3])
        finally:
            # quit() ends the whole WebDriver session, not just the current
            # window, so the browser/driver processes do not linger.
            driver.quit()

    def product_info(self, driver, item):
        """Scrape the prices from the loaded page and persist the product.

        Args:
            driver: WebDriver that has already navigated to the product page.
            item: One row from ``self.items``; fields are read by position.

        Raises:
            Exception: Whatever the driver raises when neither price selector
                matches; the caller logs it and skips the product.
        """
        # Start from the stored row so untouched fields are written back as-is.
        data_product = {
            'rce_source_product_id': item[1],
            'rce_source_id': item[21],
            'rce_source_product_status': item[2],
            'product_page_url': item[3],
            'product_page_url_hash': item[4],
            'rce_category_id': item[5],
            'rce_brand_id': item[6],
            'rce_store_id': item[7],
            'rce_source_product_name': item[8],
            'product_images': item[9],
            'product_description': item[10],
            'product_sold_total': item[11],
            'product_sold': item[12],
            'product_price_min': item[13],
            'product_price_min_before_discount': item[14],
            'product_price_max': item[15],
            'product_price_max_before_discount': item[16],
            'ratings': item[17],
            'product_section': item[22],
        }

        # Current price: try the '#sns-base-price' element first, otherwise
        # assemble it from the whole/fraction parts of the pay-price widget.
        # If the fallback fails as well, the error propagates to the caller.
        try:
            price = driver.find_element(By.CSS_SELECTOR, '#sns-base-price').text.replace('AED', '')
        except Exception:
            price_whole = driver.find_element(
                By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
            price_fraction = driver.find_element(
                By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
            price = price_whole + "." + price_fraction
        data_product['product_price_min'] = price
        data_product['product_price_max'] = price

        print("product_price_min: {}".format(data_product['product_price_min']))

        # The pre-discount price is optional: when the element is missing the
        # values already stored in the row are kept.
        try:
            before_discount = driver.find_element(By.CSS_SELECTOR, '.a-text-price').text.replace('AED', '')
        except Exception:
            logging.debug("No pre-discount price found for %s", item[3])
        else:
            data_product['product_price_min_before_discount'] = before_discount
            data_product['product_price_max_before_discount'] = before_discount

        try:
            self.db_writer.rce_product(data_product)
        except Exception:
            # NOTE(review): the caller still sets the tracker flag after a
            # failed write - confirm that this is intended.
            logging.exception("Failed to write product %s", item[3])
# Runtime configuration for the ad-hoc Amazon crawler: the schema and table
# names used by the crawler, plus the PostgreSQL connection settings.
#
# The connection settings can be overridden through RCE_DB_* environment
# variables so that credentials need not live in the source tree. The literal
# values are kept as defaults so existing deployments behave as before.
# NOTE(review): the default password is committed to version control - rotate
# it and drop the default once the environment variables are in place.
config = {
    "crawler_name": "raena_crawler_enginer_amazon",
    "crawler_schema": "raena_spider_management",
    "category_tab": "rce_category",
    "tracker_tab": "crawler_tracker",
    "product_tab": "rce_product",
    "variant_tab": "rce_product_variant",
    "brand_tab": "rce_brand",
    "reseller_tab": "rce_reseller",
    "reseller_store_tab": "rce_reseller_store",
    "review_tab": "rce_ratings_reviews",
    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
    "review_producttags_tab": "rce_ratings_reviews_producttags",
    "review_tags": "rce_tags",
    "source_tab": "rce_source",
    "product_per_category": "1000",
    "source_category": "11043145",
    "db_user": os.environ.get("RCE_DB_USER", "dbadmin"),
    "db_pass": os.environ.get("RCE_DB_PASS", "5qCif6eyY3Kmg4z"),
    "database": os.environ.get("RCE_DB_NAME", "analytics"),
    "db_host": os.environ.get(
        "RCE_DB_HOST",
        "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
    ),
    "db_port": os.environ.get("RCE_DB_PORT", "5432"),
    "crawler_main": "1",
    "crawler_slave_no": "",
}
# Script entry point: build the crawler from the module-level config and run
# it. The guard keeps a plain import of this module from opening a database
# connection and launching Chrome.
if __name__ == "__main__":
    # The instance gets its own name; binding it to ``amazon_products_adhoc``
    # would replace the class with the instance.
    crawler = amazon_products_adhoc(config)
    crawler.start_processing()