174 lines
7.2 KiB
Python
174 lines
7.2 KiB
Python
|
import hashlib
import logging
import os
import random
import re
import ssl
import string
import sys
import time
from datetime import datetime

import psycopg2
from pyvirtualdisplay import Display
from selenium.webdriver.common.by import By
#from selenium import webdriver
import undetected_chromedriver as webdriver

from amazon_db_writer import amazon_db_writer
|
||
|
# Swap the factory that the ssl module exposes for default HTTPS contexts with
# the unverified variant, i.e. contexts built through it do not verify server
# certificates. This applies to the whole process, not just this script's calls.
# NOTE(review): the reason is not visible here — presumably a certificate
# problem on the crawl host; confirm whether this is still required.
ssl._create_default_https_context = ssl._create_unverified_context
|
||
|
|
||
|
class amazon_products_adhoc:
    """Re-visit stored product pages that have an empty minimum price.

    Construction opens a PostgreSQL connection from ``config`` and loads every
    row of ``<crawler_schema>.<product_tab>`` with ``rce_source_id=66`` and an
    empty ``product_price_min``. ``start_processing`` then opens each row's
    page URL in Chrome, scrapes the prices and hands the refreshed record to
    ``amazon_db_writer``.
    """

    # (data_product key, position of the value in a selected product row).
    # The positions follow the column order returned by ``select *``.
    _ROW_FIELDS = (
        ('rce_source_product_id', 1),
        ('rce_source_id', 21),
        ('rce_source_product_status', 2),
        ('product_page_url', 3),
        ('product_page_url_hash', 4),
        ('rce_category_id', 5),
        ('rce_brand_id', 6),
        ('rce_store_id', 7),
        ('rce_source_product_name', 8),
        ('product_images', 9),
        ('product_description', 10),
        ('product_sold_total', 11),
        ('product_sold', 12),
        ('product_price_min', 13),
        ('product_price_min_before_discount', 14),
        ('product_price_max', 15),
        ('product_price_max_before_discount', 16),
        ('ratings', 17),
        ('product_section', 22),
    )

    def __init__(self, config):
        """Connect to the database and load the products that need a price.

        Args:
            config: Mapping read with ``.get`` for ``crawler_name``,
                ``database``, ``db_user``, ``db_pass``, ``db_host``,
                ``db_port``, ``crawler_schema`` and ``product_tab``. It is
                also passed unchanged to ``amazon_db_writer``.
        """
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.pattern = r'[' + string.punctuation + ']'
        self.conn = psycopg2.connect(
            database=self.config.get('database'),
            user=self.config.get('db_user'),
            password=self.config.get('db_pass'),
            host=self.config.get('db_host'),
            port=self.config.get('db_port'),
        )
        # Every statement is committed immediately; no explicit commit calls
        # are made anywhere in this class.
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        sql = f"""select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where rce_source_id=66 and product_price_min= '' order by id desc"""
        self.cur.execute(sql)
        self.items = self.cur.fetchall()
        self.db_writer = amazon_db_writer(config)

    def __del__(self):
        """Close the database connection if one was opened.

        Safe to call when ``__init__`` failed before the connection existed
        and safe to call more than once.
        """
        conn = getattr(self, 'conn', None)
        if conn is None:
            return
        print("Closing connection.....")
        # Drop the reference first so a repeated call is a no-op.
        self.conn = None
        conn.close()

    def start_processing(self):
        """Open each loaded product page, refresh its prices and flag it done.

        For every row in ``self.items`` the page at ``item[3]`` is loaded,
        ``product_info`` is run, and the tracker row whose
        ``product_page_url_hash`` equals ``item[4]`` gets ``flag = 1``.
        A failure on one product is logged and the loop continues with the
        next one. The browser is always shut down at the end.
        """
        op = webdriver.ChromeOptions()
        op.add_argument('--no-sandbox')
        op.add_argument('--disable-notifications')
        op.add_argument("--lang=en-GB")
        op.add_argument('--user-data-dir=/home/ec2-user/chrome_cache/')
        driver = webdriver.Chrome(options=op)

        # Schema/table names come from the config and cannot be bound as
        # parameters; the hash value is bound by the driver instead of being
        # interpolated into the statement.
        update_sql = f"""
            update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1 where product_page_url_hash=%s
        """
        total = len(self.items)

        try:
            for count, item in enumerate(self.items, start=1):
                try:
                    logging.info("============== Getting info for %s/%s: %s ================", count, total, item)
                    start = datetime.now()

                    driver.get(item[3])
                    self.product_info(driver, item)

                    self.cur.execute(update_sql, (item[4],))
                    end = datetime.now()
                    logging.info('Total time taken to fetch the product: %s', end - start)
                    # Fixed pause between page loads.
                    time.sleep(5)
                except Exception:
                    # One bad page must not stop the run; keep the traceback.
                    logging.exception("Failed to process product %s/%s: %s", count, total, item)
        finally:
            # quit() ends the whole browser session even if the loop was
            # interrupted.
            driver.quit()

    def product_info(self, driver, item):
        """Scrape the current prices from the loaded page and store the product.

        The record starts as a copy of the stored row (see ``_ROW_FIELDS``);
        the min/max price and, when present on the page, the before-discount
        prices are overwritten with the scraped text, and the record is
        passed to ``self.db_writer.rce_product``.

        Args:
            driver: WebDriver that has already loaded the product page.
            item: One row from ``self.items``.

        Raises:
            Exception: Whatever the driver raises when neither price selector
                matches; the caller logs it and moves on.
        """
        data_product = {key: item[pos] for key, pos in self._ROW_FIELDS}

        try:
            price = driver.find_element(By.CSS_SELECTOR, '#sns-base-price').text.replace('AED', '')
        except Exception:
            # Fallback layout: price split into whole and fraction elements.
            price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
            price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
            price = price_whole + "." + price_fraction
        data_product['product_price_min'] = price
        data_product['product_price_max'] = price

        print("product_price_min: {}".format(data_product['product_price_min']))

        try:
            before_discount = driver.find_element(By.CSS_SELECTOR, '.a-text-price').text.replace('AED', '')
        except Exception:
            # Best effort: when the element is missing the stored
            # before-discount values are kept.
            logging.debug("No before-discount price found for %s", item[3])
        else:
            data_product['product_price_min_before_discount'] = before_discount
            data_product['product_price_max_before_discount'] = before_discount

        try:
            self.db_writer.rce_product(data_product)
        except Exception:
            # Logged at error level so a failed write is visible with the
            # default logging configuration.
            # NOTE(review): the caller still sets the tracker flag after a
            # failed write — confirm that this is intended.
            logging.exception("Failed to write product %s", item[3])
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
# Crawler settings handed to amazon_products_adhoc and amazon_db_writer.
#
# SECURITY: the database credentials below are committed in source. Each
# connection setting can be overridden through the environment variable named
# next to it; the literals remain only as backward-compatible defaults and
# should be rotated and removed once the environment is populated.
config = {
    "crawler_name": "raena_crawler_enginer_amazon",
    "crawler_schema": "raena_spider_management",
    "category_tab": "rce_category",
    "tracker_tab": "crawler_tracker",
    "product_tab": "rce_product",
    "variant_tab": "rce_product_variant",
    "brand_tab": "rce_brand",
    "reseller_tab": "rce_reseller",
    "reseller_store_tab": "rce_reseller_store",
    "review_tab": "rce_ratings_reviews",
    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
    "review_producttags_tab": "rce_ratings_reviews_producttags",
    "review_tags": "rce_tags",
    "source_tab": "rce_source",
    "product_per_category": "1000",
    "source_category": "11043145",
    "db_user": os.environ.get("RCE_DB_USER", "dbadmin"),
    "db_pass": os.environ.get("RCE_DB_PASS", "5qCif6eyY3Kmg4z"),
    "database": os.environ.get("RCE_DB_NAME", "analytics"),
    "db_host": os.environ.get(
        "RCE_DB_HOST",
        "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
    ),
    "db_port": os.environ.get("RCE_DB_PORT", "5432"),
    "crawler_main": "1",
    "crawler_slave_no": "",
}
|
||
|
|
||
|
# Build the crawler from the module-level config and run it. The instance gets
# its own name so the class stays reachable as amazon_products_adhoc.
crawler = amazon_products_adhoc(config)
crawler.start_processing()
|