import hashlib
import logging
import random
import sys
import string
#from selenium import webdriver
import undetected_chromedriver as webdriver
from selenium.webdriver.common.by import By
import psycopg2
import time
import re
from amazon_db_writer import amazon_db_writer
from datetime import datetime
from pyvirtualdisplay import Display

import ssl
# Replaces the default HTTPS context factory with the unverified one, i.e.
# TLS certificate verification is disabled for code using the default context.
ssl._create_default_https_context = ssl._create_unverified_context


class amazon_products_adhoc:
    """Ad-hoc re-crawl of product pages whose stored minimum price is empty.

    On construction the rows to process are loaded from the product table
    (``rce_source_id=66`` and ``product_price_min = ''``).
    ``start_processing`` then opens each product page in Chrome, scrapes the
    price fields and passes the row to ``amazon_db_writer.rce_product``.
    """

    # (data_product key, index into the fetched row), in the order the keys
    # are populated. The rows come from ``select *``, so the indices are
    # positional.
    # NOTE(review): presumably these follow the product table's column
    # order -- confirm against the schema before changing the table.
    _ROW_FIELDS = (
        ('rce_source_product_id', 1),
        ('rce_source_id', 21),
        ('rce_source_product_status', 2),
        ('product_page_url', 3),
        ('product_page_url_hash', 4),
        ('rce_category_id', 5),
        ('rce_brand_id', 6),
        ('rce_store_id', 7),
        ('rce_source_product_name', 8),
        ('product_images', 9),
        ('product_description', 10),
        ('product_sold_total', 11),
        ('product_sold', 12),
        ('product_price_min', 13),
        ('product_price_min_before_discount', 14),
        ('product_price_max', 15),
        ('product_price_max_before_discount', 16),
        ('ratings', 17),
        ('product_section', 22),
    )

    def __init__(self, config):
        """Connect to the database and load the rows that need a price.

        Args:
            config: Mapping with the database connection settings
                (``database``, ``db_user``, ``db_pass``, ``db_host``,
                ``db_port``) and the schema/table names
                (``crawler_schema``, ``product_tab``, ``tracker_tab``).
                It is also passed unchanged to ``amazon_db_writer``.
        """
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.pattern = r'[' + string.punctuation + ']'

        self.conn = psycopg2.connect(
            database=self.config.get('database'),
            user=self.config.get('db_user'),
            password=self.config.get('db_pass'),
            host=self.config.get('db_host'),
            port=self.config.get('db_port'),
        )
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

        # Schema and table names are identifiers taken from the config, so
        # they cannot be sent as bound parameters and stay interpolated.
        sql = (
            f"select * from {self.config.get('crawler_schema')}."
            f"{self.config.get('product_tab')} "
            "where rce_source_id=66 and product_price_min= '' order by id desc"
        )
        self.cur.execute(sql)
        self.items = self.cur.fetchall()

        self.db_writer = amazon_db_writer(config)

        #self.display = Display(visible=0, size=(800, 600))
        #self.display.start()

    def __del__(self):
        """Close the database connection if one was opened."""
        print("Closing connection.....")
        # __init__ may have failed before self.conn was assigned (for
        # example when psycopg2.connect raised); do not raise from __del__.
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()
        #self.display.stop()

    def start_processing(self):
        """Visit every loaded product page and refresh its price data.

        For each row: open the product URL (``item[3]``), run
        ``product_info`` and then set ``flag = 1`` in the tracker table for
        the row's URL hash (``item[4]``). After a successfully processed
        item the loop sleeps for 5 seconds. A failure on one item is logged
        and the loop continues with the next one. The browser is always
        shut down when the method exits.
        """
        op = webdriver.ChromeOptions()
        op.add_argument('--no-sandbox')
        op.add_argument('--disable-notifications')
        op.add_argument("--lang=en-GB")
        op.add_argument('--user-data-dir=/home/ec2-user/chrome_cache/')
        driver = webdriver.Chrome(options=op)

        total = len(self.items)
        # The hash value is sent as a bound parameter instead of being
        # formatted into the SQL text; only the identifiers are interpolated.
        update_sql = (
            f"update {self.config.get('crawler_schema')}."
            f"{self.config.get('tracker_tab')} "
            "set flag = 1 where product_page_url_hash = %s"
        )

        try:
            for count, item in enumerate(self.items, start=1):
                try:
                    logging.info(
                        "============== Getting info for %s/%s: %s ================",
                        count, total, item,
                    )
                    start = datetime.now()

                    driver.get(item[3])
                    self.product_info(driver, item)
                    # NOTE(review): product_info swallows DB-write errors,
                    # so the tracker is flagged even if the write failed --
                    # confirm this is intended.
                    self.cur.execute(update_sql, (item[4],))

                    end = datetime.now()
                    logging.info(
                        'Total time taken to fetch the product: %s',
                        end - start,
                    )
                    time.sleep(5)
                except Exception:
                    # Best effort: record the failure with its traceback and
                    # move on to the next product.
                    logging.exception(
                        "Failed to process item %s/%s: %s", count, total, item
                    )
        finally:
            # quit() ends the whole WebDriver session (browser and driver
            # process); running it in ``finally`` also covers interruption.
            driver.quit()

    def product_info(self, driver, item):
        """Scrape price fields from the loaded page and write the product.

        Args:
            driver: WebDriver that has already loaded the product page.
            item: Row fetched in ``__init__``; copied into the product dict
                according to ``_ROW_FIELDS``.

        Raises:
            Exception: Whatever the driver raises when neither the
                ``#sns-base-price`` element nor the
                ``.reinventPricePriceToPayMargin`` whole/fraction elements
                can be read. Errors from the before-discount lookup and
                from the database write are logged and not re-raised.
        """
        data_product = {key: item[index] for key, index in self._ROW_FIELDS}

        # Selector attempts that used to sit here as commented-out code
        # ('#corePrice_desktop ... apexPriceToPay' and
        # '#sns-base-price > div > ... apexPriceToPay') were removed as dead
        # code; see version control if they are needed again.
        try:
            price = driver.find_element(
                By.CSS_SELECTOR, '#sns-base-price'
            ).text.replace('AED', '')
        except Exception:
            # Fallback: assemble the price from its whole and fraction parts.
            price_whole = driver.find_element(
                By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole'
            ).text
            price_fraction = driver.find_element(
                By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction'
            ).text
            price = price_whole + "." + price_fraction
        data_product['product_price_min'] = price
        data_product['product_price_max'] = price

        print("product_price_min: {}".format(data_product['product_price_min']))

        try:
            before_discount = driver.find_element(
                By.CSS_SELECTOR, '.a-text-price'
            ).text.replace('AED', '')
        except Exception:
            # Optional field: keep the values copied from the row.
            logging.debug(
                "Could not read '.a-text-price'; keeping stored "
                "before-discount values",
                exc_info=True,
            )
        else:
            data_product['product_price_min_before_discount'] = before_discount
            data_product['product_price_max_before_discount'] = before_discount

        try:
            self.db_writer.rce_product(data_product)
        except Exception:
            # Logged with traceback at ERROR level so the failure is visible
            # with the default logging configuration; still not re-raised.
            logging.exception(
                "Failed to write product %s",
                data_product['product_page_url_hash'],
            )


# NOTE(review): the database credentials below are committed in source.
# They should be moved to environment variables or a secret store and the
# password rotated.
config = {
    "crawler_name": "raena_crawler_enginer_amazon",
    "crawler_schema": "raena_spider_management",
    "category_tab": "rce_category",
    "tracker_tab": "crawler_tracker",
    "product_tab": "rce_product",
    "variant_tab": "rce_product_variant",
    "brand_tab": "rce_brand",
    "reseller_tab": "rce_reseller",
    "reseller_store_tab": "rce_reseller_store",
    "review_tab": "rce_ratings_reviews",
    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
    "review_producttags_tab": "rce_ratings_reviews_producttags",
    "review_tags": "rce_tags",
    "source_tab": "rce_source",
    "product_per_category": "1000",
    "source_category": "11043145",
    "db_user": "dbadmin",
    "db_pass": "5qCif6eyY3Kmg4z",
    "database": "analytics",
    "db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
    "db_port": "5432",
    "crawler_main": "1",
    "crawler_slave_no": ""
}

# Script entry point. Kept at module level, and still rebinding the class
# name to the instance, so the module's import-time behaviour is unchanged.
amazon_products_adhoc = amazon_products_adhoc(config)
amazon_products_adhoc.start_processing()