"""Selenium crawler that scrapes Amazon.ae product pages into the crawler database.

Pending product URLs are read from the tracker table, each page is opened in an
undetected-chromedriver Chrome session, and the scraped reseller, store, brand,
product, variant and review records are handed to ``amazon_db_writer``.
"""
import hashlib
import logging
import random
import re
import ssl
import string
import sys
import time
from datetime import datetime

import undetected_chromedriver as webdriver
from selenium.webdriver.common.by import By
import psycopg2
from pyvirtualdisplay import Display

from amazon_db_writer import amazon_db_writer

# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. TLS certificate verification is disabled for every
# default HTTPS context created in this process. Kept because it is an
# existing module-level side effect; confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context


class amazon_products:
    """Crawl queued Amazon.ae product pages and persist what is scraped.

    ``config`` is a mapping read with ``.get()``; the keys used here are
    ``database``, ``db_user``, ``db_pass``, ``db_host``, ``db_port``,
    ``crawler_schema``, ``crawler_name`` and the table-name keys
    ``source_tab``, ``tracker_tab``, ``reseller_tab``, ``reseller_store_tab``,
    ``brand_tab``, ``product_tab`` and ``review_tab``.

    Schema and table names are concatenated into SQL text (identifiers cannot
    be bound as parameters), so ``config`` is assumed to be trusted. All
    scraped values are bound as query parameters.
    """

    def __init__(self, config):
        """Open the database connection and load the pending tracker rows.

        Raises whatever ``psycopg2`` raises when the connection or the two
        start-up queries fail, and ``TypeError`` when no 'Amazon' source row
        exists (``fetchone()`` returns ``None``).
        """
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        # Character class matching any ASCII punctuation; used to clean titles.
        self.pattern = r'[' + string.punctuation + ']'

        self.conn = psycopg2.connect(
            database=self.config.get('database'),
            user=self.config.get('db_user'),
            password=self.config.get('db_pass'),
            host=self.config.get('db_host'),
            port=self.config.get('db_port'),
        )
        # Autocommit means a failed lookup never leaves an aborted transaction
        # behind, which the best-effort lookups below rely on.
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

        self.cur.execute(
            "select id from " + self._table('source_tab') + " where source_name='Amazon'"
        )
        self.rce_source_id = self.cur.fetchone()[0]

        # NOTE(review): the crawler name is hard-coded here although
        # ``crawler_name`` is also read from config above - confirm which one
        # is meant to drive the tracker filter.
        self.cur.execute(
            "select * from " + self._table('tracker_tab')
            + " where crawler_name='raena_crawler_enginer_amazon' and flag=0 order by id"
        )
        self.items = self.cur.fetchall()
        self.db_writer = amazon_db_writer(config)

        #self.display = Display(visible=0, size=(800, 600))
        #self.display.start()

    def __del__(self):
        """Close the database connection if one was opened."""
        # __init__ may have failed before ``self.conn`` was assigned.
        conn = getattr(self, 'conn', None)
        if conn is None:
            return
        print("Closing connection.....")
        conn.close()
        #self.display.stop()

    # ------------------------------------------------------------------ helpers

    def _table(self, table_key):
        """Return the schema-qualified table name configured under ``table_key``."""
        return self.config.get('crawler_schema') + "." + self.config.get(table_key)

    def _fetch_id(self, table_key, column, value):
        """Return ``id`` of the first row where ``column`` equals ``value``.

        Raises if the query fails or no row matches.
        """
        self.cur.execute(
            "select id from " + self._table(table_key) + " where " + column + " = %s",
            (value,),
        )
        return self.cur.fetchone()[0]

    def _lookup_id(self, table_key, column, value, default=""):
        """Best-effort variant of ``_fetch_id`` that returns ``default`` on failure."""
        try:
            return self._fetch_id(table_key, column, value)
        except Exception:
            logging.debug("No id found in %s for %s=%r", table_key, column, value)
            return default

    def _store_id_for(self, rce_reseller_id, default):
        """Return the store id linked to ``rce_reseller_id`` or ``default``."""
        if rce_reseller_id in (None, ""):
            # No reseller was resolved, so there is nothing to look up.
            return default
        return self._lookup_id('reseller_store_tab', 'rce_reseller_id', rce_reseller_id, default)

    @staticmethod
    def _text_of(root, selector, default=""):
        """Return the text of the first CSS match under ``root`` or ``default``."""
        try:
            return root.find_element(By.CSS_SELECTOR, selector).text
        except Exception:
            logging.debug("No element matched selector %r", selector)
            return default

    @staticmethod
    def _read_price(driver):
        """Return the displayed price as ``"whole.fraction"`` or ``None`` if absent."""
        try:
            whole = driver.find_element(
                By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
            fraction = driver.find_element(
                By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
        except Exception:
            logging.debug("Price element not found")
            return None
        return whole + "." + fraction

    @staticmethod
    def _collect_store_urls(driver):
        """Return the de-duplicated seller profile URLs found on the product page.

        The "other offers" panel is tried first; only when opening it fails is
        the single seller link of the buy box used instead.
        """
        store_urls = []
        try:
            driver.find_element(
                By.CSS_SELECTOR, '.a-icon.a-icon-arrow.a-icon-small.arrow-icon').click()
            time.sleep(5)
            for offer in driver.find_elements(By.CSS_SELECTOR, '#aod-offer-soldBy'):
                try:
                    link = offer.find_element(
                        By.CSS_SELECTOR, '.a-fixed-left-grid-col.a-col-right'
                    ).find_element(By.TAG_NAME, 'a')
                    store_urls.append(link.get_attribute('href'))
                except Exception:
                    logging.debug("Offer without a seller link skipped")
        except Exception:
            try:
                store_urls.append(
                    driver.find_element(
                        By.CSS_SELECTOR, '#sellerProfileTriggerId').get_attribute('href'))
            except Exception:
                logging.debug("No seller profile link on page")
        # dict.fromkeys de-duplicates while keeping page order, so the
        # "first seller" returned by reseller_info is deterministic.
        return list(dict.fromkeys(store_urls))

    def _new_reseller_record(self, reseller_name=""):
        """Return a reseller record pre-filled with defaults."""
        return {
            'rce_source_id': self.rce_source_id,
            'rce_source_reseller_status': 1,
            'reseller_name': reseller_name,
            'reseller_average_rating': 0.0,
            'reseller_description': "",
        }

    def _persist_reseller(self, data_reseller, store_url):
        """Write the reseller and its store; return the reseller id or ``None``.

        ``None`` means the reseller row could not be found after the write.
        """
        try:
            self.db_writer.rce_reseller(data_reseller)
        except Exception as e:
            logging.info(e)

        data_reseller_store = {
            'rce_source_store_status': 1,
            'store_page_url': store_url,
            'store_page_url_hash': hashlib.md5(store_url.encode('utf-8')).hexdigest(),
            'store_location': "",
            'rce_reseller_id': "",
            'rce_source_id': self.rce_source_id,
        }
        reseller_id = self._lookup_id(
            'reseller_tab', 'reseller_name', str(data_reseller['reseller_name']), default=None)
        if reseller_id is not None:
            data_reseller_store['rce_reseller_id'] = reseller_id

        try:
            self.db_writer.rce_reseller_store(data_reseller_store)
        except Exception as e:
            logging.info(e)
        return reseller_id

    @staticmethod
    def _read_description(driver):
        """Return the bullet-list texts followed by the rank line, concatenated."""
        parts = []
        try:
            bullet_list = driver.find_element(
                By.CSS_SELECTOR, '.a-unordered-list.a-vertical.a-spacing-mini')
            for bullet in bullet_list.find_elements(By.CSS_SELECTOR, '.a-list-item'):
                try:
                    parts.append(bullet.text)
                except Exception:
                    logging.debug("Unreadable description bullet skipped")
        except Exception:
            logging.debug("No description bullet list on page")
        try:
            # Absolute XPath taken over unchanged from the original code; it
            # depends on the exact page layout.
            parts.append(
                driver.find_element(
                    By.XPATH, '/html/body/div[2]/div/div[6]/div[24]/div/ul[1]'
                ).find_element(By.CSS_SELECTOR, '.a-list-item').text)
        except Exception:
            logging.debug("No rank line on page")
        return "".join(parts)

    def _clean_title(self, title):
        """Strip punctuation from ``title`` and double any single quotes."""
        return re.sub(self.pattern, '', title).replace("'", "''")

    def _record_variants(self, driver, product_name):
        """Click through each image swatch and write one variant record per swatch."""
        try:
            swatch_list = driver.find_element(
                By.CSS_SELECTOR,
                '.a-unordered-list.a-nostyle.a-button-list.a-declarative.a-button-toggle-group'
                '.a-horizontal.a-spacing-top-micro.swatches.swatchesSquare.imageSwatches')
            variants = swatch_list.find_elements(By.TAG_NAME, 'li')
            #random.shuffle(variants)

            # The parent product id does not change between swatches, so it is
            # resolved once instead of once per variant.
            rce_product_id = self._lookup_id(
                'product_tab', 'rce_source_product_name', str(product_name), default="")

            for variant in variants:
                variant.click()
                data_variant = {
                    'rce_source_variant_id': 0,
                    'rce_product_id': rce_product_id,
                    'product_variant_name': "",
                    'product_variant_price': "",
                    'product_variant_price_before_discount': "",
                    'product_variant_stock': 0,
                }

                title = self._text_of(driver, '#productTitle', default=None)
                if title is not None:
                    data_variant['product_variant_name'] = self._clean_title(title)

                price = self._read_price(driver)
                if price is not None:
                    data_variant['product_variant_price'] = price

                list_price = self._text_of(driver, '.a-text-price', default=None)
                if list_price is not None:
                    data_variant['product_variant_price_before_discount'] = \
                        list_price.replace('AED', '')

                try:
                    self.db_writer.rce_product_variant(data_variant)
                except Exception as e:
                    logging.info(e)
                # Random pause between swatch clicks.
                time.sleep(random.randint(2, 5))
        except Exception:
            # Raised when the swatch list is missing or a swatch cannot be clicked.
            logging.info('No variant found')

    # --------------------------------------------------------------- public API

    def start_processing(self):
        """Process every pending tracker row, logging the time taken per row."""
        total = len(self.items)
        for count, item in enumerate(self.items, start=1):
            try:
                logging.info("============== Getting info for {}/{}: {} ================".format(str(count),str(total),str(item)))
                start = datetime.now()
                self.get_product_info(item)
                end = datetime.now()
                logging.info('Total time taken to fetch the product: {}'.format(str(end-start)))
            except Exception as e:
                print(e)

    def reseller_info(self, driver):
        """Scrape and store the seller(s) of the product page loaded in ``driver``.

        Visits every seller profile found on the page and writes one reseller
        and one store record per seller. When no seller link is found, a
        fixed "Amazon.ae" reseller/store pair is written instead.

        Returns the id of the first reseller whose row could be looked up,
        ``""`` when none could, or ``None`` when an unexpected error aborted
        the scrape (the error is printed).
        """
        try:
            store_urls = self._collect_store_urls(driver)
            # Initialised up front so both branches always have a value to return.
            return_item = ""

            if store_urls:
                captured = False
                for store_url in store_urls:
                    driver.get(store_url)
                    driver.implicitly_wait(5)

                    data_reseller = self._new_reseller_record()
                    # Quotes are stripped before the record is handed to the writer.
                    data_reseller['reseller_name'] = \
                        self._text_of(driver, '#seller-name').replace("'", "")
                    rating_text = self._text_of(
                        driver, '#effective-timeperiod-rating-year-description', default=None)
                    try:
                        data_reseller['reseller_average_rating'] = float(rating_text)
                    except (TypeError, ValueError):
                        # Missing or non-numeric rating: keep the 0.0 default.
                        logging.debug("Reseller rating unavailable: %r", rating_text)
                    data_reseller['reseller_description'] = self._text_of(
                        driver, '#spp-expander-about-seller .a-row').replace("'", "")

                    reseller_id = self._persist_reseller(data_reseller, store_url)
                    if reseller_id is not None and not captured:
                        return_item = reseller_id
                        captured = True
                    time.sleep(2)
            else:
                data_reseller = self._new_reseller_record("Amazon.ae")
                reseller_id = self._persist_reseller(data_reseller, "amazon.ae")
                if reseller_id is not None:
                    return_item = reseller_id

            return return_item
        except Exception as e:
            print(e)

    def brand_info(self, driver):
        """Scrape and store the brand of the product page loaded in ``driver``.

        Returns the brand name (possibly ``""``), or ``None`` when the byline
        link is missing and nothing was written.
        """
        data_brand = {
            'rce_source_id': self.rce_source_id,
            'rce_source_brand_status': 1,
            'brand_page_url': "",
            'brand_page_url_hash': "",
            'brand_name': "",
        }
        try:
            data_brand['brand_page_url'] = driver.find_element(
                By.CSS_SELECTOR, '#bylineInfo').get_attribute('href')
            data_brand['brand_page_url_hash'] = hashlib.md5(
                data_brand['brand_page_url'].encode('utf-8')).hexdigest()
        except Exception:
            logging.debug("No brand byline on page")
            return None

        data_brand['brand_name'] = self._text_of(driver, '.po-brand .po-break-word')
        try:
            self.db_writer.rce_brand(data_brand)
        except Exception as e:
            logging.info(e)
        return data_brand['brand_name']

    def product_info(self, driver, category, keyword, url, url_hash, brand_name, rce_reseller_id):
        """Scrape and store the product loaded in ``driver`` plus its variants.

        Args:
            driver: browser session already showing the product page.
            category: value stored as ``rce_category_id``.
            keyword: value stored as ``product_section``.
            url: product page URL (single quotes are doubled before storing).
            url_hash: value stored as ``product_page_url_hash``.
            brand_name: brand to resolve to ``rce_brand_id``; ``None`` skips
                the lookup.
            rce_reseller_id: reseller whose store id is resolved to
                ``rce_store_id``; ``None``/``""`` skips the lookup.

        Every field is best-effort: a missing page element leaves the default.
        """
        data_product = {
            'rce_source_product_id': 0,
            'rce_source_id': self.rce_source_id,
            'rce_source_product_status': 1,
            'product_page_url': url.replace("'", "''"),
            'product_page_url_hash': url_hash,
            'rce_category_id': category,
            'rce_brand_id': "",
            'rce_store_id': "",
            'rce_source_product_name': "",
            'product_images': "",
            'product_description': "",
            'product_sold_total': 0,
            'product_sold': 0,
            'product_price_min': "",
            'product_price_min_before_discount': "",
            'product_price_max': "",
            'product_price_max_before_discount': "",
            'ratings': 0.0,
            'product_section': keyword,
        }

        if brand_name is not None:
            data_product['rce_brand_id'] = self._lookup_id(
                'brand_tab', 'brand_name', str(brand_name), default="")
        data_product['rce_store_id'] = self._store_id_for(rce_reseller_id, default="")

        title = self._text_of(driver, '#productTitle', default=None)
        if title is not None:
            data_product['rce_source_product_name'] = self._clean_title(title)

        try:
            lens = driver.find_element(By.CSS_SELECTOR, '#magnifierLens')
            image_urls = [
                img.get_attribute('src') for img in lens.find_elements(By.TAG_NAME, 'img')
            ]
            # Stored as the string form of the list.
            data_product['product_images'] = str(image_urls)
        except Exception:
            logging.debug("No product images found")

        data_product['product_description'] = self._read_description(driver)

        price = self._read_price(driver)
        if price is not None:
            data_product['product_price_min'] = price
            data_product['product_price_max'] = price

        list_price = self._text_of(driver, '.a-text-price', default=None)
        if list_price is not None:
            list_price = list_price.replace('AED', '')
            data_product['product_price_min_before_discount'] = list_price
            data_product['product_price_max_before_discount'] = list_price

        data_product['ratings'] = self._text_of(
            driver, '#averageCustomerReviews .a-color-base', default=0.0)

        try:
            self.db_writer.rce_product(data_product)
        except Exception as e:
            logging.info(e)

        ### rce_product_variant
        self._record_variants(driver, data_product['rce_source_product_name'])

    def rating_info(self, driver, rce_reseller_id, url_hash):
        """Open the review list of the loaded product and store each review.

        Args:
            driver: browser session already showing the product page.
            rce_reseller_id: reseller whose store id is stored as ``shop_id``.
            url_hash: product page hash used to resolve ``rce_product_id``.

        Best-effort: any failure while opening or reading the reviews ends the
        method without raising.
        """
        try:
            driver.find_element(
                By.CSS_SELECTOR, '#reviews-medley-footer .a-link-emphasis').click()
            driver.implicitly_wait(5)
            review_elements = driver.find_elements(
                By.CSS_SELECTOR, '.a-section.review.aok-relative')
            if not review_elements:
                return

            # Identical for every review on the page, so resolved once.
            rce_product_id = self._lookup_id(
                'product_tab', 'product_page_url_hash', str(url_hash), default="")
            shop_id = self._store_id_for(rce_reseller_id, default=0)

            for review_element in review_elements:
                data_review = {
                    "id": "",
                    "rce_product_id": rce_product_id,
                    "username": "",
                    "review": "",
                    "img_url": "",
                    "review_like_count": 0,
                    "user_tier": "",
                    "shop_id": shop_id,
                    "video_url": "",
                    "rating": "",
                }

                # The next id is re-read per review because each write below
                # is expected to add a row to the review table.
                try:
                    self.cur.execute("select max(id) from " + self._table('review_tab'))
                    max_id = self.cur.fetchone()[0]
                    data_review["id"] = 1 if max_id is None else int(max_id) + 1
                except Exception:
                    logging.debug("Could not determine next review id")

                data_review["username"] = self._text_of(review_element, '.a-profile-name')
                data_review["review"] = self._text_of(
                    review_element, '.a-size-base.review-text.review-text-content'
                ).replace("'", "")

                try:
                    rating = review_element.find_element(
                        By.CSS_SELECTOR, '.a-icon.a-icon-star.review-rating .a-icon-alt'
                    ).get_attribute("textContent")
                    data_review["rating"] = rating.replace(' out of 5 stars', '')
                except Exception:
                    logging.debug("Review without a star rating")

                try:
                    self.db_writer.rce_ratings_reviews(data_review)
                except Exception as e:
                    logging.info(e)
        except Exception:
            logging.debug("Reviews could not be read for %r", url_hash)

    def get_product_info(self, item):
        """Crawl one tracker row end to end and mark it done.

        ``item`` is a tracker row; index 2 is passed on as the category,
        3 as the keyword, 4 as the product URL and 5 as the URL hash.

        A fresh browser is started per item and always shut down afterwards.
        Errors are printed, not raised; the tracker flag is only set when all
        steps ran without raising.
        """
        driver = None
        try:
            op = webdriver.ChromeOptions()
            op.add_argument('--no-sandbox')
            op.add_argument('--disable-notifications')
            op.add_argument("--lang=en-GB")
            op.add_argument('--user-data-dir=/home/ec2-user/chrome_cache/')
            #op.headless = True
            driver = webdriver.Chrome(options=op)

            # Visit the home page first; a failure here is not fatal.
            try:
                driver.get('https://www.amazon.ae')
                time.sleep(3)
            except Exception as e:
                print(e)

            ##### Reseller info #####
            driver.get(item[4])
            driver.implicitly_wait(5)
            rce_reseller_id = self.reseller_info(driver)

            ##### Product Info #####
            # reseller_info navigates away to seller pages, so reload the product.
            driver.get(item[4])
            driver.implicitly_wait(5)

            ##### Brand Info
            brand_name = self.brand_info(driver)

            ##### Product info
            self.product_info(driver, item[2], item[3], item[4], item[5], brand_name, rce_reseller_id)

            ##### Rating Info #####
            driver.get(item[4])
            driver.implicitly_wait(5)
            self.rating_info(driver, rce_reseller_id, item[5])

            self.cur.execute(
                "update " + self._table('tracker_tab')
                + " set flag = 1 where product_page_url_hash = %s",
                (item[5],),
            )
        except Exception as e:
            print(e)
        finally:
            # ``driver`` stays None when Chrome failed to start. quit() ends
            # the whole browser session, not just the current window.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    logging.info(e)