"""Collect category links from an Amazon.ae listing page and store them.

The crawler opens the configured landing page in Chrome, picks the top-level
category tiles it is interested in, visits each of them, and hands every
category / sub-category it finds to ``amazon_db_writer.rce_category``.
"""
import hashlib
import logging
import ssl

import undetected_chromedriver as webdriver
import psycopg2
from selenium.webdriver.common.by import By
from pyvirtualdisplay import Display

from amazon_db_writer import amazon_db_writer

# SECURITY NOTE(review): this disables TLS certificate verification for every
# default HTTPS context created in this process. It is kept because the
# original module does it at import time; confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context

logger = logging.getLogger(__name__)


class amazon_categories:
    """Crawl top-level categories and their sub-categories into the database.

    Args:
        config: Mapping-like object exposing ``get``. Keys read here are
            ``crawler_name``, ``product_per_category``, ``database``,
            ``db_user``, ``db_pass``, ``db_host``, ``db_port``,
            ``crawler_schema`` and ``source_tab``. The same object is passed
            on to ``amazon_db_writer``.

    Raises:
        SystemExit: With code 1 when the source table has no ``Amazon`` row.
    """

    START_URL = "https://www.amazon.ae/s?rh=n%3A11497859031&ref=lp_11497860031_sar"

    # aria-labels of the landing-page tiles that are crawled.
    TOP_LEVEL_NAMES = frozenset(
        {'Perfumes', 'Skin care', 'Hair care', 'Bath & body', 'Makeup', 'Nail care'}
    )

    def __init__(self, config):
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.url = self.START_URL
        self.product_limit = int(self.config.get("product_per_category"))

        self.conn = psycopg2.connect(
            database=self.config.get('database'),
            user=self.config.get('db_user'),
            password=self.config.get('db_pass'),
            host=self.config.get('db_host'),
            port=self.config.get('db_port'),
        )
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

        # Schema and table names come from the crawler configuration, not
        # from user input; identifiers cannot be sent as bound parameters.
        self.cur.execute(
            "select id from " + self.config.get('crawler_schema') + "."
            + self.config.get('source_tab') + " where source_name='Amazon'"
        )
        row = self.cur.fetchone()
        if row is None:
            # fetchone() returns None when the query matched no row.
            logger.error("Source tab is empty. Please check. Exiting.....")
            raise SystemExit(1)
        self.rce_source_id = row[0]

        self.db_writer = amazon_db_writer(config)

        # A virtual display can be enabled for headless hosts:
        # self.display = Display(visible=0, size=(800, 600))
        # self.display.start()

    def __del__(self):
        """Close the database connection if one was opened."""
        # __init__ may have failed before self.conn was assigned.
        conn = getattr(self, "conn", None)
        if conn is None:
            return
        print("Closing connection.....")
        conn.close()
        # self.display.stop()

    def start_processing(self):
        """Launch Chrome, crawl the category tree, and always shut Chrome down."""
        op = webdriver.ChromeOptions()
        op.add_argument('--no-sandbox')
        op.add_argument('--disable-notifications')
        op.add_argument("--lang=en-GB")
        # op.headless = True
        # driver = webdriver.Chrome(version_main=113, options=op)
        driver = webdriver.Chrome(options=op)
        try:
            driver.get(self.url)
            driver.implicitly_wait(10)
            self.get_categories(driver)
        finally:
            # quit() ends the whole driver session even when crawling raised;
            # close() would only close the current window.
            driver.quit()

    def get_categories(self, driver):
        """Read the wanted top-level tiles from the current page and crawl them.

        Args:
            driver: WebDriver already positioned on the landing page.
        """
        categories = []
        tiles = driver.find_elements(
            By.CSS_SELECTOR, '.bxc-grid__image.bxc-grid__image--light'
        )
        for tile in tiles:
            anchors = tile.find_elements(By.TAG_NAME, 'a')
            if not anchors:
                # A tile without a link cannot be a category; skip it rather
                # than aborting the whole crawl.
                continue
            anchor = anchors[0]
            name = anchor.get_attribute('aria-label')
            if name in self.TOP_LEVEL_NAMES:
                categories.append(
                    {"name": name, "link": anchor.get_attribute('href')}
                )

        print(categories)
        self.get_sub_categories(driver, categories)

    def get_sub_categories(self, driver, categories):
        """Store every category and the sub-categories listed on its page.

        Args:
            driver: WebDriver used to open each category page.
            categories: Iterable of dicts with ``"name"`` and ``"link"`` keys.
        """
        # NOTE: nothing is appended to this list; it is kept only so that the
        # final print produces the same output as before.
        sub_categories = []

        for category in categories:
            print("=============== {} ===============".format(category["name"]))
            self._save_category(category["name"], category["link"])
            driver.get(category["link"])
            self._save_featured_categories(driver)
            self._save_shop_by_categories(driver)

        print(sub_categories)

    def _save_category(self, name, url):
        """Build one category row and pass it to the database writer."""
        # NOTE(review): every row, including sub-categories, is written with
        # parent_category_id 0 - confirm that this is intended.
        data = {
            'parent_category_id': 0,
            'rce_source_id': self.rce_source_id,
            'rce_source_category_id': 0,
            'rce_source_status': 1,
            'category_name': name,
            'category_page_url': url,
            'category_page_url_hash': hashlib.md5(url.encode('utf-8')).hexdigest(),
        }
        self.db_writer.rce_category(data)

    def _save_featured_categories(self, driver):
        """Store the "featured categories" cards of the current page, if any."""
        try:
            # Presence probe: find_element raises when the section is absent.
            driver.find_element(
                By.CSS_SELECTOR,
                '.octopus-pc-category-card-v2-title .a-size-extra-large',
            )
            container = driver.find_element(
                By.CSS_SELECTOR, '.a-section.octopus-pc-category-card-v2-content'
            )
            cards = container.find_elements(
                By.CSS_SELECTOR, '.octopus-pc-category-card-v2-item'
            )
            for card in cards:
                cat_name = card.find_element(
                    By.CSS_SELECTOR, '.a-size-medium.a-color-base.a-text-bold'
                ).text
                url = card.find_element(
                    By.CSS_SELECTOR,
                    '.a-link-normal.octopus-pc-category-card-v2-category-link',
                ).get_attribute("href")
                self._save_category(cat_name, url)

                # Sub-category links are best-effort: a failure here must not
                # stop the remaining cards from being processed.
                try:
                    links = card.find_elements(
                        By.CSS_SELECTOR,
                        '.a-link-normal.octopus-pc-category-card-v2-subcategory-link',
                    )
                    for link in links:
                        s_url = link.get_attribute('href')
                        s_title = link.get_attribute('title')
                        self._save_category(s_title, s_url)
                except Exception:
                    logger.debug("Skipping sub-category links", exc_info=True)
        except Exception:
            print("Feature Cat not available.")
            logger.debug("Featured categories lookup failed", exc_info=True)

    def _save_shop_by_categories(self, driver):
        """Store the "shop by category" carousel of the current page, if any."""
        try:
            # Two page layouts are known; either header marks the section.
            try:
                driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-header')
            except Exception:
                driver.find_element(By.CSS_SELECTOR, '#contentGrid_292470')

            row = driver.find_element(
                By.CSS_SELECTOR, '.sl-sobe-carousel-viewport-row-inner'
            )
            for item in row.find_elements(By.TAG_NAME, 'li'):
                cat_name = item.find_element(
                    By.CSS_SELECTOR, '.sl-sobe-carousel-sub-card-title'
                ).text
                url = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
                self._save_category(cat_name, url)
        except Exception:
            print('Cat not available')
            logger.debug("Shop-by-category lookup failed", exc_info=True)


# Example usage:
# categories = amazon_categories(config)
# categories.start_processing()