194 lines
7.0 KiB
Python
194 lines
7.0 KiB
Python
import hashlib
|
|
import logging
|
|
import undetected_chromedriver as webdriver
|
|
import psycopg2
|
|
from selenium.webdriver.common.by import By
|
|
from pyvirtualdisplay import Display
|
|
|
|
from amazon_db_writer import amazon_db_writer
|
|
import ssl
|
|
# Replaces the ssl module's default HTTPS context factory with the unverified
# one, so HTTPS contexts created through this factory skip certificate checks.
# This is a process-wide change made at import time.
# NOTE(review): presumably a workaround for TLS failures while the Chrome
# driver is being downloaded - confirm it is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context
|
|
|
|
|
|
class amazon_categories:
    """Crawl Amazon.ae category pages and hand every category to the DB writer.

    The crawl starts from a fixed landing page, keeps the tiles whose
    ``aria-label`` is listed in ``CATEGORY_NAMES``, then opens each of those
    category pages and records the entries of its "featured categories" card
    and of its "shop by category" carousel.

    Every category is passed to ``self.db_writer.rce_category`` as a dict built
    by ``_category_row``.
    """

    # Landing-page tiles are crawled only when their aria-label is in this set.
    CATEGORY_NAMES = frozenset(
        ('Perfumes', 'Skin care', 'Hair care', 'Bath & body', 'Makeup', 'Nail care')
    )

    def __init__(self, config):
        """Open the database connection and resolve the id of the Amazon source.

        Args:
            config: Mapping-like object exposing ``get``. The keys read here are
                ``crawler_name``, ``product_per_category``, ``database``,
                ``db_user``, ``db_pass``, ``db_host``, ``db_port``,
                ``crawler_schema`` and ``source_tab``.

        Raises:
            SystemExit: With status 1 when the source table has no row whose
                ``source_name`` is ``'Amazon'``.
            TypeError / ValueError: When ``product_per_category`` is missing or
                is not convertible with ``int``.
        """
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.url = "https://www.amazon.ae/s?rh=n%3A11497859031&ref=lp_11497860031_sar"
        self.product_limit = int(self.config.get("product_per_category"))

        self.conn = psycopg2.connect(
            database=self.config.get('database'),
            user=self.config.get('db_user'),
            password=self.config.get('db_pass'),
            host=self.config.get('db_host'),
            port=self.config.get('db_port'),
        )
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

        # The schema and table names are identifiers taken from the crawler
        # configuration, so they are spliced into the statement text.
        self.cur.execute(
            "select id from " + self.config.get('crawler_schema') + "."
            + self.config.get('source_tab') + " where source_name='Amazon'"
        )

        # fetchone() yields None when the query matched nothing. Checking for
        # that explicitly keeps genuine database errors from being reported as
        # "table is empty".
        source_row = self.cur.fetchone()
        if source_row is None:
            # Logged at ERROR so the reason for exiting is visible with the
            # default logging configuration.
            logging.error("Source tab is empty. Please check. Exiting.....")
            raise SystemExit(1)
        self.rce_source_id = source_row[0]

        self.db_writer = amazon_db_writer(config)

        # NOTE: the pyvirtualdisplay-based virtual display is currently not
        # started; the browser opens on the real display.

    def __del__(self):
        """Close the database connection, if one was ever opened."""
        # __init__ can fail before self.conn is assigned (bad config, refused
        # connection); the finalizer must not raise in that case.
        connection = getattr(self, "conn", None)
        if connection is None:
            return
        print("Closing connection.....")
        connection.close()

    def start_processing(self):
        """Launch Chrome, crawl all categories, and always shut the browser down."""
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-notifications')
        options.add_argument("--lang=en-GB")

        driver = webdriver.Chrome(options=options)
        try:
            driver.get(self.url)
            driver.implicitly_wait(10)
            self.get_categories(driver)
        finally:
            # quit() ends the whole browser session; running it in `finally`
            # prevents a leaked browser process when the crawl raises.
            driver.quit()

    def get_categories(self, driver):
        """Collect the wanted top-level categories and crawl each of them.

        Args:
            driver: Selenium driver already showing the landing page.
        """
        tiles = driver.find_elements(
            By.CSS_SELECTOR, '.bxc-grid__image.bxc-grid__image--light'
        )

        categories = []
        for tile in tiles:
            anchor = tile.find_element(By.TAG_NAME, 'a')
            name = anchor.get_attribute('aria-label')
            if name in self.CATEGORY_NAMES:
                categories.append({
                    "name": name,
                    "link": anchor.get_attribute('href'),
                })

        print(categories)
        self.get_sub_categories(driver, categories)

    def get_sub_categories(self, driver, categories):
        """Store each category, then store the categories found on its page.

        Args:
            driver: Selenium driver used to open every category page.
            categories: Iterable of dicts with the keys ``"name"`` and
                ``"link"``.
        """
        # Never filled: the code that used to append to it is disabled. It is
        # still printed at the end so the console output stays the same.
        sub_categories = []

        for category in categories:
            print("=============== {} ===============".format(category["name"]))

            # A category without a link can be neither hashed nor opened.
            if not self._save_category(category["name"], category["link"]):
                continue

            driver.get(category["link"])
            self._crawl_featured_categories(driver)
            self._crawl_shop_by_categories(driver)

        print(sub_categories)

    def _category_row(self, name, url):
        """Build the dict handed to the DB writer for one category."""
        # NOTE(review): parent_category_id is 0 for nested entries as well -
        # confirm that the writer is expected to resolve the hierarchy itself.
        return {
            'parent_category_id': 0,
            'rce_source_id': self.rce_source_id,
            'rce_source_category_id': 0,
            'rce_source_status': 1,
            'category_name': name,
            'category_page_url': url,
            'category_page_url_hash': hashlib.md5(url.encode('utf-8')).hexdigest(),
        }

    def _save_category(self, name, url):
        """Write one category through the DB writer; return False if skipped."""
        if not url:
            # get_attribute() gives None for a missing href; skip this entry
            # instead of failing on the hash and losing the rest of the page.
            logging.debug("Skipping category %r: no URL", name)
            return False
        self.db_writer.rce_category(self._category_row(name, url))
        return True

    def _crawl_featured_categories(self, driver):
        """Store the entries of the "featured categories" card, when present."""
        try:
            # Presence probe: raises when the page has no featured card title.
            driver.find_element(
                By.CSS_SELECTOR,
                '.octopus-pc-category-card-v2-title .a-size-extra-large',
            )
            container = driver.find_element(
                By.CSS_SELECTOR, '.a-section.octopus-pc-category-card-v2-content'
            )
            cards = container.find_elements(
                By.CSS_SELECTOR, '.octopus-pc-category-card-v2-item'
            )
            for card in cards:
                cat_name = card.find_element(
                    By.CSS_SELECTOR, '.a-size-medium.a-color-base.a-text-bold'
                ).text
                url = card.find_element(
                    By.CSS_SELECTOR,
                    '.a-link-normal.octopus-pc-category-card-v2-category-link',
                ).get_attribute("href")
                self._save_category(cat_name, url)

                # Links nested under the card are best-effort: a failure here
                # must not stop the remaining cards from being stored.
                try:
                    links = card.find_elements(
                        By.CSS_SELECTOR,
                        '.a-link-normal.octopus-pc-category-card-v2-subcategory-link',
                    )
                    for link in links:
                        s_url = link.get_attribute('href')
                        s_title = link.get_attribute('title')
                        self._save_category(s_title, s_url)
                except Exception:
                    logging.debug(
                        "Sub-category links of %r skipped", cat_name, exc_info=True
                    )
        except Exception:
            # Exception (not a bare except) lets Ctrl-C and SystemExit through.
            logging.debug("Featured categories unavailable", exc_info=True)
            print("Feature Cat not available.")

    def _crawl_shop_by_categories(self, driver):
        """Store the entries of the "shop by category" carousel, when present."""
        try:
            # Presence probe with a fallback selector; raises when both miss.
            try:
                driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-header')
            except Exception:
                driver.find_element(By.CSS_SELECTOR, '#contentGrid_292470')

            carousel_row = driver.find_element(
                By.CSS_SELECTOR, '.sl-sobe-carousel-viewport-row-inner'
            )
            for item in carousel_row.find_elements(By.TAG_NAME, 'li'):
                cat_name = item.find_element(
                    By.CSS_SELECTOR, '.sl-sobe-carousel-sub-card-title'
                ).text
                url = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
                self._save_category(cat_name, url)
        except Exception:
            logging.debug("Shop-by categories unavailable", exc_info=True)
            print('Cat not available')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# categories = amazon_categories()
|
|
# categories.start_processing() |