# File: raena-crawler-engine/amazon_crawler_engine/amazon_categories.py
# Repository web-view header: 194 lines, 7.0 KiB, Python; history entry dated 2024-01-24 13:05:07 +00:00.
import hashlib
import logging
import undetected_chromedriver as webdriver
import psycopg2
from selenium.webdriver.common.by import By
from pyvirtualdisplay import Display
from amazon_db_writer import amazon_db_writer
import ssl
# Replace the ssl module's default HTTPS context factory with the unverified
# one, so HTTPS clients that rely on that factory skip certificate checks for
# the whole process.
# NOTE(review): this weakens TLS security globally; presumably a workaround for
# certificate errors during a driver download -- confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context
class amazon_categories:
    """Crawl Amazon.ae category pages and store them through the DB writer.

    The crawler opens ``self.url``, picks the category tiles whose label is in
    a fixed allow-list, stores each of them with ``db_writer.rce_category`` and
    then visits every category page to store the categories listed in its
    "featured categories" and "shop by category" sections.
    """

    def __init__(self, config):
        """Read settings from *config*, connect to PostgreSQL and look up the source id.

        Args:
            config: Mapping-like object providing ``crawler_name``,
                ``product_per_category``, ``database``, ``db_user``,
                ``db_pass``, ``db_host``, ``db_port``, ``crawler_schema`` and
                ``source_tab`` through ``.get()``.

        Raises:
            SystemExit: With exit code 1 when the source table has no row
                whose ``source_name`` is ``'Amazon'``.
        """
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.url = "https://www.amazon.ae/s?rh=n%3A11497859031&ref=lp_11497860031_sar"
        self.product_limit = int(self.config.get("product_per_category"))

        self.conn = psycopg2.connect(
            database=self.config.get('database'),
            user=self.config.get('db_user'),
            password=self.config.get('db_pass'),
            host=self.config.get('db_host'),
            port=self.config.get('db_port'),
        )
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

        # Schema and table names come from the crawler configuration; they are
        # identifiers, so they cannot be passed as query parameters.
        self.cur.execute(
            "select id from " + self.config.get('crawler_schema') + "."
            + self.config.get('source_tab') + " where source_name='Amazon'"
        )
        row = self.cur.fetchone()
        if row is None:
            # Logged as an error so the reason is visible with default logging.
            logging.error("Source tab is empty. Please check. Exiting.....")
            raise SystemExit(1)
        self.rce_source_id = row[0]

        self.db_writer = amazon_db_writer(config)

    def __del__(self):
        """Close the database connection, if one was ever opened."""
        # __init__ may have failed before self.conn was assigned; the finalizer
        # still runs on such a partially built instance.
        conn = getattr(self, "conn", None)
        if conn is None:
            return
        print("Closing connection.....")
        conn.close()

    def start_processing(self):
        """Launch Chrome, crawl the categories and always shut the browser down."""
        op = webdriver.ChromeOptions()
        op.add_argument('--no-sandbox')
        op.add_argument('--disable-notifications')
        op.add_argument("--lang=en-GB")

        driver = webdriver.Chrome(options=op)
        try:
            driver.get(self.url)
            driver.implicitly_wait(10)
            self.get_categories(driver)
        finally:
            # quit() ends the whole session, unlike close() which only closes
            # the current window; the finally also covers a failed crawl.
            driver.quit()

    def get_categories(self, driver):
        """Collect the allow-listed category tiles from the loaded page.

        Prints the collected categories and hands them to
        ``get_sub_categories``.

        Args:
            driver: Selenium driver with the landing page already loaded.
        """
        tiles = driver.find_elements(By.CSS_SELECTOR, '.bxc-grid__image.bxc-grid__image--light')
        names = ('Perfumes', 'Skin care', 'Hair care', 'Bath & body', 'Makeup', 'Nail care')

        categories = []
        for tile in tiles:
            anchors = tile.find_elements(By.TAG_NAME, 'a')
            if not anchors:
                # A tile without a link carries nothing to store; skip it
                # instead of aborting the whole crawl.
                continue
            anchor = anchors[0]
            name = anchor.get_attribute('aria-label')
            if name in names:
                categories.append({"name": name, "link": anchor.get_attribute('href')})

        print(categories)
        self.get_sub_categories(driver, categories)

    def get_sub_categories(self, driver, categories):
        """Store every category, then the categories listed on its own page.

        Args:
            driver: Selenium driver used to open each category page.
            categories: Iterable of dicts with ``"name"`` and ``"link"`` keys.
        """
        # Never populated; still printed at the end so the console output
        # stays the same as before.
        sub_categories = []

        for category in categories:
            print("=============== {} ===============".format(category["name"]))
            self._write_category(category["name"], category["link"])
            driver.get(category["link"])
            self._store_featured_categories(driver)
            self._store_shop_by_categories(driver)

        print(sub_categories)

    def _write_category(self, name, url):
        """Build one category record and pass it to ``db_writer.rce_category``."""
        # NOTE(review): parent_category_id is 0 for every record, including
        # nested ones -- presumably the hierarchy is resolved elsewhere; confirm.
        data = {
            'parent_category_id': 0,
            'rce_source_id': self.rce_source_id,
            'rce_source_category_id': 0,
            'rce_source_status': 1,
            'category_name': name,
            'category_page_url': url,
            'category_page_url_hash': hashlib.md5(url.encode('utf-8')).hexdigest(),
        }
        self.db_writer.rce_category(data)

    def _store_featured_categories(self, driver):
        """Store the cards of the featured-categories section and their sub-links."""
        try:
            # Presence check only: find_element raises when the section title
            # is missing, which is reported below.
            driver.find_element(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-title .a-size-extra-large')
            container = driver.find_element(By.CSS_SELECTOR, '.a-section.octopus-pc-category-card-v2-content')
            cards = container.find_elements(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-item')
            for card in cards:
                cat_name = card.find_element(By.CSS_SELECTOR, '.a-size-medium.a-color-base.a-text-bold').text
                url = card.find_element(
                    By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-category-link'
                ).get_attribute("href")
                self._write_category(cat_name, url)

                try:
                    links = card.find_elements(
                        By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-subcategory-link'
                    )
                    for link in links:
                        self._write_category(link.get_attribute('title'), link.get_attribute('href'))
                except Exception:
                    # Best effort: a broken sub-link must not stop the other
                    # cards, but the failure is recorded instead of hidden.
                    logging.exception("Failed to store sub-categories of %s", cat_name)
        except Exception:
            print("Feature Cat not available.")
            logging.debug("Featured categories lookup failed", exc_info=True)

    def _store_shop_by_categories(self, driver):
        """Store the entries of the shop-by-category carousel."""
        try:
            # Presence check: the section is identified by its carousel header
            # or, failing that, by a fixed content-grid id.
            try:
                driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-header')
            except Exception:
                driver.find_element(By.CSS_SELECTOR, '#contentGrid_292470')

            row = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-viewport-row-inner')
            for item in row.find_elements(By.TAG_NAME, 'li'):
                cat_name = item.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-sub-card-title').text
                url = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
                self._write_category(cat_name, url)
        except Exception:
            print('Cat not available')
            logging.debug("Shop-by-category lookup failed", exc_info=True)
# Example usage (the constructor requires the crawler config, see __init__):
# categories = amazon_categories(config)
# categories.start_processing()