first commit

Shariar Imtiaz 2024-01-24 17:05:07 +04:00
commit 3154eec5ab
57 changed files with 7197 additions and 0 deletions

.DS_Store vendored Normal file

Binary file not shown.

README.md Normal file

@@ -0,0 +1,29 @@
# README #
This README would normally document whatever steps are necessary to get your application up and running.
### What is this repository for? ###
* Quick summary
* Version
* [Learn Markdown](https://bitbucket.org/tutorials/markdowndemo)
### How do I get set up? ###
* Summary of set up
* Configuration (see the example `conf.json` sketch after this list)
* Dependencies
* Database configuration
* How to run tests
* Deployment instructions
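
Dependencies (inferred from the imports in this commit, not pinned anywhere yet): `undetected-chromedriver`, `selenium`, `psycopg2`, `pyvirtualdisplay`, and the local `scroller` helper used for page scrolling.

The entry script reads a `conf.json` from its working directory. Below is a minimal sketch of that file, assuming the key names used in the commented sample config inside the crawler sources; every value is a placeholder rather than a real credential:

```json
{
  "crawler_name": "raena_crawler_enginer_amazon",
  "crawler_schema": "raena_spider_management",
  "category_tab": "rce_category",
  "tracker_tab": "crawler_tracker",
  "product_tab": "rce_product",
  "variant_tab": "rce_product_variant",
  "brand_tab": "rce_brand",
  "reseller_tab": "rce_reseller",
  "reseller_store_tab": "rce_reseller_store",
  "review_tab": "rce_ratings_reviews",
  "review_productmodels_tab": "rce_ratings_reviews_productmodels",
  "review_producttags_tab": "rce_ratings_reviews_producttags",
  "review_tags": "rce_tags",
  "source_tab": "rce_source",
  "product_per_category": "1000",
  "source_category": "11043145",
  "db_user": "<postgres-user>",
  "db_pass": "<postgres-password>",
  "database": "<database-name>",
  "db_host": "<database-host>",
  "db_port": "5432",
  "crawler_main": "1",
  "crawler_slave_no": ""
}
```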
### Contribution guidelines ###
* Writing tests
* Code review
* Other guidelines
### Who do I talk to? ###
* Repo owner or admin
* Other community or team contact

amazon_categories.py Normal file

@@ -0,0 +1,194 @@
import hashlib
import logging
import undetected_chromedriver as webdriver
import psycopg2
from selenium.webdriver.common.by import By
from pyvirtualdisplay import Display
from amazon_db_writer import amazon_db_writer
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
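# Collects the top-level beauty category tiles (Perfumes, Skin care, etc.) from the Amazon.ae landing page, then their featured and "shop by" sub-categories, and stores each one via amazon_db_writer.rce_category.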
class amazon_categories:
def __init__(self, config):
self.config = config
self.crawler_name = self.config.get("crawler_name")
self.url = "https://www.amazon.ae/s?rh=n%3A11497859031&ref=lp_11497860031_sar"
self.product_limit = int(self.config.get("product_per_category"))
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Amazon'")
try : self.rce_source_id = self.cur.fetchone()[0]
except:
logging.info("Source tab is empty. Please check. Exiting.....")
exit(1)
self.db_writer = amazon_db_writer(config)
#self.display = Display(visible=0, size=(800, 600))
#self.display.start()
def __del__(self):
print("Closing connection.....")
self.conn.close()
#self.display.stop()
def start_processing(self):
op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.add_argument("--lang=en-GB")
#op.headless = True
#driver=webdriver.Chrome(version_main = 113, options=op)
driver=webdriver.Chrome(options=op)
driver.get(self.url)
driver.implicitly_wait(10)
self.get_categories(driver)
driver.close()
def get_categories(self, driver):
#element = driver.find_elements(By.CSS_SELECTOR,'.bxc-grid__container.bxc-grid__container--width-1500.bxc-grid__mp-gutter-layout')
#sub_cats = element[0].find_elements(By.CSS_SELECTOR,'.bxc-grid__image.bxc-grid__image--light')
sub_cats = driver.find_elements(By.CSS_SELECTOR,'.bxc-grid__image.bxc-grid__image--light')
names = ['Perfumes', 'Skin care', 'Hair care', 'Bath & body', 'Makeup', 'Nail care']
categories = []
for sub_cat in sub_cats:
name = sub_cat.find_element(By.TAG_NAME, 'a').get_attribute('aria-label')
if name in names:
link = sub_cat.find_element(By.TAG_NAME, 'a').get_attribute('href')
category = {
"name": name,
"link": link
}
categories.append(category)
print(categories)
self.get_sub_categories(driver, categories)
def get_sub_categories(self,driver,categories):
sub_categories = []
for category in categories:
print("=============== {} ===============".format(category["name"]))
data = {}
data['parent_category_id'] = 0
data['rce_source_id'] = self.rce_source_id
data['rce_source_category_id'] = 0
data['rce_source_status'] = 1
data['category_name'] = category["name"]
data['category_page_url'] = category["link"]
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
self.db_writer.rce_category(data)
driver.get(category["link"])
##### Feature Categories
try:
f_cat = driver.find_element(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-title .a-size-extra-large')
if f_cat:
cats_c = driver.find_element(By.CSS_SELECTOR, '.a-section.octopus-pc-category-card-v2-content')
cats = cats_c.find_elements(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-item')
for cat in cats:
cat_name = cat.find_element(By.CSS_SELECTOR, '.a-size-medium.a-color-base.a-text-bold').text
url = cat.find_element(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-category-link').get_attribute("href")
# print('Name: {}, URL: {}'.format(cat_name,url))
# s_cat = {
# "name": cat_name,
# "link": url
# }
# sub_categories.append(s_cat)
data = {}
data['parent_category_id'] = 0
data['rce_source_id'] = self.rce_source_id
data['rce_source_category_id'] = 0
data['rce_source_status'] = 1
data['category_name'] = cat_name
data['category_page_url'] = url
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
self.db_writer.rce_category(data)
try:
sub_cats = cat.find_elements(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-subcategory-link')
for sub_cat in sub_cats:
s_url = sub_cat.get_attribute('href')
s_title = sub_cat.get_attribute('title')
# print('Title: {}, URL: {}'.format(s_title, s_url))
# s_cat = {
# "name": s_title,
# "link": s_url
# }
# sub_categories.append(s_cat)
data = {}
data['parent_category_id'] = 0
data['rce_source_id'] = self.rce_source_id
data['rce_source_category_id'] = 0
data['rce_source_status'] = 1
data['category_name'] = s_title
data['category_page_url'] = s_url
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
self.db_writer.rce_category(data)
except:
pass
except:
print("Feature Cat not available.")
pass
##### Shop by categories
try:
try:
cat_h = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-header')
except:
cat_h = driver.find_element(By.CSS_SELECTOR, '#contentGrid_292470')
pass
if cat_h:
cats_c = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-viewport-row-inner')
cats = cats_c.find_elements(By.TAG_NAME, 'li')
for cat in cats:
cat_name = cat.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-sub-card-title').text
url = cat.find_element(By.TAG_NAME, 'a').get_attribute('href')
# print('Name: {}, URL: {}'.format(cat_name,url))
# s_cat = {
# "name": cat_name,
# "link": url
# }
# sub_categories.append(s_cat)
data = {}
data['parent_category_id'] = 0
data['rce_source_id'] = self.rce_source_id
data['rce_source_category_id'] = 0
data['rce_source_status'] = 1
data['category_name'] = cat_name
data['category_page_url'] = url
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
self.db_writer.rce_category(data)
except Exception as e:
print('Cat not available')
pass
print(sub_categories)
# categories = amazon_categories()
# categories.start_processing()

amazon_category_products.py Normal file

@@ -0,0 +1,186 @@
import hashlib
import logging
import undetected_chromedriver as webdriver
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
import psycopg2
from selenium.webdriver.common.by import By
from amazon_db_writer import amazon_db_writer
from pyvirtualdisplay import Display
from scroller.scroller import smartScroll
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
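# Visits every category page stored in the category table, collects section-level and paginated "base" product URLs, and records them in the crawler tracker table for later scraping.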
class amazon_category_products:
def __init__(self, config):
self.config = config
self.crawler_name = self.config.get("crawler_name")
#self.url = "https://www.amazon.ae/gp/browse.html?node=11497860031&ref_=nav_em_by_all_0_2_11_2"
self.product_limit = int(self.config.get("product_per_category"))
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
sql = "delete from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='"+str(self.crawler_name)+"'"
self.cur.execute(sql)
sql = "select id, category_page_url from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where rce_source_id = 66"
self.cur.execute(sql)
self.categories = self.cur.fetchall()
#self.display = Display(visible=0, size=(800, 600))
#self.display.start()
def __del__(self):
print("Closing connection.....")
self.conn.close()
#self.display.stop()
def start_processing(self):
for category in self.categories:
logging.info("======= Fetching products of {}".format(category))
self.browse_category_page(category)
def browse_category_page(self, catagory):
try:
op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.add_argument("--lang=en-GB")
#op.headless = True
#driver=webdriver.Chrome(version_main = 113, options=op)
driver=webdriver.Chrome(options=op)
driver.get(catagory[1])
driver.implicitly_wait(10)
#### Collect section name and section products ####
section_products = self.section_products(driver, catagory[0])
self.insert_tracker_tab(section_products)
#### Collect All products ####
self.base_products(driver, catagory[0])
driver.close()
except Exception as e:
print(e)
def section_products(self, driver, catagory):
elements = driver.find_elements(By.CSS_SELECTOR,".a-size-extra-large.a-color-base.a-text-bold")
section_name = []
for element in elements:
section_name.append(element.text)
elements = driver.find_elements(By.CSS_SELECTOR,".a-section.octopus-pc-card-content")
section_products = []
for element in elements:
objs = element.find_elements(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-item-link')
urls = []
for obj in objs:
url = obj.get_attribute("href")
urls.append(url)
section_products.append(urls)
result = []
for i in range(len(section_name)):
result.append({
"catagory": catagory,
"key": section_name[i],
"value": section_products[i]
})
return result
def insert_tracker_tab(self, objs):
for obj in objs:
category = obj['catagory']
key = obj['key']
items = obj['value']
for item in items:
product_page_url = item
product_page_url_hash = hashlib.md5(product_page_url.encode('utf-8')).hexdigest()
flag = 0
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where product_page_url = '"+product_page_url+"'"
self.cur.execute(sql)
res = self.cur.fetchall()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+"(crawler_name,category,keyword,product_page_url,product_page_url_hash,flag) values('"+str(self.crawler_name)+"','"+str(category)+"','"+str(key)+"','"+product_page_url+"','"+product_page_url_hash+"',"+str(flag)+")"
self.cur.execute(sql)
def base_products(self, driver, catagory):
try:
smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True)
all_res = driver.find_element(By.CSS_SELECTOR, '#apb-desktop-browse-search-see-all')
all_res.click()
driver.implicitly_wait(5)
for i in range(1,16):
items = driver.find_elements(By.CSS_SELECTOR, '.a-size-mini.a-spacing-none.a-color-base.s-line-clamp-4')
smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True)
urls = []
for item in items:
url = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
urls.append(url)
result = [{
"catagory": catagory,
"key": "Base Product Page {}".format(str(i)),
"value": urls
}]
self.insert_tracker_tab(result)
try:
driver.find_element(By.CSS_SELECTOR, '.s-pagination-next').click()
driver.implicitly_wait(5)
except:
logging.info("No more page to navigate......")
except:
pass
# config = {
# "crawler_name": "raena_crawler_enginer_amazon",
# "crawler_schema": "raena_spider_management",
# "category_tab": "rce_category",
# "tracker_tab": "crawler_tracker",
# "product_tab": "rce_product",
# "variant_tab": "rce_product_variant",
# "brand_tab": "rce_brand",
# "reseller_tab": "rce_reseller",
# "reseller_store_tab": "rce_reseller_store",
# "review_tab": "rce_ratings_reviews",
# "review_productmodels_tab": "rce_ratings_reviews_productmodels",
# "review_producttags_tab": "rce_ratings_reviews_producttags",
# "review_tags": "rce_tags",
# "source_tab": "rce_source",
# "product_per_category": "1000",
# "source_category": "11043145",
# "db_user": "postgres",
# "db_pass": "postgres",
# "database": "postgres",
# "db_host": "localhost",
# "db_port": "5444",
# "crawler_main": "1",
# "crawler_slave_no": ""
# }
# amazon_category_products = amazon_category_products(config)
# amazon_category_products.start_processing()


@@ -0,0 +1,98 @@
import logging
import psycopg2
import json
from datetime import datetime
import smtplib
from email.message import EmailMessage
from amazon_categories import amazon_categories
from amazon_category_products import amazon_category_products
from amazon_products import amazon_products
##### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
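# Entry point: loads conf.json, runs the product crawler, and sends an alert mail over SMTP if anything fails.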
config = {}
def send_mail():
try:
EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
From = 'data_reporting@raenabeauty.com'
To = 'shariar@raenabeauty.com'
#To = 'shariar@raenabeauty.com'
html = f'''
<!DOCTYPE html>
<html>
<body>
<div style="background-color:#eee;padding:10px 20px;">
<h2 style="font-family:Georgia, 'Times New Roman', Times, serif;color#454349;">Amazon Crawler Status</h2>
</div>
<div style="padding:20px 0px">
<div style="height: 800px;width:800px">
Error occurred. Please check the Amazon pipeline.
<div style="text-align:Left;">
<p>This is a system-generated mail. Please do not reply.</p>
</div>
</div>
</div>
</body>
</html>
'''
msg = EmailMessage()
msg['Subject'] = 'Amazon Crawler Status'
msg['From'] = From
msg['To'] = To
msg.set_content(html, subtype='html')
with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp:
smtp.ehlo()
smtp.starttls()
smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
smtp.send_message(msg)
except Exception as e:
logging.info("Error while sending mail: {}".format(e))
def main():
# start = datetime.now()
# categories = amazon_categories(config)
# categories.start_processing()
# end = datetime.now()
# logging.info('Total time taken to fetch the categories: {}'.format(str(end-start)))
#
# start = datetime.now()
# products = amazon_category_products(config)
# products.start_processing()
# end = datetime.now()
# logging.info('Total time taken to fetch the category products: {}'.format(str(end-start)))
product_info = amazon_products(config)
product_info.start_processing()
# ###### For test
# item = (100, 'raena_crawler_enginer_amazon', '3066', 'Up to 25 AED', 'https://www.amazon.ae/Ross-Massager-Shampoo-Silicone-Bristles/dp/B09JGH1WM3?ref_=Oct_d_oup_d_12149480031_0&pd_rd_w=lfMTW&content-id=amzn1.sym.d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_p=d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_r=C1QM2XCSJDBVMS27JV7E&pd_rd_wg=gkRZv&pd_rd_r=f5af13ee-c6c4-4d8a-8677-cba9cbacdace&pd_rd_i=B09JGH1WM3', '8f0540b5919e176303cf24a1d46b0e1c', 0)
# product_info.get_product_info(item)
if __name__ == "__main__":
logging.info("Starting Shopee Crawler.......")
try:
logging.info("Loading config file.......")
with open("conf.json", "r") as jsonfile:
config = json.load(jsonfile)
logging.info("Config file loaded.......")
print(config)
main()
except Exception as e:
logging.info("Error: ".format(e))
#logging.info("Cannot load config file. Please check. Exiting......")
send_mail()
exit(1)

amazon_db_writer.py Normal file

@@ -0,0 +1,589 @@
import logging
import psycopg2
###### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
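# Upsert-style writer: each rce_* method inserts a new row or updates the existing one, and mirrors every write into the corresponding aud_* audit table.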
class amazon_db_writer:
def __init__(self, config):
self.config = config
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
def __del__(self):
logging.info("Closing connection.....")
self.conn.close()
def rce_category(self, data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where category_name = '"+str(data['category_name'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
cat_name = data['category_name'].replace("'","''")
cat_url = data['category_page_url'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" (parent_category_id,rce_source_id," \
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name) values (" \
+str(data['parent_category_id'])+","+str(data['rce_source_id'])+", "+str(data['rce_source_category_id'])+", "+str(data['rce_source_status'])+", " \
"'"+str(cat_url)+"', '"+str(data['category_page_url_hash'])+"', '"+str(cat_name)+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
"select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
"category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
"where rce_source_category_id = "+ str(data['rce_source_category_id'])
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \
str(data['category_page_url'])==str(res[5]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set updatedat=now() " \
"where category_name = '"+ str(res[7])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set parent_category_id = " \
""+str(data['parent_category_id'])+", rce_source_category_id = "+str(data['rce_source_category_id'])+", " \
"category_name='"+str(cat_name)+"', category_page_url='"+str(cat_url)+"', " \
"category_page_url_hash='"+str(data['category_page_url_hash'])+"', updatedat=now() where " \
"category_name = '"+ str(res[7])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
"select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
"category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
"where category_name = '"+ str(res[7])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_product(self, data):
data['product_page_url'] = data['product_page_url'].replace("'","''")
data['rce_source_product_name'] = data['rce_source_product_name'].replace("'","''")
data['product_description'] = data['product_description'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url = '"+str(data['product_page_url'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" (rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"product_section,rce_source_id) values("+str(data['rce_source_product_id'])+","+str(data['rce_source_product_status'])+",'"+str(data['product_page_url'])+"'," \
"'"+str(data['product_page_url_hash'])+"',"+str(data['rce_category_id'])+","+str(data['rce_brand_id'])+","+str(data['rce_store_id'])+"," \
"'"+str(data['rce_source_product_name'])+"','"+str(data['product_images'])+"','"+str(data['product_description'])+"',"+str(data['product_sold_total'])+"," \
""+str(data['product_sold'])+",'"+str(data['product_price_min'])+"','"+str(data['product_price_min_before_discount'])+"','"+str(data['product_price_max'])+"'," \
"'"+str(data['product_price_max_before_discount'])+"','"+str(data['ratings'])+"','"+str(data['product_section'])+"',"+str(data['rce_source_id'])+")"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
"product_page_url='"+str(data['product_page_url'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \
str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \
str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \
str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \
str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \
str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) \
and str(data['rce_source_id'])==str(res[21]) \
and str(data['product_section'])==str(res[22]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set updatedat=now() " \
"where product_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set rce_source_product_id="+str(data['rce_source_product_id'])+"," \
"rce_source_product_status="+str(data['rce_source_product_status'])+",product_page_url='"+str(data['product_page_url'])+"',product_page_url_hash= " \
"'"+str(data['product_page_url_hash'])+"',rce_category_id="+str(data['rce_category_id'])+",rce_brand_id="+str(data['rce_brand_id'])+"," \
"rce_store_id="+str(data['rce_store_id'])+",rce_source_product_name='"+str(data['rce_source_product_name'])+"',product_images='"+str(data['product_images'])+"'" \
",product_description='"+str(data['product_description'])+"',product_sold_total="+str(data['product_sold_total'])+",product_sold="+str(data['product_sold'])+"," \
"product_price_min='"+str(data['product_price_min'])+"',product_price_min_before_discount='"+str(data['product_price_min_before_discount'])+"'," \
"product_price_max='"+str(data['product_price_max'])+"',product_price_max_before_discount='"+str(data['product_price_max_before_discount'])+"',ratings='"+str(data['ratings'])+"'," \
"product_section='"+str(data['product_section'])+"', updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where product_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
"product_page_url='"+str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_product_variant(self, data):
data['product_variant_name'] = data['product_variant_name'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name = '"+str(data['product_variant_name'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" (rce_source_variant_id,rce_product_id," \
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock) values("+str(data['rce_source_variant_id'])+"," \
""+str(data['rce_product_id'])+",'"+str(data['product_variant_name'])+"','"+str(data['product_variant_price'])+"'," \
"'"+str(data['product_variant_price_before_discount'])+"',"+str(data['product_variant_stock'])+")"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(data['product_variant_name'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \
str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set updatedat=now() " \
"where product_variant_name = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set rce_source_variant_id="+str(data['rce_source_variant_id'])+", " \
"rce_product_id="+str(data['rce_product_id'])+", product_variant_name='"+str(data['product_variant_name'])+"', product_variant_price=" \
"'"+str(data['product_variant_price'])+"',product_variant_price_before_discount='"+str(data['product_variant_price_before_discount'])+"'," \
"product_variant_stock="+str(data['product_variant_stock'])+", updatedat=now() where product_variant_name = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_brand(self, data):
data['brand_page_url'] = data['brand_page_url'].replace("'","''")
data['brand_name'] = data['brand_name'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url = '"+str(data['brand_page_url'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" (rce_source_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name) values("+str(data['rce_source_id'])+"," \
""+str(data['rce_source_brand_status'])+",'"+str(data['brand_page_url'])+"','"+str(data['brand_page_url_hash'])+"'," \
"'"+str(data['brand_name'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(data['brand_page_url'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \
str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set updatedat=now() " \
"where brand_page_url = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
"rce_source_brand_status="+str(data['rce_source_brand_status'])+", brand_page_url='"+str(data['brand_page_url'])+"', brand_page_url_hash=" \
"'"+str(data['brand_page_url_hash'])+"',brand_name='"+str(data['brand_name'])+"', updatedat=now() where brand_page_url = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status, " \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_reseller(self, data):
data['reseller_name'] = data['reseller_name'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" (rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description) values("+str(data['rce_source_id'])+"," \
""+str(data['rce_source_reseller_status'])+",'"+str(data['reseller_name'])+"','"+str(data['reseller_average_rating'])+"'," \
"'"+str(data['reseller_description'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(data['reseller_name'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]) and \
str(data['reseller_average_rating'])==str(res[5]) and str(data['reseller_description'])==str(res[6]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=now() " \
"where reseller_name = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
"rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \
"'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=now() where reseller_name = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_reseller_store(self, data):
data['store_page_url'] = data['store_page_url'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" (rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,rce_source_id) values(" \
""+str(data['rce_source_store_status'])+",'"+str(data['store_page_url'])+"','"+str(data['store_page_url_hash'])+"'," \
"'"+str(data['store_location'])+"', "+str(data['rce_reseller_id'])+", "+str(data['rce_source_id'])+")"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(data['store_page_url'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \
str(data['store_page_url_hash'])==str(res[4]) and str(data['store_location'])==str(res[5]) and \
str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=now() " \
"where store_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \
"rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \
"'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \
"updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews(self, data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id = "+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
data['username'] = data['username'].replace("'","''")
data['img_url'] = data['img_url'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" (id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating) values("+str(data['id'])+","+str(data['rce_product_id'])+"," \
"'"+str(data['username'])+"','"+str(data['review'])+"','"+str(data['img_url'])+"',"+str(data['review_like_count'])+",'"+str(data['user_tier'])+"'," \
""+str(data['shop_id'])+", '"+str(data['video_url'])+"', '"+str(data['rating'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \
str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \
str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=now() " \
"where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \
"username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \
"'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \
"shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=now() " \
"where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(res[1])+" and username ='"+str(data['username'])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews_productmodels(self,data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id'])
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \
"values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=now() " \
"where rce_rating_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \
"updatedat=now() where rce_source_store_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+""
#logging.info(sql)
self.cur.execute(sql)
def rce_tags(self,data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \
"values("+str(data['id'])+",'"+str(data['description'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
"createdat,updatedat) select id,description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['description'])==str(res[1]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=now() " \
"where description = '"+ str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \
"updatedat=now() where description = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
"createdat,updatedat) select id,description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews_producttags(self,data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \
"values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
"createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_rating_id'])==str(res[1]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=now() " \
"where rce_rating_id = '"+ str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \
"updatedat=now() where rce_rating_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
"createdat,updatedat) select id,description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)

amazon_products_adhoc.py Normal file

@@ -0,0 +1,174 @@
import hashlib
import logging
import random
import sys
import string
#from selenium import webdriver
import undetected_chromedriver as webdriver
from selenium.webdriver.common.by import By
import psycopg2
import time
import re
from amazon_db_writer import amazon_db_writer
from datetime import datetime
from pyvirtualdisplay import Display
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
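# Ad-hoc re-crawl: re-opens product pages whose price columns are empty and refreshes the pricing fields via amazon_db_writer.rce_product.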
class amazon_products_adhoc:
def __init__(self, config):
self.config = config
self.crawler_name = self.config.get("crawler_name")
self.pattern = r'[' + string.punctuation + ']'
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
sql = f"""select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where rce_source_id=66 and product_price_min= '' order by id desc"""
self.cur.execute(sql)
self.items = self.cur.fetchall()
self.db_writer = amazon_db_writer(config)
#self.display = Display(visible=0, size=(800, 600))
#self.display.start()
def __del__(self):
print("Closing connection.....")
self.conn.close()
#self.display.stop()
def start_processing(self):
op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.add_argument("--lang=en-GB")
op.add_argument('--user-data-dir=/home/ec2-user/chrome_cache/')
driver=webdriver.Chrome(options=op)
count = 0
for item in self.items:
count += 1
try:
logging.info("============== Getting info for {}/{}: {} ================".format(str(count),str(len(self.items)),str(item)))
start = datetime.now()
driver.get(item[3])
self.product_info(driver, item)
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1 where product_page_url_hash='{item[4]}'
"""
self.cur.execute(sql)
end = datetime.now()
logging.info('Total time taken to fetch the product: {}'.format(str(end-start)))
time.sleep(5)
except Exception as e:
print(e)
driver.close()
def product_info(self, driver, item):
data_product = {}
data_product['rce_source_product_id'] = item[1]
data_product['rce_source_id'] = item[21]
data_product['rce_source_product_status'] = item[2]
data_product['product_page_url'] = item[3]
data_product['product_page_url_hash'] = item[4]
data_product['rce_category_id'] = item[5]
data_product['rce_brand_id'] = item[6]
data_product['rce_store_id'] = item[7]
data_product['rce_source_product_name'] = item[8]
data_product['product_images'] = item[9]
data_product['product_description'] = item[10]
data_product['product_sold_total'] = item[11]
data_product['product_sold'] = item[12]
data_product['product_price_min'] = item[13]
data_product['product_price_min_before_discount'] =item[14]
data_product['product_price_max'] = item[15]
data_product['product_price_max_before_discount'] = item[16]
data_product['ratings'] = item[17]
data_product['product_section'] = item[22]
# try:
# data_product['product_price_min'] = (driver.find_element(By.CSS_SELECTOR, '#corePrice_desktop > div > table > tbody > tr:nth-child(2) > td.a-span12 > span.a-price.a-text-price.a-size-medium.apexPriceToPay > span:nth-child(2)').text).replace('AED', '')
# data_product['product_price_max'] = data_product['product_price_min']
#
# except:
#
# try:
# price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
# price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
#
# price = price_whole+"."+price_fraction
# data_product['product_price_min'] = price
# data_product['product_price_max'] = price
# except:
# try:
# data_product['product_price_min'] =(driver.find_element(By.CSS_SELECTOR, '#sns-base-price > div > span.a-price.a-text-price.a-size-medium.apexPriceToPay > span:nth-child(2)').text).replace('AED','')
# data_product['product_price_max'] = data_product['product_price_min']
# except:
# data_product['product_price_min'] = (driver.find_element(By.CSS_SELECTOR, '#sns-base-price').text).replace('AED','')
# data_product['product_price_max'] = data_product['product_price_min']
# pass
# pass
#
# pass
try:
data_product['product_price_min'] = (driver.find_element(By.CSS_SELECTOR, '#sns-base-price').text).replace('AED', '')
data_product['product_price_max'] = data_product['product_price_min']
except:
price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
price = price_whole+"."+price_fraction
data_product['product_price_min'] = price
data_product['product_price_max'] = price
pass
print("product_price_min: {}".format(data_product['product_price_min']))
try:
data_product['product_price_min_before_discount'] = (driver.find_element(By.CSS_SELECTOR, '.a-text-price').text).replace('AED', '')
data_product['product_price_max_before_discount'] = data_product['product_price_min_before_discount']
except:
pass
try:
self.db_writer.rce_product(data_product)
except Exception as e:
logging.info(e)
config = {
"crawler_name": "raena_crawler_enginer_amazon",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
"review_producttags_tab": "rce_ratings_reviews_producttags",
"review_tags": "rce_tags",
"source_tab": "rce_source",
"product_per_category": "1000",
"source_category": "11043145",
"db_user": "dbadmin",
"db_pass": "5qCif6eyY3Kmg4z",
"database": "analytics",
"db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
"db_port": "5432",
"crawler_main": "1",
"crawler_slave_no": ""
}
amazon_products_adhoc = amazon_products_adhoc(config)
amazon_products_adhoc.start_processing()

amazon_products.py Normal file

@@ -0,0 +1,516 @@
import hashlib
import logging
import random
import sys
import string
import undetected_chromedriver as webdriver
from selenium.webdriver.common.by import By
import psycopg2
import time
import re
from amazon_db_writer import amazon_db_writer
from datetime import datetime
from pyvirtualdisplay import Display
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
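# Main product crawler: iterates over unprocessed tracker rows (flag=0) and scrapes product, reseller, store and brand details for each URL.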
class amazon_products:
def __init__(self, config):
self.config = config
self.crawler_name = self.config.get("crawler_name")
self.pattern = r'[' + string.punctuation + ']'
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Amazon'")
self.rce_source_id = self.cur.fetchone()[0]
self.cur.execute("select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='raena_crawler_enginer_amazon' and flag=0 order by id")
self.items = self.cur.fetchall()
self.db_writer = amazon_db_writer(config)
#self.display = Display(visible=0, size=(800, 600))
#self.display.start()
def __del__(self):
print("Closing connection.....")
self.conn.close()
#self.display.stop()
def start_processing(self):
count = 0
for item in self.items:
count += 1
try:
logging.info("============== Getting info for {}/{}: {} ================".format(str(count),str(len(self.items)),str(item)))
start = datetime.now()
self.get_product_info(item)
end = datetime.now()
logging.info('Total time taken to fetch the product: {}'.format(str(end-start)))
except Exception as e:
print(e)
def reseller_info(self, driver):
try:
store_urls = []
try:
driver.find_element(By.CSS_SELECTOR, '.a-icon.a-icon-arrow.a-icon-small.arrow-icon').click()
time.sleep(5)
offers = driver.find_elements(By.CSS_SELECTOR, '#aod-offer-soldBy')
for offer in offers:
try:
store_url = offer.find_element(By.CSS_SELECTOR, '.a-fixed-left-grid-col.a-col-right').find_element(By.TAG_NAME, 'a').get_attribute('href')
store_urls.append(store_url)
except:
pass
except:
try:
store_url = driver.find_element(By.CSS_SELECTOR, '#sellerProfileTriggerId').get_attribute('href')
store_urls.append(store_url)
except:
pass
pass
if store_urls:
store_urls = list(set(store_urls))
return_item = ""
flag = 0
for store_url in store_urls:
driver.get(store_url)
driver.implicitly_wait(5)
##### reseller info
data_reseller = {}
data_reseller['rce_source_id'] = self.rce_source_id
data_reseller['rce_source_reseller_status'] = 1
data_reseller['reseller_name'] = ""
data_reseller['reseller_average_rating'] = 0.0
data_reseller['reseller_description'] = ""
try:
data_reseller['reseller_name'] = driver.find_element(By.CSS_SELECTOR,'#seller-name').text
data_reseller['reseller_name'] = data_reseller['reseller_name'].replace("'","")
except:
pass
try:
# Yearly average rating; stays at the default 0.0 when the seller page does not show one.
data_reseller['reseller_average_rating'] = float(driver.find_element(By.CSS_SELECTOR,'#effective-timeperiod-rating-year-description').text)
except:
pass
try:
data_reseller['reseller_description'] = driver.find_element(By.CSS_SELECTOR, '#spp-expander-about-seller .a-row').text
data_reseller['reseller_description'] = data_reseller['reseller_description'].replace("'","")
except:
pass
try:
self.db_writer.rce_reseller(data_reseller)
except Exception as e:
logging.info(e)
##### Store info
data_reseller_store = {}
data_reseller_store['rce_source_store_status'] = 1
data_reseller_store['store_page_url'] = store_url
data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest()
data_reseller_store['store_location'] = ""
data_reseller_store['rce_reseller_id'] = ""
data_reseller_store['rce_source_id'] = self.rce_source_id
try:
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data_reseller['reseller_name'])+"'")
rce_reseller_id = self.cur.fetchone()
data_reseller_store['rce_reseller_id'] = rce_reseller_id[0]
if flag == 0:
return_item = data_reseller_store['rce_reseller_id']
flag = 1
except:
pass
try:
self.db_writer.rce_reseller_store(data_reseller_store)
except Exception as e:
logging.info(e)
time.sleep(2)
else:
##### reseller info
data_reseller = {}
data_reseller['rce_source_id'] = self.rce_source_id
data_reseller['rce_source_reseller_status'] = 1
data_reseller['reseller_name'] = "Amazon.ae"
data_reseller['reseller_average_rating'] = 0.0
data_reseller['reseller_description'] = ""
try:
self.db_writer.rce_reseller(data_reseller)
except Exception as e:
logging.info(e)
##### Store info
data_reseller_store = {}
data_reseller_store['rce_source_store_status'] = 1
data_reseller_store['store_page_url'] = "amazon.ae"
data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest()
data_reseller_store['store_location'] = ""
data_reseller_store['rce_reseller_id'] = ""
data_reseller_store['rce_source_id'] = self.rce_source_id
try:
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data_reseller['reseller_name'])+"'")
rce_reseller_id = self.cur.fetchone()
data_reseller_store['rce_reseller_id'] = rce_reseller_id[0]
return_item = data_reseller_store['rce_reseller_id']
except:
pass
try:
self.db_writer.rce_reseller_store(data_reseller_store)
except Exception as e:
logging.info(e)
return return_item
except Exception as e:
print(e)
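# brand_info: reads the brand byline link and the brand name from the product overview table, writes the brand row
# and returns the brand name so the product record can resolve its rce_brand_id.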
def brand_info(self, driver):
data_brand = {}
data_brand['rce_source_id'] = self.rce_source_id
data_brand['rce_source_brand_status'] = 1
data_brand['brand_page_url'] = ""
data_brand['brand_page_url_hash'] = ""
data_brand['brand_name'] = ""
try:
data_brand['brand_page_url'] = driver.find_element(By.CSS_SELECTOR, '#bylineInfo').get_attribute('href')
data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest()
try:
data_brand['brand_name'] = driver.find_element(By.CSS_SELECTOR, '.po-brand .po-break-word').text
except:
pass
try:
self.db_writer.rce_brand(data_brand)
except Exception as e:
logging.info(e)
return data_brand['brand_name']
except:
pass
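# product_info: builds the rce_product record (name, images, description, prices, rating) from the product page,
# resolves the brand and store ids from the database, writes the product, then iterates over any image-swatch
# variants and writes one rce_product_variant row per variant.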
def product_info(self, driver, category, keyword, url, url_hash, brand_name, rce_reseller_id):
data_product = {}
data_product['rce_source_product_id'] = 0
data_product['rce_source_id'] = self.rce_source_id
data_product['rce_source_product_status'] = 1
data_product['product_page_url'] = url.replace("'","''")
data_product['product_page_url_hash'] = url_hash
data_product['rce_category_id'] = category
data_product['rce_brand_id'] = ""
data_product['rce_store_id'] = ""
data_product['rce_source_product_name'] = ""
data_product['product_images'] = ""
data_product['product_description'] = ""
data_product['product_sold_total'] = 0
data_product['product_sold'] = 0
data_product['product_price_min'] = ""
data_product['product_price_min_before_discount'] =""
data_product['product_price_max'] = ""
data_product['product_price_max_before_discount'] = ""
data_product['ratings'] = 0.0
data_product['product_section'] = keyword
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_name = '"+str(brand_name)+"'"
self.cur.execute(sql)
data_product['rce_brand_id'] = self.cur.fetchone()[0]
except: pass
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+""
self.cur.execute(sql)
data_product['rce_store_id'] = self.cur.fetchone()[0]
except: pass
try:
rce_source_product_name = driver.find_element(By.CSS_SELECTOR,'#productTitle').text
data_product['rce_source_product_name'] = str(re.sub(self.pattern, '', rce_source_product_name)).replace("'","''")
except: pass
try:
product_images_element = driver.find_element(By.CSS_SELECTOR, '#magnifierLens')
product_images_raw = product_images_element.find_elements(By.TAG_NAME, 'img')
product_images = []
for product_image in product_images_raw:
url = product_image.get_attribute('src')
product_images.append(url)
data_product['product_images'] = str(product_images)
except: pass
try:
description = ""
des_rank = ""
try:
des_raws = driver.find_element(By.CSS_SELECTOR, '.a-unordered-list.a-vertical.a-spacing-mini').find_elements(By.CSS_SELECTOR, '.a-list-item')
for des_raw in des_raws:
try:
des = des_raw.text
description += des
except:
pass
except:
pass
try:
des_rank = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[6]/div[24]/div/ul[1]').find_element(By.CSS_SELECTOR, '.a-list-item').text
except:
pass
data_product['product_description'] = description+des_rank
except:
pass
try:
# Displayed pay price (whole + fraction); the same value is used for both the min and max price fields.
price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
price = price_whole+"."+price_fraction
data_product['product_price_min'] = price
data_product['product_price_max'] = price
except:
pass
try:
data_product['product_price_min_before_discount'] = (driver.find_element(By.CSS_SELECTOR, '.a-text-price').text).replace('AED', '')
data_product['product_price_max_before_discount'] = data_product['product_price_min_before_discount']
except:
pass
try:
data_product['ratings'] = driver.find_element(By.CSS_SELECTOR, '#averageCustomerReviews .a-color-base').text
except:
pass
try:
self.db_writer.rce_product(data_product)
except Exception as e:
logging.info(e)
### rce_product_variant
try:
is_variant = driver.find_element(By.CSS_SELECTOR, '.a-unordered-list.a-nostyle.a-button-list.a-declarative.a-button-toggle-group.a-horizontal.a-spacing-top-micro.swatches.swatchesSquare.imageSwatches')
if is_variant:
variants = is_variant.find_elements(By.TAG_NAME, 'li')
#random.shuffle(variants)
for variant in variants:
variant.click()
data_variant = {}
data_variant['rce_source_variant_id'] = 0
data_variant['rce_product_id'] = ""
data_variant['product_variant_name'] = ""
data_variant['product_variant_price'] = ""
data_variant['product_variant_price_before_discount'] = ""
data_variant['product_variant_stock'] = 0
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_name = '"+str(data_product['rce_source_product_name'])+"'"
self.cur.execute(sql)
data_variant['rce_product_id'] = self.cur.fetchone()[0]
except:
pass
try:
product_variant_name = driver.find_element(By.CSS_SELECTOR,'#productTitle').text
data_variant['product_variant_name'] = str(re.sub(self.pattern, '', product_variant_name)).replace("'","''")
except: pass
try:
d_price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
d_price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
price = d_price_whole+"."+d_price_fraction
data_variant['product_variant_price'] = price
except:
pass
try:
data_variant['product_variant_price_before_discount'] = (driver.find_element(By.CSS_SELECTOR, '.a-text-price').text).replace('AED', '')
except:
pass
try:
self.db_writer.rce_product_variant(data_variant)
except Exception as e:
logging.info(e)
time.sleep(random.randint(2,5))
else:
logging.info('No variant found')
except:
logging.info('No variant found')
pass
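# rating_info: opens the "see all reviews" page and stores each review (username, text, star rating) against the
# product and shop ids resolved from the database.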
def rating_info(self, driver, rce_reseller_id, url_hash):
try:
driver.find_element(By.CSS_SELECTOR, '#reviews-medley-footer .a-link-emphasis').click()
driver.implicitly_wait(5)
data_reviews = driver.find_elements(By.CSS_SELECTOR, '.a-section.review.aok-relative')
for data in data_reviews:
data_review = {}
data_review["id"] = ""
data_review["rce_product_id"] = ""
data_review["username"] = ""
data_review["review"] = ""
data_review["img_url"] = ""
data_review["review_like_count"] = 0
data_review["user_tier"] = ""
data_review["shop_id"] = 0
data_review["video_url"] = ""
data_review["rating"] = ""
try:
sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')
self.cur.execute(sql)
rating_id = self.cur.fetchone()
if rating_id[0]==None:
rating_id = 1
else:
rating_id = int(rating_id[0]) + 1
data_review["id"] = rating_id
except:
pass
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url_hash = '"+str(url_hash)+"'"
self.cur.execute(sql)
data_review["rce_product_id"] = self.cur.fetchone()[0]
except: pass
try: data_review["username"] = data.find_element(By.CSS_SELECTOR, '.a-profile-name').text
except: pass
try:
data_review["review"] = data.find_element(By.CSS_SELECTOR, '.a-size-base.review-text.review-text-content').text
data_review["review"] = data_review["review"].replace("'","")
except: pass
try:
rating = data.find_element(By.CSS_SELECTOR, '.a-icon.a-icon-star.review-rating .a-icon-alt').get_attribute("textContent")
data_review["rating"] = rating.replace(' out of 5 stars', '')
except: pass
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+""
self.cur.execute(sql)
data_review["shop_id"] = self.cur.fetchone()[0]
except: pass
try:
self.db_writer.rce_ratings_reviews(data_review)
except Exception as e:
logging.info(e)
except:
pass
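# get_product_info: drives one tracker row end to end - open the product URL, collect reseller, brand, product and
# review data, then mark the tracker row as processed (flag = 1).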
def get_product_info(self,item):
try:
op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.add_argument("--lang=en-GB")
op.add_argument('--user-data-dir=/home/ec2-user/chrome_cache/')
#op.headless = True
driver=webdriver.Chrome(options=op)
try:
driver.get('https://www.amazon.ae')
time.sleep(3)
except Exception as e:
print(e)
##### Reseller info #####
driver.get(item[4])
driver.implicitly_wait(5)
rce_reseller_id = self.reseller_info(driver)
##### Product Info #####
driver.get(item[4])
driver.implicitly_wait(5)
##### Brand Info
brand_name = self.brand_info(driver)
##### Product info
self.product_info(driver, item[2], item[3], item[4], item[5], brand_name, rce_reseller_id)
##### Rating Info #####
driver.get(item[4])
driver.implicitly_wait(5)
self.rating_info(driver, rce_reseller_id, item[5])
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1 where product_page_url_hash='{item[5]}'
"""
self.cur.execute(sql)
driver.close()
except Exception as e:
print(e)
driver.close()

amazon_crawler_engine/conf.json Executable file

@ -0,0 +1,25 @@
{
"crawler_name": "raena_crawler_enginer_amazon",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
"review_producttags_tab": "rce_ratings_reviews_producttags",
"review_tags": "rce_tags",
"source_tab": "rce_source",
"product_per_category": "1000",
"source_category": "11043145",
"db_user": "dbadmin",
"db_pass": "5qCif6eyY3Kmg4z",
"database": "analytics",
"db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
"db_port": "5432",
"crawler_main": "1",
"crawler_slave_no": ""
}


@ -0,0 +1,44 @@
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.add_argument("--lang=en-GB")
#op.headless = True
driver=webdriver.Chrome( options=op)
driver.get('https://www.noon.com/uae-en/beauty/')
time.sleep(10)
element = driver.find_element(By.CSS_SELECTOR, '.componentArea-9')
title = element.find_element(By.CSS_SELECTOR, '.truncate-title-header').text
products = element.find_elements(By.CSS_SELECTOR, '.sc-kCMKrZ.ealOXE')
urls = []
for product in products:
url = product.find_element(By.TAG_NAME, 'a').get_attribute('href')
urls.append(url)
data = {
"title": title,
"products": urls
}
print(data)
driver.close()


@ -0,0 +1,83 @@
import hashlib
import logging
import sys
import string
import undetected_chromedriver as webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import psycopg2
import bs4
from webdriver_manager.chrome import ChromeDriverManager
import random
from bs4 import BeautifulSoup
import json
import time
import gzip
import re
import random
from amazon_db_writer import amazon_db_writer
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
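# Standalone helper for ad-hoc testing of the reseller selectors: opens a seller profile page and prints the
# yearly average rating text, using the local database config defined below.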
def reseller_info(store_url):
op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.add_argument("--lang=en-GB")
#op.headless = True
driver=webdriver.Chrome( options=op)
driver.get(store_url)
driver.implicitly_wait(5)
try:
driver.get(store_url)
driver.implicitly_wait(5)
##### reseller info
avg_rating = driver.find_element(By.CSS_SELECTOR,'#effective-timeperiod-rating-year-description.ratings-reviews').text
print(avg_rating)
except Exception as e:
print(e)
config = {
"crawler_name": "raena_crawler_enginer_amazon",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
"review_producttags_tab": "rce_ratings_reviews_producttags",
"review_tags": "rce_tags",
"source_tab": "rce_source",
"product_per_category": "1000",
"source_category": "11043145",
"db_user": "postgres",
"db_pass": "postgres",
"database": "postgres",
"db_host": "localhost",
"db_port": "5444",
"crawler_main": "1",
"crawler_slave_no": ""
}
conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
conn.autocommit = True
cur = conn.cursor()
db_writer = amazon_db_writer(config)
reseller_info('https://www.amazon.ae/sp?ie=UTF8&seller=A3TFGX22P341AN&isAmazonFulfilled=0&asin=B09BR31PF9&ref_=olp_merch_name_1')

File diff suppressed because one or more lines are too long


@ -0,0 +1,77 @@
import hashlib
from amazon_db_writer import amazon_db_writer
config = {
"crawler_name": "raena_crawler_enginer_amazon",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
"review_producttags_tab": "rce_ratings_reviews_producttags",
"review_tags": "rce_tags",
"source_tab": "rce_source",
"product_per_category": "1000",
"source_category": "11043145",
"db_user": "postgres",
"db_pass": "postgres",
"database": "postgres",
"db_host": "localhost",
"db_port": "5444",
"crawler_main": "1",
"crawler_slave_no": ""
}
db_writer = amazon_db_writer(config)
data_product = {}
data_product['rce_source_product_id'] = 0
data_product['rce_source_id'] = 1
data_product['rce_source_product_status'] = 1
data_product['product_page_url'] = 'https://www.amazon.ae/Davidoff-Water-Perfume-Toilette-110ML/dp/B002S8PT8U/?_encoding=UTF8&pd_rd_w=VQ6dh&content-id=amzn1.sym.baa1fbbd-9373-444b-8104-61fa134741c5%3Aamzn1.symc.36bd837a-d66d-47d1-8457-ffe9a9f3ddab&pf_rd_p=baa1fbbd-9373-444b-8104-61fa134741c5&pf_rd_r=6EKKA9QC40Y5MFKGRWYQ&pd_rd_wg=nsmjm&pd_rd_r=6d02ccd2-297c-4b73-8586-a9ac9b355d4a&ref_=pd_gw_ci_mcx_mr_hp_atf_m'
data_product['product_page_url_hash'] = 'bjhgfds867ty3iuhbfew'
data_product['rce_category_id'] = 3
data_product['rce_brand_id'] = 2
data_product['rce_store_id'] = 6
data_product['rce_source_product_name'] = "Hot Water by Davidoff for Men"
data_product['product_images'] = ""
data_product['product_description'] = "Davidoff Hot Water hits you first with its fresh spicy aroma owing to the vegetal top notes of wormwood and basil. While the o"
data_product['product_sold_total'] = 0
data_product['product_sold'] = 0
data_product['product_price_min'] = "99.00"
data_product['product_price_min_before_discount'] ="340.00"
data_product['product_price_max'] = "99.00"
data_product['product_price_max_before_discount'] = "340.00"
data_product['ratings'] = 4.1
data_product['product_section'] = "Fragrance"
data_variant = {}
data_variant['rce_source_variant_id'] = 0
data_variant['rce_product_id'] = 2
data_variant['product_variant_name'] = "abc"
data_variant['product_variant_price'] = "67.3"
data_variant['product_variant_price_before_discount'] = "100.90"
data_variant['product_variant_stock'] = 0
data_review = {}
data_review["id"] = 1
data_review["rce_product_id"] = 5
data_review["username"] = "adnan"
data_review["review"] = "very good product"
data_review["img_url"] = ""
data_review["review_like_count"] = 0
data_review["user_tier"] = ""
data_review["shop_id"] = 2
data_review["video_url"] = ""
data_review["rating"] = "4.9"
db_writer.rce_ratings_reviews(data_review)


@ -0,0 +1,9 @@
1. Log into Facebook and go to the group from which you want to export the members.
2. Navigate to the “Members“ tab.
3. Open the developer console in Chrome and paste the code from "chrome_group_export".
4. Paste the code from "chrome_auto_scroll" to auto-scroll the page.
5. Download and save the file once the limit (10K) is reached.
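The two snippets that follow are the helpers referenced above: first the "chrome_auto_scroll" loop that keeps scrolling the member list until it reaches the bottom, then the minified "chrome_group_export" exporter that hooks Facebook's GraphQL responses and downloads the collected members as a CSV file.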


@ -0,0 +1,37 @@
(function() {
var intervalObj = null;
var retry = 0;
var clickHandler = function() {
console.log("Clicked; stopping autoscroll");
clearInterval(intervalObj);
document.body.removeEventListener("click", clickHandler);
}
function scrollDown() {
var scrollHeight = document.body.scrollHeight,
scrollTop = document.body.scrollTop,
innerHeight = window.innerHeight,
difference = (scrollHeight - scrollTop) - innerHeight
if (difference > 0) {
window.scrollBy(0, difference);
if (retry > 0) {
retry = 0;
}
console.log("scrolling down more");
} else {
if (retry >= 3) {
console.log("reached bottom of page; stopping");
clearInterval(intervalObj);
document.body.removeEventListener("click", clickHandler);
} else {
console.log("[apparenty] hit bottom of page; retrying: " + (retry + 1));
retry++;
}
}
}
document.body.addEventListener("click", clickHandler);
intervalObj = setInterval(scrollDown, 1000);
})()


@ -0,0 +1 @@
function exportToCsv(e,t){for(var n="",o=0;o<t.length;o++)n+=function(e){for(var t="",n=0;n<e.length;n++){var o=null===e[n]||void 0===e[n]?"":e[n].toString(),o=(o=e[n]instanceof Date?e[n].toLocaleString():o).replace(/"/g,'""');0<n&&(t+=","),t+=o=0<=o.search(/("|,|\n)/g)?'"'+o+'"':o}return t+"\n"}(t[o]);var r=new Blob([n],{type:"text/csv;charset=utf-8;"}),i=document.createElement("a");void 0!==i.download&&(r=URL.createObjectURL(r),i.setAttribute("href",r),i.setAttribute("download",e),document.body.appendChild(i),i.click(),document.body.removeChild(i))}function buildCTABtn(){var e=document.createElement("div"),t=(e.setAttribute("style",["position: fixed;","top: 0;","left: 0;","z-index: 10;","width: 100%;","height: 100%;","pointer-events: none;"].join("")),document.createElement("div")),n=(t.setAttribute("style",["position: absolute;","bottom: 30px;","right: 130px;","color: white;","min-width: 150px;","background: var(--primary-button-background);","border-radius: var(--button-corner-radius);","padding: 0px 12px;","cursor: pointer;","font-weight:600;","font-size:15px;","display: inline-flex;","pointer-events: auto;","height: 36px;","align-items: center;","justify-content: center;"].join("")),document.createTextNode("Download ")),o=document.createElement("span"),r=(o.setAttribute("id","fb-group-scraper-number-tracker"),o.textContent="0",document.createTextNode(" members"));return t.appendChild(n),t.appendChild(o),t.appendChild(r),t.addEventListener("click",function(){var e=(new Date).toISOString();exportToCsv("groupMemberExport-".concat(e,".csv"),window.members_list)}),e.appendChild(t),document.body.appendChild(e),e}function processResponse(e){var t;if(null!==(n=null==e?void 0:e.data)&&void 0!==n&&n.group)o=e.data.group;else{if("Group"!==(null===(n=null===(n=null==e?void 0:e.data)||void 0===n?void 0:n.node)||void 0===n?void 0:n.__typename))return;o=e.data.node}if(null!==(n=null==o?void 0:o.new_members)&&void 0!==n&&n.edges)t=o.new_members.edges;else{if(null===(e=null==o?void 0:o.new_forum_members)||void 0===e||!e.edges)return;t=o.new_forum_members.edges}var n=t.map(function(e){var t=e.node,n=t.id,o=t.name,r=t.bio_text,i=t.url,d=t.profile_picture,t=t.__isProfile,s=(null===(s=null==e?void 0:e.join_status_text)||void 0===s?void 0:s.text)||(null===(s=null===(s=null==e?void 0:e.membership)||void 0===s?void 0:s.join_status_text)||void 0===s?void 0:s.text),e=null===(e=e.node.group_membership)||void 0===e?void 0:e.associated_group.id;return[n,o,i,(null==r?void 0:r.text)||"",(null==d?void 0:d.uri)||"",e,s||"",t]}),o=((e=window.members_list).push.apply(e,n),document.getElementById("fb-group-scraper-number-tracker"));o&&(o.textContent=window.members_list.length.toString())}function parseResponse(e){var n=[];try{n.push(JSON.parse(e))}catch(t){var o=e.split("\n");if(o.length<=1)return void console.error("Fail to parse API response",t);for(var r=0;r<o.length;r++){var i=o[r];try{n.push(JSON.parse(i))}catch(e){console.error("Fail to parse API response",t)}}}for(var t=0;t<n.length;t++)processResponse(n[t])}function main(){buildCTABtn();var e=XMLHttpRequest.prototype.send;XMLHttpRequest.prototype.send=function(){this.addEventListener("readystatechange",function(){this.responseURL.includes("/api/graphql/")&&4===this.readyState&&parseResponse(this.responseText)},!1),e.apply(this,arguments)}}window.members_list=window.members_list||[["ProfileId","FulName","ProfileLink","Bio","ImageSrc","GroupId","GroupJoining","ProfileType"]],main();

noon_crawler_engine/conf.json Executable file

@ -0,0 +1,25 @@
{
"crawler_name": "raena_crawler_enginer_noon",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker_noon",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
"review_producttags_tab": "rce_ratings_reviews_producttags",
"review_tags": "rce_tags",
"source_tab": "rce_source",
"product_per_category": "1000",
"source_category": "11043145",
"db_user": "dbadmin",
"db_pass": "5qCif6eyY3Kmg4z",
"database": "analytics",
"db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
"db_port": "5432",
"crawler_main": "1",
"crawler_slave_no": ""
}


@ -0,0 +1,194 @@
import hashlib
import logging
import undetected_chromedriver as webdriver
import psycopg2
from selenium.webdriver.common.by import By
from pyvirtualdisplay import Display
from amazon_db_writer import amazon_db_writer
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
class amazon_categories:
def __init__(self, config):
self.config = config
self.crawler_name = self.config.get("crawler_name")
self.url = "https://www.amazon.ae/s?rh=n%3A11497859031&ref=lp_11497860031_sar"
self.product_limit = int(self.config.get("product_per_category"))
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Amazon'")
try : self.rce_source_id = self.cur.fetchone()[0]
except:
logging.info("Source tab is empty. Please check. Exiting.....")
exit(1)
self.db_writer = amazon_db_writer(config)
#self.display = Display(visible=0, size=(800, 600))
#self.display.start()
def __del__(self):
print("Closing connection.....")
self.conn.close()
#self.display.stop()
def start_processing(self):
op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.add_argument("--lang=en-GB")
#op.headless = True
#driver=webdriver.Chrome(version_main = 113, options=op)
driver=webdriver.Chrome(options=op)
driver.get(self.url)
driver.implicitly_wait(10)
self.get_categories(driver)
driver.close()
def get_categories(self, driver):
#element = driver.find_elements(By.CSS_SELECTOR,'.bxc-grid__container.bxc-grid__container--width-1500.bxc-grid__mp-gutter-layout')
#sub_cats = element[0].find_elements(By.CSS_SELECTOR,'.bxc-grid__image.bxc-grid__image--light')
sub_cats = driver.find_elements(By.CSS_SELECTOR,'.bxc-grid__image.bxc-grid__image--light')
names = ['Perfumes', 'Skin care', 'Hair care', 'Bath & body', 'Makeup', 'Nail care']
categories = []
for sub_cat in sub_cats:
name = sub_cat.find_element(By.TAG_NAME, 'a').get_attribute('aria-label')
if name in names:
link = sub_cat.find_element(By.TAG_NAME, 'a').get_attribute('href')
category = {
"name": name,
"link": link
}
categories.append(category)
print(categories)
self.get_sub_categories(driver, categories)
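# get_sub_categories: stores each top-level beauty category, then visits its page and records the "featured
# categories" cards (with their sub-links) and the "shop by category" carousel entries as additional rce_category rows.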
def get_sub_categories(self,driver,categories):
sub_categories = []
for category in categories:
print("=============== {} ===============".format(category["name"]))
data = {}
data['parent_category_id'] = 0
data['rce_source_id'] = self.rce_source_id
data['rce_source_category_id'] = 0
data['rce_source_status'] = 1
data['category_name'] = category["name"]
data['category_page_url'] = category["link"]
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
self.db_writer.rce_category(data)
driver.get(category["link"])
##### Feature Categories
try:
f_cat = driver.find_element(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-title .a-size-extra-large')
if f_cat:
cats_c = driver.find_element(By.CSS_SELECTOR, '.a-section.octopus-pc-category-card-v2-content')
cats = cats_c.find_elements(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-item')
for cat in cats:
cat_name = cat.find_element(By.CSS_SELECTOR, '.a-size-medium.a-color-base.a-text-bold').text
url = cat.find_element(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-category-link').get_attribute("href")
# print('Name: {}, URL: {}'.format(cat_name,url))
# s_cat = {
# "name": cat_name,
# "link": url
# }
# sub_categories.append(s_cat)
data = {}
data['parent_category_id'] = 0
data['rce_source_id'] = self.rce_source_id
data['rce_source_category_id'] = 0
data['rce_source_status'] = 1
data['category_name'] = cat_name
data['category_page_url'] = url
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
self.db_writer.rce_category(data)
try:
sub_cats = cat.find_elements(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-subcategory-link')
for sub_cat in sub_cats:
s_url = sub_cat.get_attribute('href')
s_title = sub_cat.get_attribute('title')
# print('Title: {}, URL: {}'.format(s_title, s_url))
# s_cat = {
# "name": s_title,
# "link": s_url
# }
# sub_categories.append(s_cat)
data = {}
data['parent_category_id'] = 0
data['rce_source_id'] = self.rce_source_id
data['rce_source_category_id'] = 0
data['rce_source_status'] = 1
data['category_name'] = s_title
data['category_page_url'] = s_url
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
self.db_writer.rce_category(data)
except:
pass
except:
print("Feature Cat not available.")
pass
##### Shop by categories
try:
try:
cat_h = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-header')
except:
cat_h = driver.find_element(By.CSS_SELECTOR, '#contentGrid_292470')
pass
if cat_h:
cats_c = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-viewport-row-inner')
cats = cats_c.find_elements(By.TAG_NAME, 'li')
for cat in cats:
cat_name = cat.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-sub-card-title').text
url = cat.find_element(By.TAG_NAME, 'a').get_attribute('href')
# print('Name: {}, URL: {}'.format(cat_name,url))
# s_cat = {
# "name": cat_name,
# "link": url
# }
# sub_categories.append(s_cat)
data = {}
data['parent_category_id'] = 0
data['rce_source_id'] = self.rce_source_id
data['rce_source_category_id'] = 0
data['rce_source_status'] = 1
data['category_name'] = cat_name
data['category_page_url'] = url
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
self.db_writer.rce_category(data)
except Exception as e:
print('Cat not available')
pass
print(sub_categories)
# categories = amazon_categories()
# categories.start_processing()


@ -0,0 +1,255 @@
import hashlib
import logging
#import undetected_chromedriver as webdriver
from selenium import webdriver
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
import psycopg2
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from noon_db_writer import noon_db_writer
from pyvirtualdisplay import Display
from scroller.scroller import smartScroll
import time
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
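# noon_category_products: clears previous tracker rows for this crawler, loads the Noon category list from the
# database and fills the tracker table with product URLs taken from the beauty landing-page sections; category
# pagination (browse_category_page / base_products) is available but currently commented out in start_processing.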
class noon_category_products:
def __init__(self, config):
self.config = config
self.crawler_name = self.config.get("crawler_name")
#self.url = "https://www.amazon.ae/gp/browse.html?node=11497860031&ref_=nav_em_by_all_0_2_11_2"
self.product_limit = int(self.config.get("product_per_category"))
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
sql = "delete from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='"+str(self.crawler_name)+"'"
self.cur.execute(sql)
sql = f"""
select a.id, a.category_page_url from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} a
where a.rce_source_id = (
select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name = 'Noon')
"""
self.cur.execute(sql)
self.categories = self.cur.fetchall()
#self.display = Display(visible=0, size=(800, 600))
#self.display.start()
def __del__(self):
print("Closing connection.....")
self.conn.close()
#self.display.stop()
def start_processing(self):
# #### Collect section name and section products ####
section_products = self.section_products()
self.insert_tracker_tab(section_products)
# if self.categories:
# for category in self.categories:
# logging.info("======= Fetching products of {}".format(category))
# self.browse_category_page(category)
# else:
# logging.info("No category available. Stopping.......")
def browse_category_page(self, catagory):
try:
# op = webdriver.ChromeOptions()
# op.add_argument('--no-sandbox')
# op.add_argument('--disable-notifications')
# op.add_argument("--lang=en-GB")
#op.headless = True
#driver=webdriver.Chrome(version_main = 113, options=op)
# driver=webdriver.Chrome(options=op)
driver = webdriver.Firefox()
driver.get(catagory[1])
driver.implicitly_wait(10)
### Collect All products ####
self.base_products(driver, catagory[0])
driver.close()
except Exception as e:
print(e)
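# section_products: scrapes the Bestsellers, New arrivals and Clearance deals carousels from the Noon beauty
# landing page and returns their product URLs keyed by section name.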
def section_products(self):
driver = webdriver.Firefox()
driver.get('https://www.noon.com/uae-en/beauty/')
driver.implicitly_wait(5)
results = []
#Bestsellers
elements = driver.find_element(By.CSS_SELECTOR, '.componentArea-4 > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1)').find_elements(By.CSS_SELECTOR,'.sc-kCMKrZ.ealOXE')
urls = []
for element in elements:
link = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
urls.append(link)
result = {
"catagory": '3184',
"key": "Bestsellers",
"value": urls
}
results.append(result)
# New arrivals
elements = driver.find_element(By.CSS_SELECTOR, '.componentArea-18 > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1)').find_elements(By.CSS_SELECTOR,'.swiper-slide')
urls = []
for element in elements:
link = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
urls.append(link)
result = {
"catagory": '3184',
"key": "New arrivals",
"value": urls
}
results.append(result)
# Clearance deals
elements = driver.find_element(By.CSS_SELECTOR, '.componentArea-21 > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1)').find_elements(By.CSS_SELECTOR,'.swiper-slide')
urls = []
for element in elements:
link = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
urls.append(link.replace("'",""))
result = {
"catagory": '3184',
"key": "Clearance deals",
"value": urls
}
results.append(result)
print(results)
return results
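# insert_tracker_tab: inserts each collected product URL (with its category, section keyword and URL hash) into the
# tracker table, skipping URLs that are already present.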
def insert_tracker_tab(self, objs):
for obj in objs:
category = str(obj['catagory'])
key = str(obj['key'])
items = obj['value']
for item in items:
product_page_url = item
product_page_url_hash = hashlib.md5(product_page_url.encode('utf-8')).hexdigest()
flag = 0
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where product_page_url = '"+product_page_url+"'"
self.cur.execute(sql)
res = self.cur.fetchall()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+"(crawler_name,category,keyword,product_page_url,product_page_url_hash,flag) values('"+str(self.crawler_name)+"','"+str(category)+"','"+str(key)+"','"+product_page_url+"','"+product_page_url_hash+"',"+str(flag)+")"
print(sql)
self.cur.execute(sql)
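# base_products: scrolls through a category listing page, collects product URLs from the grid, records them in the
# tracker table and follows the pagination arrow until no further pages are available (capped at 15 pages).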
def base_products(self, driver, catagory):
try:
for i in range(1,16):
smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True)
# ##############
# SCROLL_PAUSE_TIME = 0.5
#
# # Get scroll height
# last_height = driver.execute_script("return document.body.scrollHeight")
#
# while True:
# # Scroll down to bottom
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#
# # Wait to load page
# time.sleep(SCROLL_PAUSE_TIME)
#
# # Calculate new scroll height and compare with last scroll height
# new_height = driver.execute_script("return document.body.scrollHeight")
# if new_height == last_height:
# break
# last_height = new_height
# #############
items = driver.find_element(By.CSS_SELECTOR, '.sc-810b5658-7.upghB.grid').find_elements(By.CSS_SELECTOR,'.sc-ff3f80d5-0.iBVDAS.wrapper.productContainer')
#smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True)
urls = []
for item in items:
url = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
urls.append(url)
result = [{
"catagory": catagory,
"key": "Base Product Page {}".format(str(i)),
"value": urls
}]
self.insert_tracker_tab(result)
try:
driver.find_elements(By.CSS_SELECTOR, '.arrowLink')[1].click()
html = driver.find_element(By.TAG_NAME, 'html')
html.send_keys(Keys.HOME)
driver.implicitly_wait(5)
except:
logging.info("No more page to navigate......")
break
except Exception as e:
print(e)
pass
config = {
"crawler_name": "raena_crawler_enginer_noon",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker_noon",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
"review_producttags_tab": "rce_ratings_reviews_producttags",
"review_tags": "rce_tags",
"source_tab": "rce_source",
"product_per_category": "1000",
"source_category": "11043145",
"db_user": "dbadmin",
"db_pass": "5qCif6eyY3Kmg4z",
"database": "analytics",
"db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
"db_port": "5432",
"crawler_main": "1",
"crawler_slave_no": ""
}
noon_category_products = noon_category_products(config)
noon_category_products.start_processing()


@ -0,0 +1,115 @@
import logging
import psycopg2
import json
from datetime import datetime
import smtplib
from email.message import EmailMessage
import requests
from noon_products import noon_products
##### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
config = {}
def slack_notification(message):
webhook_url = "https://hooks.slack.com/services/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"
slack_data = {"text": "Issue occurred on Noon Crawler. Error: " + str(message)}
response = requests.post(
webhook_url, data=json.dumps(slack_data),
headers={"Content-Type": "application/json"}
)
if response.status_code != 200:
raise ValueError(
f"Request to Slack returned an error {response.status_code}, {response.text}"
)
def send_mail():
try:
EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
From = 'data_reporting@raenabeauty.com'
To = 'shariar@raenabeauty.com'
#To = 'shariar@raenabeauty.com'
html = f'''
<!DOCTYPE html>
<html>
<body>
<div style="background-color:#eee;padding:10px 20px;">
<h2 style="font-family:Georgia, 'Times New Roman', Times, serif;color#454349;">Amazon Crawler Status</h2>
</div>
<div style="padding:20px 0px">
<div style="height: 800px;width:800px">
Error occurred. Please check the Noon pipeline.
<div style="text-align:Left;">
<p>This is system generated mail. Please do not reply</p>
</div>
</div>
</div>
</body>
</html>
'''
msg = EmailMessage()
msg['Subject'] = 'Noon Crawler Status'
msg['From'] = From
msg['To'] = To
msg.set_content(html, subtype='html')
with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp:
smtp.ehlo()
smtp.starttls()
smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
smtp.send_message(msg)
except Exception as e:
logging.info("Error while sending mail: {}".format(e))
def main():
# start = datetime.now()
# categories = amazon_categories(config)
# categories.start_processing()
# end = datetime.now()
# logging.info('Total time taken to fetch the categories: {}'.format(str(end-start)))
#
# start = datetime.now()
# products = amazon_category_products(config)
# products.start_processing()
# end = datetime.now()
# logging.info('Total time taken to fetch the category products: {}'.format(str(end-start)))
product_info = noon_products(config)
product_info.start_processing()
# ###### For test
# item = (100, 'raena_crawler_enginer_amazon', '3066', 'Up to 25 AED', 'https://www.amazon.ae/Ross-Massager-Shampoo-Silicone-Bristles/dp/B09JGH1WM3?ref_=Oct_d_oup_d_12149480031_0&pd_rd_w=lfMTW&content-id=amzn1.sym.d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_p=d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_r=C1QM2XCSJDBVMS27JV7E&pd_rd_wg=gkRZv&pd_rd_r=f5af13ee-c6c4-4d8a-8677-cba9cbacdace&pd_rd_i=B09JGH1WM3', '8f0540b5919e176303cf24a1d46b0e1c', 0)
# product_info.get_product_info(item)
if __name__ == "__main__":
logging.info("Starting Shopee Crawler.......")
try:
logging.info("Loading config file.......")
with open("conf.json", "r") as jsonfile:
config = json.load(jsonfile)
logging.info("Config file loaded.......")
print(config)
main()
#raise Exception("Sorry, no numbers below zero")
except Exception as e:
logging.info("Error: ".format(e))
#logging.info("Cannot load config file. Please check. Exiting......")
#send_mail()
slack_notification(e)
exit(1)


@ -0,0 +1,590 @@
import logging
import psycopg2
###### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
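# noon_db_writer: upserts category, product, variant, brand and reseller rows; every insert or update is mirrored
# into the corresponding aud_* audit table so that changes over time are preserved.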
class noon_db_writer:
def __init__(self, config):
self.config = config
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
def __del__(self):
logging.info("Closing connection.....")
self.conn.close()
def rce_category(self, data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where category_name = '"+str(data['category_name'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
cat_name = data['category_name'].replace("'","''")
cat_url = data['category_page_url'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" (parent_category_id,rce_source_id," \
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name) values (" \
+str(data['parent_category_id'])+","+str(data['rce_source_id'])+", "+str(data['rce_source_category_id'])+", "+str(data['rce_source_status'])+", " \
"'"+str(cat_url)+"', '"+str(data['category_page_url_hash'])+"', '"+str(cat_name)+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
"select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
"category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
"where rce_source_category_id = "+ str(data['rce_source_category_id'])
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \
str(data['category_page_url'])==str(res[5]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set updatedat=now() " \
"where category_name = '"+ str(res[7])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set parent_category_id = " \
""+str(data['parent_category_id'])+", rce_source_category_id = "+str(data['rce_source_category_id'])+", " \
"category_name='"+str(cat_name)+"', category_page_url='"+str(cat_url)+"', " \
"category_page_url_hash='"+str(data['category_page_url_hash'])+"', updatedat=now() where " \
"category_name = '"+ str(res[7])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
"select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
"category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
"where category_name = '"+ str(res[7])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_product(self, data):
data['product_page_url'] = data['product_page_url'].replace("'","")
data['rce_source_product_name'] = data['rce_source_product_name'].replace("'","")
data['product_description'] = data['product_description'].replace("'","")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url = '"+str(data['product_page_url'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" (rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"product_section,rce_source_id) values("+str(data['rce_source_product_id'])+","+str(data['rce_source_product_status'])+",'"+str(data['product_page_url'])+"'," \
"'"+str(data['product_page_url_hash'])+"',"+str(data['rce_category_id'])+","+str(data['rce_brand_id'])+","+str(data['rce_store_id'])+"," \
"'"+str(data['rce_source_product_name'])+"','"+str(data['product_images'])+"','"+str(data['product_description'])+"',"+str(data['product_sold_total'])+"," \
""+str(data['product_sold'])+",'"+str(data['product_price_min'])+"','"+str(data['product_price_min_before_discount'])+"','"+str(data['product_price_max'])+"'," \
"'"+str(data['product_price_max_before_discount'])+"','"+str(data['ratings'])+"','"+str(data['product_section'])+"',"+str(data['rce_source_id'])+")"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
"product_page_url='"+str(data['product_page_url'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \
str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \
str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \
str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \
str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \
str(data['product_price_max'])==str(res[15]) \
and str(data['product_price_max_before_discount'])==str(res[16]) \
and str(data['ratings'])==str(res[17]) and str(data['rce_source_id'])==str(res[21]) and \
str(data['product_section'])==str(res[22]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set updatedat=now() " \
"where product_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set rce_source_product_id="+str(data['rce_source_product_id'])+"," \
"rce_source_product_status="+str(data['rce_source_product_status'])+",product_page_url='"+str(data['product_page_url'])+"',product_page_url_hash= " \
"'"+str(data['product_page_url_hash'])+"',rce_category_id="+str(data['rce_category_id'])+",rce_brand_id="+str(data['rce_brand_id'])+"," \
"rce_store_id="+str(data['rce_store_id'])+",rce_source_product_name='"+str(data['rce_source_product_name'])+"',product_images='"+str(data['product_images'])+"'" \
",product_description='"+str(data['product_description'])+"',product_sold_total="+str(data['product_sold_total'])+",product_sold="+str(data['product_sold'])+"," \
"product_price_min='"+str(data['product_price_min'])+"',product_price_min_before_discount='"+str(data['product_price_min_before_discount'])+"'," \
"product_price_max='"+str(data['product_price_max'])+"',product_price_max_before_discount='"+str(data['product_price_max_before_discount'])+"',ratings='"+str(data['ratings'])+"'," \
"product_section='"+str(data['product_section'])+"', updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where product_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
"product_page_url='"+str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_product_variant(self, data):
data['product_variant_name'] = data['product_variant_name'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name = '"+str(data['product_variant_name'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" (rce_source_variant_id,rce_product_id," \
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock) values("+str(data['rce_source_variant_id'])+"," \
""+str(data['rce_product_id'])+",'"+str(data['product_variant_name'])+"','"+str(data['product_variant_price'])+"'," \
"'"+str(data['product_variant_price_before_discount'])+"',"+str(data['product_variant_stock'])+")"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(data['product_variant_name'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \
str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set updatedat=now() " \
"where product_variant_name = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set rce_source_variant_id="+str(data['rce_source_variant_id'])+", " \
"rce_product_id="+str(data['rce_product_id'])+", product_variant_name='"+str(data['product_variant_name'])+"', product_variant_price=" \
"'"+str(data['product_variant_price'])+"',product_variant_price_before_discount='"+str(data['product_variant_price_before_discount'])+"'," \
"product_variant_stock="+str(data['product_variant_stock'])+", updatedat=now() where product_variant_name = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_brand(self, data):
data['brand_page_url'] = data['brand_page_url'].replace("'","''")
data['brand_name'] = data['brand_name'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url = '"+str(data['brand_page_url'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" (rce_source_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name) values("+str(data['rce_source_id'])+"," \
""+str(data['rce_source_brand_status'])+",'"+str(data['brand_page_url'])+"','"+str(data['brand_page_url_hash'])+"'," \
"'"+str(data['brand_name'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(data['brand_page_url'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \
str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set updatedat=now() " \
"where brand_page_url = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
"rce_source_brand_status="+str(data['rce_source_brand_status'])+", brand_page_url='"+str(data['brand_page_url'])+"', brand_page_url_hash=" \
"'"+str(data['brand_page_url_hash'])+"',brand_name='"+str(data['brand_name'])+"', updatedat=now() where brand_page_url = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status, " \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_reseller(self, data):
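# Upsert a reseller record keyed on reseller_name, mirroring changes into the aud_ audit table.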
data['reseller_name'] = data['reseller_name'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" (rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description) values("+str(data['rce_source_id'])+"," \
""+str(data['rce_source_reseller_status'])+",'"+str(data['reseller_name'])+"','"+str(data['reseller_average_rating'])+"'," \
"'"+str(data['reseller_description'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(data['reseller_name'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if data['rce_source_id']==res[1] and str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]) and \
str(data['reseller_average_rating'])==str(res[5]) and str(data['reseller_description'])==str(res[6]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=now() " \
"where reseller_name = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
"rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \
"'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=now() where reseller_name = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_reseller_store(self, data):
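# Upsert a reseller store keyed on store_page_url, with the same audit-table mirroring.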
data['store_page_url'] = data['store_page_url'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" (rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,rce_source_id) values(" \
""+str(data['rce_source_store_status'])+",'"+str(data['store_page_url'])+"','"+str(data['store_page_url_hash'])+"'," \
"'"+str(data['store_location'])+"', "+str(data['rce_reseller_id'])+", "+str(data['rce_source_id'])+")"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(data['store_page_url'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \
str(data['store_page_url_hash'])==str(res[4]) and str(data['store_location'])==str(res[5]) and \
str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=now() " \
"where store_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \
"rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \
"'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \
"updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews(self, data):
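# Upsert a rating/review keyed on (rce_product_id, username), mirroring changes into the audit table.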
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id = "+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
data['username'] = data['username'].replace("'","''")
data['img_url'] = data['img_url'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" (id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating) values("+str(data['id'])+","+str(data['rce_product_id'])+"," \
"'"+str(data['username'])+"','"+str(data['review'])+"','"+str(data['img_url'])+"',"+str(data['review_like_count'])+",'"+str(data['user_tier'])+"'," \
""+str(data['shop_id'])+", '"+str(data['video_url'])+"', '"+str(data['rating'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \
str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \
str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=now() " \
"where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \
"username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \
"'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \
"shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=now() " \
"where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(res[1])+" and username ='"+str(data['username'])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews_productmodels(self,data):
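# Upsert the review-to-product-model mapping keyed on rce_rating_id.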
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id'])
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \
"values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=now() " \
"where rce_rating_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \
"updatedat=now() where rce_source_store_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+""
#logging.info(sql)
self.cur.execute(sql)
def rce_tags(self,data):
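# Upsert a review tag keyed on its description.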
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \
"values("+str(data['id'])+",'"+str(data['description'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
"createdat,updatedat) select id,description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['description'])==str(res[1]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=now() " \
"where description = '"+ str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \
"updatedat=now() where description = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
"createdat,updatedat) select id,description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews_producttags(self,data):
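# Upsert the review-to-tag mapping keyed on rce_rating_id.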
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \
"values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
"createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_rating_id'])==str(res[1]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=now() " \
"where rce_rating_id = '"+ str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \
"updatedat=now() where rce_rating_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
"createdat,updatedat) select id,description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)
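# Usage sketch (illustration only, not part of the original file): a crawler builds a plain
# dict with the expected keys and hands it to the matching writer method, e.g.
#
#     db_writer.rce_brand({
#         "rce_source_id": 1,
#         "rce_source_brand_status": 1,
#         "brand_page_url": "https://www.noon.com/uae-en/example-brand",
#         "brand_page_url_hash": "<md5 of the url>",
#         "brand_name": "Example Brand",
#     })
#
# The field names mirror the columns referenced above; the values shown are placeholders.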

View File

@ -0,0 +1,426 @@
import hashlib
import json
import logging
import random
import sys
import string
import psycopg2
import time
import re
import requests
from noon_db_writer import noon_db_writer
from datetime import datetime
from noon_raw_product import get_product_info_raw
class noon_products:
def __init__(self, config):
self.config = config
self.crawler_name = self.config.get("crawler_name")
self.pattern = r'[' + string.punctuation + ']'
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Noon'")
self.rce_source_id = self.cur.fetchone()[0]
self.cur.execute("select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='raena_crawler_enginer_noon' and flag=0")
self.items = self.cur.fetchall()
self.db_writer = noon_db_writer(config)
#self.display = Display(visible=0, size=(800, 600))
#self.display.start()
def __del__(self):
print("Closing connection.....")
self.conn.close()
#self.display.stop()
def slack_notification(self, message):
webhook_url = "https://hooks.slack.com/services/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"
slack_data = {"text": "Issue occurred on Noon Crawler. Error: " + str(message)}
response = requests.post(
webhook_url, data=json.dumps(slack_data),
headers={"Content-Type": "application/json"}
)
if response.status_code != 200:
raise ValueError(
f"Request to Slack returned an error {response.status_code}, {response.text}"
)
def start_processing(self):
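# Iterate over unprocessed tracker rows (flag=0), fetch each product, and report failures to Slack.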
count = 0
for item in self.items:
count += 1
try:
logging.info("============== Getting info for {}/{}: {} ================".format(str(count),str(len(self.items)),str(item)))
start = datetime.now()
self.get_product_info(item)
end = datetime.now()
logging.info('Total time taken to fetch the product: {}'.format(str(end-start)))
# sleeptime = random.randint(20,50)
# logging.info("Sleeping for {} sec".format(str(sleeptime)))
# time.sleep(sleeptime)
time.sleep(5)
except Exception as e:
print(e)
self.slack_notification(e)
def reseller_info(self, data):
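# Write reseller and reseller-store rows for every offer attached to the first variant;
# returns the rce_reseller_id resolved for the first store.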
try:
stores = data["product"]["variants"][0]["offers"]
if stores:
return_item = ""
flag = 0
for store in stores:
##### reseller info
data_reseller = {}
data_reseller['rce_source_id'] = self.rce_source_id
data_reseller['rce_source_reseller_status'] = 1
data_reseller['reseller_name'] = ""
data_reseller['reseller_average_rating'] = 0.0
data_reseller['reseller_description'] = ""
try:
data_reseller['reseller_name'] = store["store_name"]
data_reseller['reseller_name'] = data_reseller['reseller_name'].replace("'","")
except:
pass
try:
data_reseller['reseller_average_rating'] = float(store["partner_ratings_sellerlab"]["partner_rating"])
except:
pass
try:
self.db_writer.rce_reseller(data_reseller)
except Exception as e:
logging.info(e)
##### Store info
data_reseller_store = {}
data_reseller_store['rce_source_store_status'] = 1
data_reseller_store['store_page_url'] = ""
data_reseller_store['store_page_url_hash'] = ""
data_reseller_store['store_location'] = ""
data_reseller_store['rce_reseller_id'] = ""
data_reseller_store['rce_source_id'] = self.rce_source_id
try:
data_reseller_store['store_page_url'] = "https://www.noon.com/uae-en/seller/" + store["store_code"]
data_reseller_store['store_page_url'] = data_reseller_store['store_page_url'].replace("'","")
data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest()
except:
pass
try:
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data_reseller['reseller_name'])+"'")
rce_reseller_id = self.cur.fetchone()
data_reseller_store['rce_reseller_id'] = rce_reseller_id[0]
if flag == 0:
return_item = data_reseller_store['rce_reseller_id']
flag = 1
except:
pass
try:
self.db_writer.rce_reseller_store(data_reseller_store)
except Exception as e:
logging.info(e)
return return_item
except Exception as e:
print(e)
def brand_info(self, data):
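# Build the brand page URL from brand_code, write the brand row, and return the brand name.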
data_brand = {}
data_brand['rce_source_id'] = self.rce_source_id
data_brand['rce_source_brand_status'] = 1
data_brand['brand_page_url'] = ""
data_brand['brand_page_url_hash'] = ""
data_brand['brand_name'] = ""
try:
data_brand['brand_page_url'] = "https://www.noon.com/uae-en/" + data["product"]["brand_code"]
data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest()
try:
data_brand['brand_name'] = data["product"]["brand"]
data_brand['brand_name'] = data_brand['brand_name'].replace("'","")
except:
pass
try:
self.db_writer.rce_brand(data_brand)
except Exception as e:
logging.info(e)
return data_brand['brand_name']
except:
pass
def product_info(self, data, category, keyword, url, url_hash, brand_name, rce_reseller_id):
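# Assemble the product row (prices, images, description, rating) from the raw payload and write it,
# then write one variant row per option in the first variant group.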
data_product = {}
data_product['rce_source_product_id'] = 0
data_product['rce_source_id'] = self.rce_source_id
data_product['rce_source_product_status'] = 1
data_product['product_page_url'] = url.replace("'","''")
data_product['product_page_url_hash'] = url_hash
data_product['rce_category_id'] = int(category)
data_product['rce_brand_id'] = ""
data_product['rce_store_id'] = ""
data_product['rce_source_product_name'] = ""
data_product['product_images'] = ""
data_product['product_description'] = ""
data_product['product_sold_total'] = 0
data_product['product_sold'] = 0
data_product['product_price_min'] = ""
data_product['product_price_min_before_discount'] =""
data_product['product_price_max'] = ""
data_product['product_price_max_before_discount'] = ""
data_product['ratings'] = 0.0
data_product['product_section'] = keyword
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_name = '"+str(brand_name)+"'"
self.cur.execute(sql)
data_product['rce_brand_id'] = self.cur.fetchone()[0]
except: pass
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+""
self.cur.execute(sql)
data_product['rce_store_id'] = self.cur.fetchone()[0]
except: pass
try:
rce_source_product_name = data["product"]["product_title"]
data_product['rce_source_product_name'] = str(re.sub(self.pattern, '', rce_source_product_name)).replace("'","")
except: pass
try:
images = data["product"]["image_keys"]
data_product['product_images'] = ','.join(images)
#print(data_product['product_images'])
except: pass
try:
data_product['product_description'] = data["product"]["long_description"] + " ".join(data["product"]["feature_bullets"])
data_product['product_description'] = str(re.sub(self.pattern, '', data_product['product_description'])).replace("'","")
except:
pass
try:
data_product['product_price_min'] = str(data["product"]["variants"][0]["offers"][0]["sale_price"])
data_product['product_price_max'] = data_product['product_price_min']
except:
data_product['product_price_min'] = str(data["product"]["variants"][0]["offers"][0]["price"])
data_product['product_price_max'] = data_product['product_price_min']
pass
try:
data_product['product_price_min_before_discount'] = str(data["product"]["variants"][0]["offers"][0]["price"])
data_product['product_price_max_before_discount'] = str(data["product"]["variants"][0]["offers"][0]["price"])
except:
pass
try:
data_product['ratings'] = float(data["product"]["product_rating"]["value"])
#print(data_product['ratings'])
except:
pass
try:
self.db_writer.rce_product(data_product)
except Exception as e:
logging.info(e)
### rce_product_variant
try:
variants = data["product"]["groups"][0]["options"]
if variants:
for variant in variants:
data_variant = {}
data_variant['rce_source_variant_id'] = 0
data_variant['rce_product_id'] = ""
data_variant['product_variant_name'] = ""
data_variant['product_variant_price'] = 0
data_variant['product_variant_price_before_discount'] = 0
data_variant['product_variant_stock'] = 0
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_name = '"+str(data_product['rce_source_product_name'])+"'"
self.cur.execute(sql)
data_variant['rce_product_id'] = self.cur.fetchone()[0]
except:
pass
try:
product_variant_name = variant["name"]
data_variant['product_variant_name'] = str(re.sub(self.pattern, '', product_variant_name)).replace("'","''")
except: pass
try:
self.db_writer.rce_product_variant(data_variant)
except Exception as e:
logging.info(e)
time.sleep(random.randint(2,5))
else:
logging.info('No variant found')
except:
logging.info('No variant found')
pass
def rating_info(self, data, rce_reseller_id, url_hash):
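# Collect Arabic and English review comments from the payload and write each one as a ratings/reviews row.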
try:
data_reviews = []
data_reviews_ar = []
data_reviews_en = []
try:
if data["product"]["reviews"]["comments"]["ar"]["reviews"]:
data_reviews_ar = data["product"]["reviews"]["comments"]["ar"]["reviews"]
data_reviews.extend(data_reviews_ar)
except:
pass
try:
if data["product"]["reviews"]["comments"]["en"]["reviews"]:
data_reviews_en = data["product"]["reviews"]["comments"]["en"]["reviews"]
data_reviews.extend(data_reviews_en)
except:
pass
for review in data_reviews:
data_review = {}
data_review["id"] = ""
data_review["rce_product_id"] = ""
data_review["username"] = ""
data_review["review"] = ""
data_review["img_url"] = ""
data_review["review_like_count"] = 0
data_review["user_tier"] = ""
data_review["shop_id"] = 0
data_review["video_url"] = ""
data_review["rating"] = ""
try:
sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')
self.cur.execute(sql)
rating_id = self.cur.fetchone()
if rating_id[0]==None:
rating_id = 1
else:
rating_id = int(rating_id[0]) + 1
data_review["id"] = rating_id
except:
pass
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url_hash = '"+str(url_hash)+"'"
self.cur.execute(sql)
data_review["rce_product_id"] = self.cur.fetchone()[0]
except: pass
try: data_review["username"] = review["displayName"]
except: pass
try:
title = ""
comment = ""
try:
title = review["title"]
except:
pass
try:
comment = review["comment"]
except:
pass
data_review["review"] = title + comment
data_review["review"] = data_review["review"].replace("'","")
except: pass
try:
data_review["review_like_count"] = review["helpfulCount"]
except:
pass
try:
data_review["rating"] = review["rating"]
except: pass
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+""
self.cur.execute(sql)
data_review["shop_id"] = self.cur.fetchone()[0]
except: pass
try:
self.db_writer.rce_ratings_reviews(data_review)
except Exception as e:
logging.info(e)
except:
pass
def get_product_info(self,item):
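# Tracker row layout (per the product_info call below): item[2]=category id, item[3]=keyword,
# item[4]=product page URL, item[5]=URL hash.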
try:
data = get_product_info_raw(item[4])
##### Reseller info #####
rce_reseller_id = self.reseller_info(data)
##### Product Info #####
##### Brand Info
brand_name = self.brand_info(data)
##### Product info
self.product_info(data, item[2], item[3], item[4], item[5], brand_name, rce_reseller_id)
##### Rating Info #####
self.rating_info(data, rce_reseller_id, item[5])
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1 where product_page_url_hash='{item[5]}'
"""
self.cur.execute(sql)
except Exception as e:
print(e)

View File

@ -0,0 +1,62 @@
import json
import logging
import requests
# import random
# import string
# import uuid
# import time
# import jwt
from urllib.parse import urlparse, quote
##### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
def get_product_info_raw(url):
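# Rewrite the public product URL to Noon's internal catalog API path and fetch it through the
# local relay at localhost:3090, returning the 'data' element of the JSON response.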
#parsed_url = urlparse(url)
parsed_url = url.replace("noon.com/uae-en/", "noon.com/_svc/catalog/api/v3/u/")
print(parsed_url)
encoded_url = quote(parsed_url, safe='')
api_url= 'http://localhost:3090/rcs/v1/noon/'
print(url)
print(api_url+encoded_url)
response = requests.request("GET", api_url+encoded_url)
logging.info(response)
print(api_url+encoded_url)
data = json.loads(response.text)
return data['data']
# def generate_sentry_trace():
# trace_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=32))
# span_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=16))
# sampling_decision = random.randint(0, 1)
#
# sentry_trace = f'{trace_id}-{span_id}-{sampling_decision}'
# return sentry_trace
#
# def generate_x_visitor_id():
# x_visitor_id = str(uuid.uuid4())
# return x_visitor_id
#
# def generate_cookie():
# payload = {
# 'raId': 'd1e3f451135d40958672d78da1f8c612',
# 'iat': int(time.time()),
# 'exp': int(time.time()+60)
# }
# # Generate the cookie string without a secret key
# cookie = jwt.encode(payload, '', algorithm='HS256')
#
# return cookie
# url = 'https://www.noon.com/uae-en/niacinamide-10-and-zinc-1-clear-30ml/N23772548A/p/?o=cbd635fab2298abe'
# #
# print(get_product_info_raw(url))

View File

@ -0,0 +1,30 @@
import hashlib
import logging
#import undetected_chromedriver as webdriver
from selenium import webdriver
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
import psycopg2
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from noon_db_writer import noon_db_writer
from pyvirtualdisplay import Display
from scroller.scroller import smartScroll
import time
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
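# Standalone scratch script: open the Noon beauty landing page and print the link of each carousel slide.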
driver = webdriver.Firefox()
driver.get('https://www.noon.com/uae-en/beauty/')
driver.implicitly_wait(5)
elements = driver.find_element(By.XPATH, '//*[@id="__next"]/div/section/div/div/div[23]/div/div/div/div/div/div/div/div/div[2]/div[1]/div').find_elements(By.CSS_SELECTOR,'.swiper-slide')
for element in elements:
link = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
print(link)
driver.close()

View File

@ -0,0 +1,20 @@
import requests
import json
def slack_notification(message):
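# Post the given message to the Slack incoming webhook and raise if Slack does not return HTTP 200.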
webhook_url = "https://hooks.slack.com/services/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"
slack_data = {"text": message}
response = requests.post(
webhook_url, data=json.dumps(slack_data),
headers={"Content-Type": "application/json"}
)
if response.status_code != 200:
raise ValueError(
f"Request to Slack returned an error {response.status_code}, {response.text}"
)
message = "Hello from Python!"
slack_notification(message)

View File

View File

@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class RaenaCrawlerItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass

View File

@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class RaenaCrawlerSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, or item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Request or item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn't have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)
class RaenaCrawlerDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)

View File

@ -0,0 +1,18 @@
# pipelines.py
import json
class OliveYoungPipeline:
def __init__(self):
self.file = None
def open_spider(self, spider):
self.file = open('output.json', 'w')
def close_spider(self, spider):
self.file.close()
def process_item(self, item, spider):
line = json.dumps(item) + "\n"
self.file.write(line)
return item

View File

@ -0,0 +1,111 @@
appdirs==1.4.4
appnope @ file:///opt/concourse/worker/volumes/live/4f734db2-9ca8-4d8b-5b29-6ca15b4b4772/volume/appnope_1606859466979/work
argon2-cffi @ file:///opt/conda/conda-bld/argon2-cffi_1645000214183/work
argon2-cffi-bindings @ file:///opt/concourse/worker/volumes/live/c6f9b05d-dc80-4dbc-7473-70bfcb66883c/volume/argon2-cffi-bindings_1644569703264/work
attrs @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_33k1uces4n/croot/attrs_1668696162258/work
Automat==22.10.0
backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
bleach @ file:///opt/conda/conda-bld/bleach_1641577558959/work
brotlipy==0.7.0
certifi @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_477u68wvzm/croot/certifi_1671487773341/work/certifi
cffi @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_1b0qzba5nr/croot/cffi_1670423213150/work
charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
colorama @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_f5t80kwp9l/croot/colorama_1672386533201/work
ConfigUpdater @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_495uyr_0u4/croot/configupdater_1668698019809/work
constantly==15.1.0
cryptography @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_19cvzxmeb9/croot/cryptography_1677533085498/work
cssselect==1.2.0
debugpy @ file:///opt/concourse/worker/volumes/live/32b11d06-4d64-4ec8-497a-cf4fc97343d2/volume/debugpy_1637091821874/work
decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work
entrypoints @ file:///opt/concourse/worker/volumes/live/194c0a28-55ce-4e83-6a87-0d9f2e06ab2c/volume/entrypoints_1649926487944/work
fake-useragent==1.2.1
Faker==18.13.0
fastjsonschema @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b5c1gee32t/croots/recipe/python-fastjsonschema_1661368622875/work
filelock==3.12.2
hyperlink @ file:///tmp/build/80754af9/hyperlink_1610130746837/work
idna @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_00jf0h4zbt/croot/idna_1666125573348/work
imagesize @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_4a6ed1be-fe30-4d6a-91d4-f867600caa0be5_dxzvt/croots/recipe/imagesize_1657179500955/work
importlib-metadata @ file:///opt/concourse/worker/volumes/live/4e1a3384-472f-4bcb-7776-cb0076aaea40/volume/importlib-metadata_1648562431336/work
importlib-resources @ file:///tmp/build/80754af9/importlib_resources_1625135880749/work
incremental==22.10.0
ipykernel @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_05yte6zd0k/croots/recipe/ipykernel_1662361808878/work
ipython @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b9echyik_d/croots/recipe/ipython_1659529861316/work
ipython-genutils @ file:///tmp/build/80754af9/ipython_genutils_1606773439826/work
itemadapter==0.8.0
itemloaders==1.1.0
jedi @ file:///opt/concourse/worker/volumes/live/c9d2fa99-8bc1-4572-41e7-6beba6391441/volume/jedi_1644315238822/work
Jinja2 @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_6adj7x0ejx/croot/jinja2_1666908137966/work
jmespath==1.0.1
jsonschema @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_21cqeq1xnk/croot/jsonschema_1676558686956/work
jupyter_client @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_41tzpfqkok/croots/recipe/jupyter_client_1661848920196/work
jupyter_core @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_fc_0us_ta7/croot/jupyter_core_1668084443574/work
jupyterlab-pygments @ file:///tmp/build/80754af9/jupyterlab_pygments_1601490720602/work
langcodes @ file:///opt/conda/conda-bld/langcodes_1643477751144/work
lxml==4.9.3
MarkupSafe @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_d4a9444f-bd4c-4043-b47d-cede33979b0fve7bm42r/croots/recipe/markupsafe_1654597878200/work
matplotlib-inline @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_9ddl71oqte/croots/recipe/matplotlib-inline_1662014471815/work
mccabe @ file:///opt/conda/conda-bld/mccabe_1644221741721/work
mistune==0.8.4
nbclient @ file:///opt/concourse/worker/volumes/live/2b77047f-e15a-4d19-54ac-7d87d20b74de/volume/nbclient_1650308375803/work
nbconvert @ file:///opt/concourse/worker/volumes/live/84c159ef-8fac-4372-7b64-25f831ab7aec/volume/nbconvert_1624479064764/work
nbformat @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_2daun1fill/croot/nbformat_1670352339504/work
nest-asyncio @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_64pfm74mxq/croot/nest-asyncio_1672387129786/work
nose @ file:///opt/conda/conda-bld/nose_1642704612149/work
notebook @ file:///opt/concourse/worker/volumes/live/f984e24b-6ef4-4a5b-55be-c5db1417e27a/volume/notebook_1621528337539/work
packaging @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_bet5qdixgt/croot/packaging_1671697440883/work
pandocfilters @ file:///opt/conda/conda-bld/pandocfilters_1643405455980/work
parsel==1.8.1
parso @ file:///tmp/build/80754af9/parso_1617223946239/work
pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
pkgutil_resolve_name @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_c9l5hym8w0/croots/recipe/pkgutil-resolve-name_1661463329338/work
prometheus-client @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_19kjbndib7/croots/recipe/prometheus_client_1659455105394/work
prompt-toolkit @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_82emz7mook/croot/prompt-toolkit_1672387300396/work
Protego==0.2.1
psutil @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_c9b604bf-685f-47f6-8304-238e4e70557e1o7mmsot/croots/recipe/psutil_1656431274701/work
psycopg2-binary==2.9.7
ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
py @ file:///tmp/build/80754af9/py_1607971587848/work
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycodestyle @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_a7riaf725h/croot/pycodestyle_1674267226642/work
pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
PyDispatcher==2.0.7
pyflakes @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_a87qrne4ps/croot/pyflakes_1674165135821/work
Pygments @ file:///opt/conda/conda-bld/pygments_1644249106324/work
pyOpenSSL @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_6dweji2whw/croot/pyopenssl_1677607689781/work
pyrsistent @ file:///opt/concourse/worker/volumes/live/24b7a9ab-37d8-463c-575f-69184f9cfbc8/volume/pyrsistent_1636111022304/work
PySocks @ file:///opt/concourse/worker/volumes/live/ef943889-94fc-4539-798d-461c60b77804/volume/pysocks_1605305801690/work
python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
pyzmq @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_15f7a459-ad98-422b-b8da-cbf1f626e2115nt0ocwy/croots/recipe/pyzmq_1657724193704/work
queuelib==1.6.2
requests @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_516b78ce-034d-4395-b9b5-1d78c2847384qtnol99l/croots/recipe/requests_1657734628886/work
requests-file @ file:///Users/ktietz/demo/mc3/conda-bld/requests-file_1629455781986/work
Scrapy==2.9.0
scrapy-fake-useragent==1.4.4
scrapy-rotating-proxies==0.6.2
scrapy-splash==0.9.0
Send2Trash @ file:///tmp/build/80754af9/send2trash_1632406701022/work
service-identity==21.1.0
six @ file:///tmp/build/80754af9/six_1644875935023/work
snowballstemmer @ file:///tmp/build/80754af9/snowballstemmer_1637937080595/work
sphinxcontrib-devhelp @ file:///home/ktietz/src/ci/sphinxcontrib-devhelp_1611920923094/work
sphinxcontrib-jsmath @ file:///home/ktietz/src/ci/sphinxcontrib-jsmath_1611920942228/work
sphinxcontrib-qthelp @ file:///home/ktietz/src/ci/sphinxcontrib-qthelp_1611921055322/work
sphinxcontrib-serializinghtml @ file:///tmp/build/80754af9/sphinxcontrib-serializinghtml_1624451540180/work
terminado @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_18_p3gbeio/croot/terminado_1671751835656/work
testpath @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_aaf4aec7-dbb6-43d6-9707-824338b4efc82yrt6xjp/croots/recipe/testpath_1655908558843/work
tldextract==3.4.4
toml @ file:///tmp/build/80754af9/toml_1616166611790/work
tornado @ file:///opt/concourse/worker/volumes/live/d531d395-893c-4ca1-6a5f-717b318eb08c/volume/tornado_1606942307627/work
traitlets @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_0dtilxc0bw/croot/traitlets_1671143889152/work
Twisted==22.10.0
typing==3.7.4.3
typing_extensions @ file:///opt/conda/conda-bld/typing_extensions_1647553014482/work
urllib3 @ file:///opt/conda/conda-bld/urllib3_1643638302206/work
w3lib==2.1.1
wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
webencodings==0.5.1
zipp @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b71z79bye2/croot/zipp_1672387125902/work
zope.interface==6.0

View File

@ -0,0 +1,18 @@
#!/bin/bash
docker rm splash-local
docker pull scrapinghub/splash
docker run --name splash-local -p 8050:8050 -d scrapinghub/splash
sleep 10
scrapy crawl oliveyoung_product
sleep 10
scrapy crawl tiktok_hashtag
docker stop splash-local
docker rm splash-local

View File

@ -0,0 +1,117 @@
# Scrapy settings for raena_crawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "raena_crawler"
SPIDER_MODULES = ["raena_crawler.spiders"]
NEWSPIDER_MODULE = "raena_crawler.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "raena_crawler (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "raena_crawler.middlewares.RaenaCrawlerSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "raena_crawler.middlewares.RaenaCrawlerDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'raena_crawler.pipelines.OliveYoungPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
'rotating_proxies.middlewares.RotatingProxyMiddleware': 610,
'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
}
SPIDER_MIDDLEWARES = {
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
FAKEUSERAGENT_PROVIDERS = [
'scrapy_fake_useragent.providers.FakeUserAgentProvider', # This is the first provider we'll try
'scrapy_fake_useragent.providers.FakerProvider', # If FakeUserAgentProvider fails, we'll use faker to generate a user-agent string for us
'scrapy_fake_useragent.providers.FixedUserAgentProvider', # Fall back to USER_AGENT value
]
USER_AGENT = 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,94 @@
import scrapy
from scrapy_splash import SplashRequest
import psycopg2
import logging
config = {
"db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
"db_port": "5432",
"db": "analytics",
"db_user": "dbadmin",
"db_pass": "5qCif6eyY3Kmg4z"
}
class OliveyoungSpider(scrapy.Spider):
name = 'oliveyoung_product'
allowed_domains = ['global.oliveyoung.com']
def start_requests(self):
url = 'https://global.oliveyoung.com/'
yield SplashRequest(url, self.parse, args={'wait': 5})
def parse(self, response):
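# Scrape brand, name and prices from each homepage section and upsert them into
# raena_spider_management.oliveyoung_products.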
conn = psycopg2.connect(database=config.get('db'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
logging.info(conn)
conn.autocommit = True
cur = conn.cursor()
product_sections = [
('Best Sellers','#\#tab12'),
('MDS PICK','#\#tab22'),
('K-POP','div.main-section:nth-child(6) > div:nth-child(2)'),
('Featured','.main-brand-banner'),
('RECOMMENDATION','div.main-section:nth-child(9) > div:nth-child(2)'),
('FEATURED BRANDS', '#featuredBrands > div:nth-child(2)')
]
for product_section in product_sections:
products = response.css(str(product_section[1]))
product_selector = '.wrap-prd-info'
brand_selector = '.list-thumb-tit::text'
if 'FEATURED BRANDS' in product_section[0]:
product_selector = '.fig-title.ellipsis'
brand_selector = '.fig-title.ellipsis::text'
for product in products:
items = product.css(product_selector)
for item in items:
product_brand = (item.css(brand_selector).extract_first("")).replace("'","").strip()
product_name = item.css('.list-thumb-info::text').extract_first("").replace("'","").strip()
original_price = item.css('.price-cost::text').extract_first("").strip()
discounted_price = item.css('.prd-list-amountDue::text').extract_first("").strip()
logging.info("Collecting data for: {}".format(product_name))
sql = f"""
select product_section,product_brand,product_name from raena_spider_management.oliveyoung_products where product_section='{product_section[0]}' and product_brand='{product_brand}' and product_name='{product_name}'
"""
#logging.info(sql)
cur.execute(sql)
res = cur.fetchone()
if res:
sql = f"""
update raena_spider_management.oliveyoung_products set original_price='{original_price}',
discounted_price='{discounted_price}', updatedat=now()
where product_section='{product_section[0]}' and product_brand='{product_brand}' and product_name='{product_name}'
"""
#logging.info(sql)
cur.execute(sql)
else:
sql = f"""
insert into raena_spider_management.oliveyoung_products(product_section,product_brand,product_name,original_price,discounted_price,createdat,updatedat)
values('{product_section[0]}','{product_brand}','{product_name}','{original_price}','{discounted_price}',now(),now())
"""
#logging.info(sql)
cur.execute(sql)
conn.close()

View File

@ -0,0 +1,63 @@
# oliveyoung.py
import scrapy
import requests
class OliveYoungSpider(scrapy.Spider):
name = 'oliveyoung_bk'
start_urls = [
'https://global.oliveyoung.com/?gad=1&gclid=CjwKCAjwq4imBhBQEiwA9Nx1Bi5w7mSF9wgKTFqfX37hyG_c3ocYHldGoXbIX1XfYKQQFxLOPECJCxoCxpEQAvD_BwE']
def parse(self, response):
sections = {
"Best Sellers": "//div[@class='slick-slider-customized']/div[contains(@class,'slick-slide')]",
# "MD's Pick": "//section[@id='md_pick']/div[@class='item']/div[@class='product-item']",
# "Featured Brands": "//section[@id='brand_list']/div[@class='product-item']",
# "K-Pop": "//section[@id='kpop_list']/div[@class='product-item']",
# "INNISFREE": "//section[@id='brand_zone']/div[contains(@class,'brand-inn-store')]//div["
# "@class='product-item']",
# "Recommendation": "//section[@id='recommendation']/div[contains(@class,'product-item')]",
}
# Extract data from each section
for section_name, section_xpath in sections.items():
products = response.xpath(section_xpath)
for product in products:
brand_name = product.xpath(".//span[@class='brand']/text()").get()
product_name = product.xpath(".//span[@class='name']/text()").get()
price = product.xpath(".//span[@class='num']/text()").get()
if brand_name:
yield {
"brand_name": brand_name.strip(),
"product_name": product_name.strip(),
"price": price.strip(),
"section": section_name,
}
# # Generate hashtags for each brand name
# hashtags = [word.lower() for word in brand_name.split()]
# hashtags = '#'.join(hashtags)
# yield {
# "brand_name": brand_name.strip(),
# "hashtags": f"#{hashtags}",
# }
#
# # Fetch views data from TikTok API using tiktok_api.py
# views_all, views = get_hashtag_views(hashtags)
# yield {
# "brand_name": brand_name.strip(),
# "hashtags": f"#{hashtags}",
# "views_all": views_all,
# "views": views,
# }
def get_hashtag_views(hashtag):
url = f'https://ads.tiktok.com/creative_radar_api/v1/popular_trend/hashtag/detail?period=7&hashtag_name={hashtag}&country_code=IS'
headers = {
# Add the headers from the CURL request here
}
response = requests.get(url, headers=headers)
data = response.json()
return data.get('hashtag', {}).get('video_views_all', 0), data.get('hashtag', {}).get('video_views', 0)

View File

@ -0,0 +1,103 @@
import scrapy
import psycopg2
import logging
import time
import random
config = {
"db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
"db_port": "5432",
"db": "analytics",
"db_user": "dbadmin",
"db_pass": "5qCif6eyY3Kmg4z"
}
class TiktokHashtag(scrapy.Spider):
name = 'tiktok_hashtag'
start_urls = ['https://ads.tiktok.com/business/creativecenter/hashtag/beautyofjoseon/pc/en?countryCode=ID&period=7']
def start_requests(self):
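# Build candidate hashtag slugs from the brand names already stored in oliveyoung_products and
# request the TikTok Creative Center page for each one.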
conn = psycopg2.connect(database=config.get('db'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
logging.info(conn)
conn.autocommit = True
cur = conn.cursor()
sql = f"""
select distinct product_brand, hashtag from (
select distinct product_brand, replace(regexp_replace(lower(product_brand), '[^\w]+','','g'),' ','') hashtag
from raena_spider_management.oliveyoung_products
union
select distinct product_brand, replace(regexp_replace(lower(product_brand), '[^\w]+',' ','g'),' ','_') hashtag
from raena_spider_management.oliveyoung_products) a
order by product_brand
"""
logging.info(sql)
cur.execute(sql)
brands = cur.fetchall()
logging.info(brands)
for brand in brands:
url_hashtag = "https://ads.tiktok.com/business/creativecenter/hashtag/"+brand[1]+"/pc/en?countryCode=ID&period=7"
yield scrapy.Request(url_hashtag, self.get_hashtag_info, meta={'meta': brand})
time.sleep(random.randint(10,20))
def get_hashtag_info(self, response):
logging.info("Collecting hashTag info")
conn = psycopg2.connect(database=config.get('db'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
conn.autocommit = True
cur = conn.cursor()
brand = response.meta.get('meta')
post_last7days = "0"
post_overall = "0"
view_last7days = "0"
view_overall = "0"
try:
post_last7days = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[1]/div/div[1]/span[1]/text()').get()
post_overall = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[1]/div/div[3]/span[1]/text()').get()
view_last7days = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[2]/div/div[1]/span[1]/text()').get()
view_overall = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[2]/div/div[3]/span[1]/text()').get()
except:
pass
sql = f"""
select product_brand,brand_hashtag from raena_spider_management.oliveyoung_brand_hashtag
where product_brand='{brand[0]}' and brand_hashtag='{brand[1]}'
"""
cur.execute(sql)
res = cur.fetchone()
if res:
sql = f"""
update raena_spider_management.oliveyoung_brand_hashtag set posts='{post_last7days}', posts_total='{post_overall}',
views='{view_last7days}', views_overall='{view_overall}', updatedat=now()
where product_brand='{brand[0]}' and brand_hashtag='{brand[1]}'
"""
cur.execute(sql)
else:
sql = f"""
insert into raena_spider_management.oliveyoung_brand_hashtag(product_brand,brand_hashtag,posts,posts_total,views,views_overall,createdat,updatedat)
values('{brand[0]}','{brand[1]}','{post_last7days}','{post_overall}','{view_last7days}','{view_overall}',now(),now())
"""
cur.execute(sql)
conn.close()

View File

@ -0,0 +1,76 @@
***Run:***
1. Change config according to the crawler type.
2. Run "python shopee_crawler.py"
***Config for Master:***
config = {
"crawler_name": "raena_crawler_enginer_shopee",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"product_per_category": "136",
"source_category": "11043145",
"db_user": "crawler",
"db_pass": "4Z063Zp9Aczv",
"database": "raena_db",
"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
"db_port": "5432",
"crawler_main": "1",
"crawler_slave_no": ""
}
***Config for Slave01:***
config = {
"crawler_name": "raena_crawler_enginer_shopee",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"product_per_category": "136",
"source_category": "11043145",
"db_user": "crawler",
"db_pass": "4Z063Zp9Aczv",
"database": "raena_db",
"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
"db_port": "5432",
"crawler_main": "0",
"crawler_slave_no": "1"
}
***Config for Slave02:***
config = {
"crawler_name": "raena_crawler_enginer_shopee",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"product_per_category": "136",
"source_category": "11043145",
"db_user": "crawler",
"db_pass": "4Z063Zp9Aczv",
"database": "raena_db",
"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
"db_port": "5432",
"crawler_main": "0",
"crawler_slave_no": "2"
}
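***How master and slaves coordinate:***
All three engines share the tracker table (crawler_schema.tracker_tab). The master writes rows with crawler_name='flag' and keywords <crawler_name>_master / _slave01 / _slave02, advancing its own flag from 0 to 3 as it finishes sub-categories, category product URLs and product details, and setting the slave flags to 1 once the category product URLs are queued. Each slave waits for its flag to become 1, collects product details, then sets its flag to 2; when both slaves report 2 the master resets itself to 0 and starts a new cycle. The role itself is picked from the two config keys shown above (sketch adapted from main() in shopee_crawler.py):
crawler_main = int(config.get('crawler_main'))
crawler_slave_no = int(config.get('crawler_slave_no')) if config.get('crawler_slave_no') else None
if crawler_main:
    crawler_master()      # full pipeline + flag bookkeeping
elif crawler_slave_no == 1:
    crawler_slave1()      # waits for flag=1, then collects product info
elif crawler_slave_no == 2:
    crawler_slave2()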

25
shopee_crawler_engine/conf.json Executable file
View File

@ -0,0 +1,25 @@
{
"crawler_name": "raena_crawler_enginer_shopee",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
"review_producttags_tab": "rce_ratings_reviews_producttags",
"review_tags": "rce_tags",
"source_tab": "rce_source",
"product_per_category": "136",
"source_category": "11043145",
"db_user": "crawler",
"db_pass": "4Z063Zp9Aczv",
"database": "raena_db",
"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
"db_port": "5432",
"crawler_main": "1",
"crawler_slave_no": ""
}

View File

@ -0,0 +1,147 @@
alembic==1.9.3
anyio==3.6.2
apache-airflow==2.5.1
apache-airflow-providers-amazon==7.2.0
apache-airflow-providers-common-sql==1.3.3
apache-airflow-providers-ftp==3.3.1
apache-airflow-providers-http==4.1.1
apache-airflow-providers-imap==3.1.1
apache-airflow-providers-sqlite==3.3.1
apispec==3.3.2
argcomplete==1.12.3
asn1crypto==1.5.1
attrs==22.2.0
Babel==2.11.0
beautifulsoup4==4.11.2
blinker==1.5
boto3==1.26.69
botocore==1.29.69
cached-property==1.5.2
cachelib==0.9.0
cattrs==22.2.0
certifi==2022.12.7
cffi==1.15.1
chardet==3.0.4
charset-normalizer==3.0.1
click==8.1.3
clickclick==20.10.2
colorama==0.4.6
colorlog==4.0.2
configparser==3.5.3
ConfigUpdater==3.1.1
connexion==2.14.2
cron-descriptor==1.2.35
croniter==0.3.37
cryptography==39.0.1
decorator==5.1.1
defusedxml==0.7.1
Deprecated==1.2.13
dill==0.3.6
dnspython==2.3.0
docutils==0.19
email-validator==1.3.1
exceptiongroup==1.1.0
Flask==2.2.2
Flask-Admin==1.5.4
Flask-AppBuilder==4.1.4
Flask-Babel==1.0.0
Flask-Caching==2.0.2
Flask-JWT-Extended==4.4.4
Flask-Login==0.6.2
Flask-OpenID==1.3.0
Flask-Session==0.4.0
Flask-SQLAlchemy==2.5.1
flask-swagger==0.2.14
Flask-WTF==1.1.1
funcsigs==1.0.2
future==0.18.3
graphviz==0.20.1
greenlet==2.0.2
gunicorn==20.1.0
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
idna==2.10
importlib-resources==1.5.0
inflection==0.5.1
iso8601==1.1.0
itsdangerous==2.1.2
Jinja2==3.1.2
jmespath==0.10.0
json-merge-patch==0.2
jsonpath-ng==1.5.3
jsonschema==3.2.0
lazy-object-proxy==1.4.3
linkify-it-py==2.0.0
lockfile==0.12.2
lxml==4.9.2
Mako==1.2.4
Markdown==3.4.1
markdown-it-py==2.1.0
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
marshmallow-oneofschema==3.0.1
marshmallow-sqlalchemy==0.23.1
mdit-py-plugins==0.3.3
mdurl==0.1.2
mypy-boto3-appflow==1.26.53
mypy-boto3-rds==1.26.47
mypy-boto3-redshift-data==1.26.30
natsort==8.2.0
numpy==1.24.2
packaging==23.0
pandas==1.5.3
pathspec==0.9.0
pendulum==2.1.2
piapy==0.2.0
pluggy==1.0.0
ply==3.11
prison==0.2.1
protobuf==4.21.12
psutil==5.9.4
pycparser==2.21
Pygments==2.14.0
PyJWT==2.6.0
pyrsistent==0.19.3
python-daemon==2.3.2
python-dateutil==2.8.2
python-dotenv==0.21.1
python-nvd3==0.15.0
python-slugify==8.0.0
python3-openid==3.2.0
pytz==2022.7.1
pytzdata==2020.1
PyYAML==6.0
redshift-connector==2.0.910
requests==2.28.2
requests-toolbelt==0.10.1
rfc3986==1.5.0
rich==13.3.1
s3transfer==0.6.0
scramp==1.4.4
setproctitle==1.3.2
six==1.16.0
sniffio==1.3.0
soupsieve==2.3.2.post1
SQLAlchemy==1.4.9
SQLAlchemy-JSONField==1.0.1.post0
sqlalchemy-redshift==0.8.12
SQLAlchemy-Utils==0.40.0
sqlparse==0.4.3
swagger-ui-bundle==0.0.9
tabulate==0.8.10
tenacity==8.2.1
termcolor==2.2.0
text-unidecode==1.3
thrift==0.16.0
typing_extensions==4.4.0
tzlocal==1.5.1
uc-micro-py==1.0.1
unicodecsv==0.14.1
urllib3==1.25.11
watchtower==2.0.1
Werkzeug==2.2.2
wrapt==1.14.1
WTForms==2.3.3
zope.deprecation==4.4.0

View File

@ -0,0 +1,177 @@
import hashlib
import logging
from selenium import webdriver
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
import psycopg2
from selenium.webdriver.common.by import By
import bs4
from webdriver_manager.chrome import ChromeDriverManager
import random
from bs4 import BeautifulSoup
import json
import time
class shopee_category_products:
def __init__(self, config):
self.config = config
self.crawler_name = self.config.get("crawler_name")
self.url = "https://shopee.co.id/"
self.product_limit = int(self.config.get("product_per_category"))
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
sql = "delete from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='"+str(self.crawler_name)+"'"
self.cur.execute(sql)
def __del__(self):
print("Closing connection.....")
self.conn.close()
def browse_category_page(self):
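# Open shopee.co.id, open the target category from the home-page menu, sort the listing
# by 'Terlaris' (top sales), and page through it; every page source is handed to
# get_product(), first for the main category and then for each expanded sub-category.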
op = webdriver.ChromeOptions()
height = str(random.randint(640,1280))
width = str(random.randint(1024,1920))
op.add_argument("window-size="+width+","+height+"")
op.add_experimental_option("useAutomationExtension", False)
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.add_argument("--lang=en-GB")
op.headless = True
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
driver.get("https://shopee.co.id")
time.sleep(5)
cat = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div[3]/div[2]/div[1]/div/div/div[2]/div/div[1]/ul/li[2]/div/a[2]/div')
ActionChains(driver).move_to_element(cat).double_click().perform()
time.sleep(10)
driver.execute_script("document.body.style.zoom='15%'")
time.sleep(10)
filters = driver.find_elements(By.CLASS_NAME, 'shopee-sort-by-options__option')
for filter in filters:
if filter.text == 'Terlaris':
logging.info("Sorting data by top sales.......")
driver.execute_script("arguments[0].click();", filter)
time.sleep(5)
lim = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div[1]/div[4]/div[2]/div/div[1]/div[2]/div/span[2]').text
cat = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div[1]/div[4]/div[1]/div[1]/div/div/div[1]/a').text
print("Collecting products for category: {}".format(str(cat)))
pg_cnt = 1
print("Collecting data for page: {}".format(str(pg_cnt)))
cnt = 0
skip = 0
cnt, skip = self.get_product(driver.page_source, cat, cnt, skip)
for i in range(int(lim)-1):
pg_cnt += 1
next = driver.find_element(By.XPATH,'/html/body/div[1]/div/div[2]/div/div[1]/div[4]/div[2]/div/div[1]/div[2]/button[2]')
driver.execute_script("arguments[0].click();", next)
time.sleep(5)
print("Collecting data for page: {}".format(str(pg_cnt)))
cnt, skip = self.get_product(driver.page_source, cat, cnt, skip)
if cnt >=self.product_limit:
break
more_cat = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div[4]/div[1]/div[1]/div/div/div[2]/div/div[1]/div')
driver.execute_script("arguments[0].click();", more_cat)
time.sleep(10)
elements = driver.find_elements(By.CLASS_NAME, 'shopee-category-list__sub-category')
for element in elements:
driver.execute_script("arguments[0].click();", element)
time.sleep(5)
filters = driver.find_elements(By.CLASS_NAME, 'shopee-sort-by-options__option')
for filter in filters:
if filter.text == 'Terlaris':
logging.info("Sorting data by top sales.......")
driver.execute_script("arguments[0].click();", filter)
time.sleep(5)
lim = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div[3]/div[2]/div/div[1]/div[2]/div/span[2]').text
print("Collecting products for subcategory: {}".format(str(element.text)))
pg_cnt = 1
print("Collecting data for page: {}".format(str(pg_cnt)))
cnt = 0
skip = 0
cnt, skip = self.get_product(driver.page_source, element.text, cnt, skip)
for i in range(int(lim)-1):
pg_cnt += 1
next = driver.find_element(By.XPATH,'/html/body/div[1]/div/div[2]/div/div/div[3]/div[2]/div/div[1]/div[2]/button[2]')
driver.execute_script("arguments[0].click();", next)
time.sleep(5)
print("Collecting data for page: {}".format(str(pg_cnt)))
cnt, skip = self.get_product(driver.page_source, element.text, cnt, skip)
if cnt >=self.product_limit:
break
time.sleep(random.randint(20,35))
def get_product(self, page_source, cat, cnt_main, skip_main):
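# Parse the rendered search results with BeautifulSoup, derive shopid/itemid from each
# product URL ("...-i.<shopid>.<itemid>"), and insert one tracker row per product that is
# not already present, stopping once product_per_category items have been queued.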
try:
#Fetch page source
data = page_source
time.sleep(5)
#Fetch data from page source
try:
soup = bs4.BeautifulSoup(data,features="lxml")
all_product = soup.find_all('div',{'class':"col-xs-2-4 shopee-search-item-result__item"})
cnt = cnt_main
skip = skip_main
for product in all_product:
try:
product_link_element = product.find('a')
product_page_url = product_link_element.get('href')
product_page_url = ("https://shopee.co.id"+product_page_url).replace("'","''")
product_page_url_hash = hashlib.md5(product_page_url.encode('utf-8')).hexdigest()
ids = ((product_page_url.split('-i.')[1]).split('?')[0]).split('.')
itemid = ids[1]
shopid = ids[0]
flag = 0
#print("itemid: {}; shopid: {}".format(str(itemid), str(shopid)))
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where itemid='"+itemid+"' and shopid='"+shopid+"'"
self.cur.execute(sql)
res = self.cur.fetchall()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+"(crawler_name,keyword,shopid,itemid,product_page_url,product_page_url_hash,flag) values('"+str(self.crawler_name)+"','"+str(cat)+"',"+str(shopid)+","+str(itemid)+",'"+product_page_url+"','"+product_page_url_hash+"',"+str(flag)+")"
self.cur.execute(sql)
cnt += 1
if cnt >=self.product_limit:
break
#conn.commit()
else:
#print("Already collected. Skipping")
skip += 1
except Exception as e:
print("ERROR: {}".format(str(e)))
print("Total Items: {}\nTotal Collected: {}\nTotal Skipped: {}".format(str(len(all_product)),str(cnt), str(skip)))
return cnt, skip
except Exception as e:
print("Error: {}".format(str(e)))
except:
print("ERROR: Data cannot be collected.")

View File

@ -0,0 +1,213 @@
from shopee_sub_categories import shopee_sub_categories
from shopee_category_products import shopee_category_products
from shopee_products import shopee_products
import logging
import psycopg2
import json
###### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
config = {}
def get_sub_category():
sub_cat = shopee_sub_categories(config)
sub_cat.get_sub_categories()
def get_category_products(cur, slave01, slave02):
products = shopee_category_products(config)
products.browse_category_page()
if not slave01:
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave01',1)"
cur.execute(sql)
else:
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=1 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
cur.execute(sql)
if not slave02:
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave02',1)"
cur.execute(sql)
else:
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=1 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
cur.execute(sql)
def get_products_info():
product_info = shopee_products(config)
product_info.get_shopee_products()
def main():
crawler_main = int(config.get('crawler_main'))
crawler_slave_no = int(config.get('crawler_slave_no')) if config.get('crawler_slave_no') else None
if crawler_main:
crawler_master()
else:
if crawler_slave_no == 1:
crawler_slave1()
elif crawler_slave_no ==2:
crawler_slave2()
def crawler_master():
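# Flag protocol (tracker rows with crawler_name='flag'): master 0 = fresh run,
# 1 = sub-categories stored, 2 = category product URLs queued (slaves flagged to start),
# 3 = product details collected; slave 0 = idle, 1 = start collecting product details,
# 2 = finished. When the master reads 3 and both slaves report 2, it resets its own flag
# to 0 and reruns main().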
conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
conn.autocommit = True
cur = conn.cursor()
sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
res = cur.fetchone()
sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
cur.execute(sql)
slave01 = cur.fetchone()
sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
cur.execute(sql)
slave02 = cur.fetchone()
if not res:
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_master',0)"
cur.execute(sql)
if not slave01:
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave01',0)"
cur.execute(sql)
else:
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
cur.execute(sql)
if not slave02:
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave02',0)"
cur.execute(sql)
else:
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
cur.execute(sql)
get_sub_category()
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=1 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
get_category_products(cur, slave01, slave02)
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
get_products_info()
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
else:
if res[2]==0:
if not slave01:
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave01',0)"
cur.execute(sql)
else:
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
cur.execute(sql)
if not slave02:
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave02',0)"
cur.execute(sql)
else:
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
cur.execute(sql)
get_sub_category()
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=1 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
get_category_products(cur, slave01, slave02)
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
get_products_info()
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
elif res[2]==1:
get_category_products(cur, slave01, slave02)
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
get_products_info()
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
elif res[2]==2:
get_products_info()
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
elif res[2]==3:
if slave01[2]==2 and slave02[2]==2:
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
cur.execute(sql)
main()
else:
logging.info("Slaves are working.....")
conn.close()
conn.close()
def crawler_slave1():
conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
conn.autocommit = True
cur = conn.cursor()
sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
cur.execute(sql)
res = cur.fetchone()
if res:
if res[2]==1:
get_products_info()
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
cur.execute(sql)
else:
logging.info("Slave02 or Master are working.....")
conn.close()
def crawler_slave2():
conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
conn.autocommit = True
cur = conn.cursor()
sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
cur.execute(sql)
res = cur.fetchone()
if res:
if res[2]==1:
get_products_info()
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
cur.execute(sql)
else:
logging.info("Slave01 or Master are working.....")
conn.close()
if __name__ == "__main__":
logging.info("Starting Shopee Crawler.......")
try:
logging.info("Loading config file.......")
with open("conf.json", "r") as jsonfile:
config = json.load(jsonfile)
logging.info("Config file loaded.......")
main()
except Exception as e:
#logging.info("Error: ".format(e))
logging.info("Cannot load cofig file. Please check. Exiting......")
exit(1)

View File

@ -0,0 +1,587 @@
import logging
import psycopg2
###### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
class shopee_db_writer:
def __init__(self, config):
self.config = config
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
def __del__(self):
logging.info("Closing connection.....")
self.conn.close()
def rce_category(self, data):
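# Upsert pattern used by every rce_* writer below: look the row up by its source id,
# insert it (or update it when any tracked column changed, otherwise just bump updatedat),
# and mirror the resulting row into the matching aud_* audit table.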
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where rce_source_category_id = "+str(data['rce_source_category_id'])
self.cur.execute(sql)
res = self.cur.fetchone()
cat_name = data['category_name'].replace("'","''")
cat_url = data['category_page_url'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" (parent_category_id,rce_source_id," \
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name) values (" \
+str(data['parent_category_id'])+","+str(data['rce_source_id'])+", "+str(data['rce_source_category_id'])+", "+str(data['rce_source_status'])+", " \
"'"+str(cat_url)+"', '"+str(data['category_page_url_hash'])+"', '"+str(cat_name)+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
"select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
"category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
"where rce_source_category_id = "+ str(data['rce_source_category_id'])
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \
str(data['category_page_url'])==str(res[5]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set updatedat=now() " \
"where rce_source_category_id = "+ str(res[3])
logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0])
logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set parent_category_id = " \
""+str(data['parent_category_id'])+", rce_source_category_id = "+str(data['rce_source_category_id'])+", " \
"category_name='"+str(cat_name)+"', category_page_url='"+str(cat_url)+"', " \
"category_page_url_hash='"+str(data['category_page_url_hash'])+"', updatedat=now() where " \
"rce_source_category_id = "+ str(res[3])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
"select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
"category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
"where rce_source_category_id = "+ str(res[3])
#logging.info(sql)
self.cur.execute(sql)
def rce_product(self, data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_id = "+str(data['rce_source_product_id'])
self.cur.execute(sql)
res = self.cur.fetchone()
data['product_page_url'] = data['product_page_url'].replace("'","''")
data['rce_source_product_name'] = data['rce_source_product_name'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" (rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"ships_from) values("+str(data['rce_source_product_id'])+","+str(data['rce_source_product_status'])+",'"+str(data['product_page_url'])+"'," \
"'"+str(data['product_page_url_hash'])+"',"+str(data['rce_category_id'])+","+str(data['rce_brand_id'])+","+str(data['rce_store_id'])+"," \
"'"+str(data['rce_source_product_name'])+"','"+str(data['product_images'])+"','"+str(data['product_description'])+"',"+str(data['product_sold_total'])+"," \
""+str(data['product_sold'])+",'"+str(data['product_price_min'])+"','"+str(data['product_price_min_before_discount'])+"','"+str(data['product_price_max'])+"'," \
"'"+str(data['product_price_max_before_discount'])+"','"+str(data['ratings'])+"','"+str(data['ships_from'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"ships_from,createdat,updatedat) select id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"ships_from,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
"rce_source_product_id="+str(data['rce_source_product_id'])+""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \
str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \
str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \
str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \
str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \
str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) and \
str(data['ships_from'])==str(res[18]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set updatedat=now() " \
"where rce_source_product_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set rce_source_product_id="+str(data['rce_source_product_id'])+"," \
"rce_source_product_status="+str(data['rce_source_product_status'])+",product_page_url='"+str(data['product_page_url'])+"',product_page_url_hash= " \
"'"+str(data['product_page_url_hash'])+"',rce_category_id="+str(data['rce_category_id'])+",rce_brand_id="+str(data['rce_brand_id'])+"," \
"rce_store_id="+str(data['rce_store_id'])+",rce_source_product_name='"+str(data['rce_source_product_name'])+"',product_images='"+str(data['product_images'])+"'" \
",product_description='"+str(data['product_description'])+"',product_sold_total="+str(data['product_sold_total'])+",product_sold="+str(data['product_sold'])+"," \
"product_price_min='"+str(data['product_price_min'])+"',product_price_min_before_discount='"+str(data['product_price_min_before_discount'])+"'," \
"product_price_max='"+str(data['product_price_max'])+"',product_price_max_before_discount='"+str(data['product_price_max_before_discount'])+"',ratings='"+str(data['ratings'])+"'," \
"ships_from='"+str(data['ships_from'])+"', updatedat=now() where rce_source_product_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"ships_from,createdat,updatedat) select id,rce_source_product_id," \
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
"ships_from,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
"rce_source_product_id="+str(res[1])+""
#logging.info(sql)
self.cur.execute(sql)
def rce_product_variant(self, data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where rce_source_variant_id = "+str(data['rce_source_variant_id'])
self.cur.execute(sql)
res = self.cur.fetchone()
data['product_variant_name'] = data['product_variant_name'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" (rce_source_variant_id,rce_product_id," \
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock) values("+str(data['rce_source_variant_id'])+"," \
""+str(data['rce_product_id'])+",'"+str(data['product_variant_name'])+"','"+str(data['product_variant_price'])+"'," \
"'"+str(data['product_variant_price_before_discount'])+"',"+str(data['product_variant_stock'])+")"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where rce_source_variant_id="+str(data['rce_source_variant_id'])+""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \
str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set updatedat=now() " \
"where rce_source_variant_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set rce_source_variant_id="+str(data['rce_source_variant_id'])+", " \
"rce_product_id="+str(data['rce_product_id'])+", product_variant_name='"+str(data['product_variant_name'])+"', product_variant_price=" \
"'"+str(data['product_variant_price'])+"',product_variant_price_before_discount='"+str(data['product_variant_price_before_discount'])+"'," \
"product_variant_stock="+str(data['product_variant_stock'])+", updatedat=now() where rce_source_variant_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where rce_source_variant_id="+str(res[1])+""
#logging.info(sql)
self.cur.execute(sql)
def rce_brand(self, data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id = "+str(data['rce_source_brand_id'])
self.cur.execute(sql)
res = self.cur.fetchone()
data['brand_page_url'] = data['brand_page_url'].replace("'","''")
data['brand_name'] = data['brand_name'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" (rce_source_id,rce_source_brand_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name) values("+str(data['rce_source_id'])+","+str(data['rce_source_brand_id'])+"," \
""+str(data['rce_source_brand_status'])+",'"+str(data['brand_page_url'])+"','"+str(data['brand_page_url_hash'])+"'," \
"'"+str(data['brand_name'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id="+str(data['rce_source_brand_id'])+""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_brand_id'])==str(res[2]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \
str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set updatedat=now() " \
"where rce_source_brand_id = "+ str(res[2])
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set rce_source_id="+str(data['rce_source_id'])+", rce_source_brand_id="+str(data['rce_source_brand_id'])+", " \
"rce_source_brand_status="+str(data['rce_source_brand_status'])+", brand_page_url='"+str(data['brand_page_url'])+"', brand_page_url_hash=" \
"'"+str(data['brand_page_url_hash'])+"',brand_name='"+str(data['brand_name'])+"', updatedat=now() where rce_source_brand_id = "+ str(res[2])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_id,rce_source_brand_status," \
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select * from " \
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id="+str(res[2])+""
#logging.info(sql)
self.cur.execute(sql)
def rce_reseller(self, data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id = "+str(data['rce_source_reseller_id'])
self.cur.execute(sql)
res = self.cur.fetchone()
data['reseller_name'] = data['reseller_name'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" (rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate) values("+str(data['rce_source_id'])+","+str(data['rce_source_reseller_id'])+"," \
""+str(data['rce_source_reseller_status'])+",'"+str(data['reseller_name'])+"','"+str(data['reseller_average_rating'])+"'," \
""+str(data['reseller_follower_count'])+",'"+str(data['reseller_response_rate'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat) select id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id="+str(data['rce_source_reseller_id'])+""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_reseller_id'])==str(res[2]) and str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]) and \
str(data['reseller_average_rating'])==str(res[5]) and str(data['reseller_follower_count'])==str(res[7]) and str(data['reseller_response_rate'])==str(res[8]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=now() " \
"where rce_source_reseller_id = "+ str(res[2])
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+",rce_source_reseller_id="+str(data['rce_source_reseller_id'])+", " \
"rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \
"'"+str(data['reseller_average_rating'])+"',reseller_follower_count='"+str(data['reseller_follower_count'])+"', reseller_response_rate=" \
"'"+str(data['reseller_response_rate'])+"', updatedat=now() where rce_source_reseller_id = "+ str(res[2])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat) select id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id="+str(res[2])
#logging.info(sql)
self.cur.execute(sql)
def rce_reseller_store(self, data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id = "+str(data['rce_source_store_id'])
self.cur.execute(sql)
res = self.cur.fetchone()
data['store_page_url'] = data['store_page_url'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" (rce_source_store_id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id) values("+str(data['rce_source_store_id'])+"," \
""+str(data['rce_source_store_status'])+",'"+str(data['store_page_url'])+"','"+str(data['store_page_url_hash'])+"'," \
"'"+str(data['store_location'])+"', "+str(data['rce_reseller_id'])+")"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat) select id,rce_source_store_id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id="+str(data['rce_source_store_id'])+""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_store_id'])==str(res[1]) and str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \
str(data['store_page_url_hash'])==str(res[4]) and str(data['store_location'])==str(res[5]) and str(data['rce_reseller_id'])==str(res[6]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=now() " \
"where rce_source_store_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set rce_source_store_id="+str(data['rce_source_store_id'])+", " \
"rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \
"'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \
"updatedat=now() where rce_source_store_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat) select id,rce_source_store_id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id="+str(res[1])+""
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews(self, data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id = "+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
data['username'] = data['username'].replace("'","''")
data['img_url'] = data['img_url'].replace("'","''")
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" (id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating) values("+str(data['id'])+","+str(data['rce_product_id'])+"," \
"'"+str(data['username'])+"','"+str(data['review'])+"','"+str(data['img_url'])+"',"+str(data['review_like_count'])+",'"+str(data['user_tier'])+"'," \
""+str(data['shop_id'])+", '"+str(data['video_url'])+"', '"+str(data['rating'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \
str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \
str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=now() " \
"where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \
"username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \
"'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \
"shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=now() " \
"where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(res[1])+" and username ='"+str(data['username'])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews_productmodels(self,data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id'])
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \
"values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=now() " \
"where rce_rating_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \
"updatedat=now() where rce_source_store_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+""
#logging.info(sql)
self.cur.execute(sql)
def rce_tags(self,data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \
"values("+str(data['id'])+",'"+str(data['description'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
"createdat,updatedat) select id,description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['description'])==str(res[1]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=now() " \
"where description = '"+ str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \
"updatedat=now() where description = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
"createdat,updatedat) select id,description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews_producttags(self,data):
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
if not res:
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \
"values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
"createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_rating_id'])==str(res[1]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=now() " \
"where rce_rating_id = '"+ str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \
"updatedat=now() where rce_rating_id = "+ str(res[1])
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
"createdat,updatedat) select id,description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'"
#logging.info(sql)
self.cur.execute(sql)

View File

@ -0,0 +1,540 @@
import hashlib
import logging
import sys
from selenium.webdriver.remote.remote_connection import LOGGER
LOGGER.setLevel(logging.WARNING)
import string
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
import psycopg2
import bs4
from webdriver_manager.chrome import ChromeDriverManager
import random
from bs4 import BeautifulSoup
import json
import time
import gzip
import re
from shopee_db_writer import shopee_db_writer
class shopee_products:
def __init__(self, config):
self.config = config
self.crawler_name = self.config.get("crawler_name")
self.pattern = r'[' + string.punctuation + ']'
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Shopee'")
self.rce_source_id = self.cur.fetchone()[0]
self.db_writer = shopee_db_writer(config)
def __del__(self):
print("Closing connection.....")
self.conn.close()
def get_raw_product(self, url):
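        # Loads the product page in a headless Chrome session with a randomized window size,
        # then scans the selenium-wire request log for Shopee's item, shop-info and ratings
        # API responses, gunzipping each body when the response is content-encoded.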
op = webdriver.ChromeOptions()
        height = str(random.randint(640,1280))
        width = str(random.randint(1024,1920))
        op.add_argument("window-size="+width+","+height+"")
op.add_experimental_option("useAutomationExtension", False)
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.add_argument("--lang=en-GB")
op.add_argument("--log-level=3")
op.headless = True
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
driver.get(url)
time.sleep(10)
iteminfo = ""
shopinfo = ""
ratinginfo = ""
try:
for request in driver.requests:
if request.response:
if '/api/v4/item/get?item' in request.url:
encoding = request.response.headers.get('content-encoding')
if encoding:
iteminfo = gzip.decompress(request.response.body).decode()
else:
iteminfo = request.response.body
if '/api/v4/product/get_shop_info?shopid' in request.url:
encoding = request.response.headers.get('content-encoding')
if encoding:
shopinfo = gzip.decompress(request.response.body).decode()
else:
shopinfo = request.response.body
                    if '/api/v2/item/get_ratings' in request.url:
                        encoding = request.response.headers.get('content-encoding')
                        if encoding:
                            ratinginfo = gzip.decompress(request.response.body).decode()
                        else:
                            ratinginfo = request.response.body
except:
pass
driver.close()
return iteminfo, shopinfo, ratinginfo
def product_info(self, data_item, item):
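        # Builds brand, product and variant records from the item API payload and hands each
        # one to the db_writer; lookups against the category, brand and store tables resolve
        # the internal foreign keys first.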
### rce_brand
data_brand = {}
data_brand['rce_source_id'] = self.rce_source_id
data_brand['rce_source_brand_id'] = ""
data_brand['rce_source_brand_status'] = 1
data_brand['brand_page_url'] = ""
data_brand['brand_page_url_hash'] = ""
data_brand['brand_name'] = ""
try:
data_brand['rce_source_brand_id'] = data_item['data']['brand_id']
data_brand['brand_page_url'] = "https://shopee.co.id/search?brands=" + str(data_item['data']['brand_id'])
data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest()
try:
brand_name = data_item['data']['brand']
data_brand['brand_name'] = re.sub(self.pattern, '', brand_name)
except: pass
self.db_writer.rce_brand(data_brand)
except: pass
### rce_product
data_product = {}
data_product['rce_source_product_id'] = item[3] #itemid
data_product['rce_source_product_status'] = 1
data_product['product_page_url'] = item[4] #product page url
data_product['product_page_url_hash'] = item[5] #product page url hash
data_product['rce_category_id'] = ""
data_product['rce_brand_id'] = ""
data_product['rce_store_id'] = ""
data_product['rce_source_product_name'] = ""
data_product['product_images'] = ""
data_product['product_description'] = ""
data_product['product_sold_total'] = ""
data_product['product_sold'] = ""
data_product['product_price_min'] = ""
data_product['product_price_min_before_discount'] =""
data_product['product_price_max'] = ""
data_product['product_price_max_before_discount'] = ""
data_product['ratings'] = ""
data_product['ships_from'] = ""
try:
keyword = item[1]
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where lower(category_name) = lower('"+keyword+"')"
self.cur.execute(sql)
data_product['rce_category_id'] = self.cur.fetchone()[0]
except: pass
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id = "+str(data_brand['rce_source_brand_id'])
self.cur.execute(sql)
data_product['rce_brand_id'] = self.cur.fetchone()[0]
except: pass
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id = "+str(item[2])+""
self.cur.execute(sql)
data_product['rce_store_id'] = self.cur.fetchone()[0]
except: pass
try:
rce_source_product_name = data_item['data']['name']
data_product['rce_source_product_name'] = str(re.sub(self.pattern, '', rce_source_product_name))
except: pass
try:
product_images = str(data_item["data"]["images"])
data_product['product_images'] = str(product_images.replace("'",""))
except: pass
try:
product_description = str(data_item["data"]["description"])
data_product['product_description'] = str(re.sub(self.pattern, '', product_description))
except: pass
try: data_product['product_sold_total'] = str(data_item["data"]["historical_sold"])
except: pass
try: data_product['product_sold'] = str(data_item["data"]["sold"])
except: pass
try: data_product['product_price_min'] = str(data_item["data"]["price_min"])
except: pass
try: data_product['product_price_min_before_discount'] = str(data_item["data"]["price_min_before_discount"])
except: pass
try: data_product['product_price_max'] = str(data_item["data"]["price_max"])
except: pass
try: data_product['product_price_max_before_discount'] = str(data_item["data"]["price_max_before_discount"])
except: pass
try: data_product['ratings'] = str(data_item["data"]["item_rating"]["rating_star"])
except: pass
try: data_product['ships_from'] = str(data_item["data"]["shop_location"])
except: pass
self.db_writer.rce_product(data_product)
### rce_product_variant
data_variant = {}
data_variant['rce_source_variant_id'] = ""
data_variant['rce_product_id'] = ""
data_variant['product_variant_name'] = ""
data_variant['product_variant_price'] = ""
data_variant['product_variant_price_before_discount'] = ""
data_variant['product_variant_stock'] = ""
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_id = "+str(data_product['rce_source_product_id'])
self.cur.execute(sql)
data_variant['rce_product_id'] = self.cur.fetchone()[0]
except:
pass
try:
MODELS = data_item["data"]["models"]
for i in MODELS:
try:
data_variant['rce_source_variant_id'] = str(i["modelid"])
try:
product_variant_name = str(i["name"])
data_variant['product_variant_name'] = re.sub(self.pattern, '', product_variant_name)
except: pass
try: data_variant['product_variant_price'] = str(i["price"])
except: pass
try: data_variant['product_variant_price_before_discount'] = str(i["price_before_discount"])
except: pass
try: data_variant['product_variant_stock'] = str(i["stock"])
except: pass
self.db_writer.rce_product_variant(data_variant)
except: pass
except: pass
def reseller_info(self, data_shop, item):
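        # Extracts reseller and reseller-store details from the shop-info API payload, writes
        # the reseller first, then links the store row to the stored reseller id.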
data_reseller = {}
data_reseller['rce_source_id'] = self.rce_source_id
data_reseller['rce_source_reseller_id'] = ""
data_reseller['rce_source_reseller_status'] = 1
data_reseller['reseller_name'] = ""
data_reseller['reseller_average_rating'] = ""
data_reseller['reseller_follower_count'] = ""
data_reseller['reseller_response_rate'] = ""
try:
data_reseller['rce_source_reseller_id'] = str(data_shop["data"]["userid"])
except: pass
try:
reseller_name = str(data_shop["data"]["name"])
data_reseller['reseller_name'] = re.sub(self.pattern, '', reseller_name)
except: pass
try: data_reseller['reseller_average_rating'] = str(data_shop["data"]["rating_star"])
except: pass
try: data_reseller['reseller_follower_count'] = str(data_shop["data"]["follower_count"])
except: pass
try: data_reseller['reseller_response_rate'] = str(data_shop["data"]["response_rate"])
except: pass
self.db_writer.rce_reseller(data_reseller)
data_reseller_store = {}
data_reseller_store['rce_source_store_id'] = item[2]
data_reseller_store['rce_source_store_status'] = 1
data_reseller_store['store_page_url'] = ""
data_reseller_store['store_page_url_hash'] = ""
data_reseller_store['store_location'] = ""
data_reseller_store['rce_reseller_id'] = ""
try:
username = str(data_shop["data"]["account"]["username"])
data_reseller_store['store_page_url'] = "https://shopee.co.id/"+username
data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest()
except:
pass
try: data_reseller_store['store_location'] = str(data_shop["data"]["shop_location"])
except: pass
try:
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id = "+str(data_reseller['rce_source_reseller_id']))
rce_reseller_id = self.cur.fetchone()
data_reseller_store['rce_reseller_id'] = rce_reseller_id[0]
except:
pass
self.db_writer.rce_reseller_store(data_reseller_store)
def rating_info(self, data_rating, item):
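        # Walks every rating in the ratings API payload: writes the review itself, then the
        # product models it refers to, then any review tags together with a tag-id mapping row.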
sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')
self.cur.execute(sql)
rating_id = self.cur.fetchone()
if rating_id[0]==None:
rating_id = 1
else:
rating_id = int(rating_id[0]) + 1
for data in data_rating['data']['ratings']:
data_review = {}
data_review["id"] = rating_id
data_review["rce_product_id"] = ""
data_review["username"] = ""
data_review["review"] = ""
data_review["img_url"] = ""
data_review["review_like_count"] = ""
data_review["user_tier"] = ""
data_review["shop_id"] = item[2]
data_review["video_url"] = ""
data_review["rating"] = ""
try:
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_id = "+str(item[3])
self.cur.execute(sql)
data_review["rce_product_id"] = self.cur.fetchone()[0]
except: pass
try: data_review["username"] = str(data['author_username'])
except: pass
try:
review = str(data['comment'])
review = review.replace(",", " ")
review = review.replace("'", " ")
comments = list(review.split("\n"))
for comment_items in range(len(comments)):
temp_comment = re.sub('[^a-zA-Z0-9\: ]([a-zA-Z\:]+)?\s{0,2}[^a-zA-Z0-9\: ]?', ' ', comments[comment_items])
if not re.match('[A-Za-z0-9\s*]*\s*(\:)\s*[A-Za-z0-9\s*]*', temp_comment):
data_review["review"] = data_review["review"] + (comments[comment_items])
except: pass
try: data_review["img_url"] = str(data['images']).replace("'","").replace("[","").replace("]","")
except: pass
try:
if data['like_count']:
data_review["review_like_count"] = str(data['like_count'])
else:
data_review["review_like_count"]=0
except: pass
try: data_review["user_tier"] = str(data['loyalty_info']['tier_text'])
except: pass
try:
rce_video_url = []
for urls in data["videos"]:
rce_video_url.append(urls["url"])
data_review["video_url"] = str(",".join(rce_video_url))
except: pass
try: data_review["rating"] = str(data['rating_star'])
except: pass
self.db_writer.rce_ratings_reviews(data_review)
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where id="+str(data_review['id'])
self.cur.execute(sql)
res = self.cur.fetchall()
if res:
data_review_product_model = {}
data_review_product_model["rce_rating_id"] = rating_id
data_review_product_model["model_id"] = ""
try:
product_models = []
for models in data["product_items"]:
product_models.append(models["modelid"])
data_review_product_model["model_id"] = str(product_models).replace("[","").replace("]","")
self.db_writer.rce_ratings_reviews_productmodels(data_review_product_model)
except: pass
if data['tags']:
rce_tags_list = []
for tags in data["tags"]:
sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')
self.cur.execute(sql)
tag_id = self.cur.fetchone()
if tag_id[0]==None:
tag_id = 1
else:
tag_id = int(tag_id[0]) + 1
data_tags = {}
data_tags['id'] = tag_id
data_tags['description'] = tags["tag_description"]
self.db_writer.rce_tags(data_tags)
rce_tags_list.append(tags["tag_description"])
rce_tags_list = str(rce_tags_list).replace('[','').replace(']','')
tags_id_query = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description in (" + str(rce_tags_list) + ")"
self.cur.execute(tags_id_query)
tags_id_query = self.cur.fetchall()
rce_tag_ids = str(tags_id_query)
rce_tag_ids = rce_tag_ids.replace("[", "")
rce_tag_ids = rce_tag_ids.replace("]", "")
rce_tag_ids = rce_tag_ids.replace("(", "")
rce_tag_ids = rce_tag_ids.replace(")", "")
rce_tag_ids = rce_tag_ids.replace(",,", ",")
rce_tag_ids = rce_tag_ids.rstrip(",")
data_review_product_tags = {}
data_review_product_tags['rce_rating_id'] = rating_id
data_review_product_tags['tag_ids'] = rce_tag_ids
self.db_writer.rce_ratings_reviews_producttags(data_review_product_tags)
def get_shopee_products(self):
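        # Master and slave crawlers split the tracker rows by category keyword: the master
        # handles one keyword set while slave 1 and slave 2 each take their own set.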
crawler_main = int(self.config.get('crawler_main'))
crawler_slave_no = int(self.config.get('crawler_slave_no')) if self.config.get('crawler_slave_no') else None
sql = None
if crawler_main:
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where flag=0 and crawler_name='"+self.config.get('crawler_name')+"' " \
"and keyword in ('Perawatan & Kecantikan','Perawatan Tubuh','Perawatan Tangan','Perawatan Kaki','Perawatan Kuku','Perawatan Rambut','Perawatan Pria'," \
"'Parfum & Wewangian') order by id"
else:
if crawler_slave_no == 1:
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where flag=0 and crawler_name='"+self.config.get('crawler_name')+"' " \
"and keyword in ('Kosmetik Wajah','Kosmetik Mata','Kosmetik Bibir','Pembersih Make Up','Aksesoris Make Up','Alat Perawatan Wajah','Alat Pelangsing Tubuh') order by id"
elif crawler_slave_no ==2:
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where flag=0 and crawler_name='"+self.config.get('crawler_name')+"' " \
"and keyword in ('Alat Penghilang Bulu Rambut','Alat Rambut','Perawatan Wajah','Treatment Mata','Treatment Bibir','Paket & Set Kecantikan','Kecantikan Lainnya') order by id"
if sql:
self.cur.execute(sql)
items = self.cur.fetchall()
logging.info("Total Item found: {}".format(str(len(items))))
for item in items:
self.crawl_shopee_products(item)
time.sleep(random.randint(15,25))
else:
logging.info("SQL not generated. Please check if Master or Slaves are working correctly.")
sys.exit(1)
def crawl_shopee_products(self,item, flag=0):
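        # Fetches the raw item/shop/rating payloads for one tracker row; if any payload comes
        # back empty it retries exactly once (flag=1), then marks the tracker row as processed.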
logging.info("Collecting info for itemid="+str(item[3])+" and shopid="+str(item[2]))
iteminfo, shopinfo, ratinginfo = self.get_raw_product(item[4])
try:
data_item = json.loads(iteminfo)
data_shop = json.loads(shopinfo)
data_rating = json.loads(ratinginfo)
X = None
Y = None
Z = None
try : X = data_item["data"]
except: pass
try : Y = data_shop["data"]
except: pass
try : Z = data_rating["data"]
except: pass
if not X or not Y or not Z:
if flag == 0:
print("Data is NULL. Retrying..... Itemid: {}, Shopid: {}".format(str(item[3]),str(item[2])))
self.crawl_shopee_products(item, flag=1)
else:
print("Data is NULL. Skipping")
pass
else:
try:
self.reseller_info(data_shop,item)
except Exception as e:
logging.info("Reseller info: "+ str(e))
pass
try:
self.product_info(data_item,item)
except Exception as e:
logging.info("Product info: "+ str(e))
pass
try:
self.rating_info(data_rating,item)
except Exception as e:
logging.info("Rating info: "+ str(e))
except Exception as e:
logging.info("Data not parsable..... Skipping....")
#self.crawl_shopee_products(item, flag=1)
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" set flag=1 where itemid="+str(item[3])+" and shopid="+str(item[2])+" and crawler_name='"+self.config.get('crawler_name')+"'"
logging.info(sql)
self.cur.execute(sql)

View File

@ -0,0 +1,107 @@
import hashlib
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import psycopg2
from webdriver_manager.chrome import ChromeDriverManager
import random
from bs4 import BeautifulSoup
import json
import logging
from shopee_db_writer import shopee_db_writer
###### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
class shopee_sub_categories:
def __init__(self, config):
logging.info("Loading Sub Categories of Beauty & Care.........")
self.config = config
self.url = "https://shopee.co.id/api/v4/pages/get_category_tree"
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Shopee'")
try : self.rce_source_id = self.cur.fetchone()[0]
except:
logging.info("Source tab is empty. Please check. Exiting.....")
exit(1)
self.db_writer = shopee_db_writer(config)
def __del__(self):
logging.info("Closing connection.....")
self.conn.close()
def get_sub_categories(self):
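        # Opens the category-tree API endpoint in a headless Chrome session and keeps the
        # raw page source for parse() to decode as JSON.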
op = webdriver.ChromeOptions()
        height = str(random.randint(640,1280))
        width = str(random.randint(1024,1920))
        op.add_argument("window-size="+width+","+height+"")
op.add_experimental_option("useAutomationExtension", False)
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.headless = True
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
driver.get(self.url)
        self.page_source = driver.page_source
        driver.close()
        self.parse()
def parse(self):
soup = BeautifulSoup(self.page_source,features="html.parser")
all_cat = json.loads(soup.body.text)['data']['category_list']
for cat in all_cat:
if cat['catid'] == int(self.config.get('source_category')):
self.sub_cats = cat['children']
data = {}
data['parent_category_id'] = cat['parent_catid']
data['rce_source_id'] = self.rce_source_id
data['rce_source_category_id'] = cat['catid']
data['rce_source_status'] = 1
data['category_name'] = cat['display_name']
data['category_page_url'] = self.get_url(name=data['category_name'], pcatid=data['rce_source_category_id'])
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
self.db_writer.rce_category(data)
self.process_sub_categories()
def process_sub_categories(self):
for sub_cat in self.sub_cats:
data = {}
data['parent_category_id'] = sub_cat['parent_catid']
data['rce_source_id'] = self.rce_source_id
data['rce_source_category_id'] = sub_cat['catid']
data['rce_source_status'] = 1
data['category_name'] = sub_cat['display_name']
data['category_page_url'] = self.get_url(name=data['category_name'], pcatid=data['parent_category_id'], ccatid=data['rce_source_category_id'])
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
self.db_writer.rce_category(data)
def get_url(self, name, pcatid=None, ccatid=None):
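        # Builds the public category URL from the display name: drops '&', joins the words
        # with '-' and appends '-cat.<parent>' or '-cat.<parent>.<child>'.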
uri = name.split('& ')
uri = ''.join(uri)
uri = uri.split(' ')
uri = '-'.join(uri)
url = 'https://shopee.co.id/' + uri
if not ccatid:
url = url + '-cat.' + str(pcatid)
else:
url = url + '-cat.' + str(pcatid) + '.' + str(ccatid)
return url

1
tokopedia_crawler_engine/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
conf.json

View File

@ -0,0 +1,13 @@
### Run: ###
* run "python tokopedia_crawler.py"
### Configuration: ###
* Ensure that tables are created already.
* cp conf.json.sample conf.json
* Install zyte certificate - https://docs.zyte.com/smart-proxy-manager/next-steps/fetching-https-pages-with-smart-proxy.html#fetching-https-pages-with-smart-proxy
### Notes: ###
* A cronjob can be set up for the 'Master' to run every minute.
* It is expected to capture all product URLs in ~107 minutes.
* It makes only 2 API calls per minute (3 in the first minute) to prevent IP blocking.
* Any number of slaves can be added.
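* Example cron entry for the Master (illustrative only; the checkout path and Python binary are assumptions): `* * * * * cd /path/to/tokopedia_crawler_engine && python tokopedia_crawler.py >> crawler.log 2>&1`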

View File

@ -0,0 +1,28 @@
{
"crawler_name": "raena_crawler_engine_tokopedia",
"crawler_target": "Tokopedia",
"crawler_target_url": "https://www.tokopedia.com/",
"crawler_schema": "raena_spider_management",
"category_tab": "rce_category",
"tracker_tab": "crawler_tracker",
"product_tab": "rce_product",
"variant_tab": "rce_product_variant",
"brand_tab": "rce_brand",
"reseller_tab": "rce_reseller",
"reseller_store_tab": "rce_reseller_store",
"review_tab": "rce_ratings_reviews",
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
"review_producttags_tab": "rce_ratings_reviews_producttags",
"review_tags": "rce_tags",
"source_tab": "rce_source",
"product_per_category": "120",
"source_category": "61",
"proxy_url": "http://59e7e01ebdf54a6585c7db8824efa1e8:@proxy.crawlera.com:8011/",
"db_user": "",
"db_pass": "",
"database": "raena_db",
"db_host": "localhost",
"db_port": "5432",
"crawler_main": "1",
"crawler_slave_no": ""
}

View File

@ -0,0 +1,31 @@
import requests
from pathlib import Path
from tokopedia_config import Config
class api():
config = Config().get()
def post(self, url, payload):
try:
response = requests.post(url, payload)
return response.json()
except:
return []
def postProxy(self, url, payload, headers):
path = Path.cwd()
proxyUrl = self.config.get('proxy_url')
# print(data)
try:
response = requests.post(url,
data=payload,
headers=headers,
proxies={
"http": proxyUrl,
"https": proxyUrl,
},
verify=f'{path}/zyte-proxy-ca.crt'
)
return response.json()
except:
return []

View File

@ -0,0 +1,25 @@
import json
from tokopedia_logger import logger
class Config():
config = None
def __new__(cls, *args, **kw):
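        # Config is a singleton: the first instantiation caches itself on the class and every
        # later Config() call returns that same instance (and its already-loaded config).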
if not hasattr(cls, '_instance'):
orig = super(Config, cls)
cls._instance = orig.__new__(cls, *args, **kw)
return cls._instance
def __init__(self):
if not self.config:
try:
logger.info("Loading config fine...")
with open("conf.json", "r") as jsonfile:
self.config = json.load(jsonfile)
logger.info("Config file loaded.")
except Exception as e:
logger.error("Cannot load config file. Please check. Exiting......")
exit(1)
def get(self):
return self.config

View File

@ -0,0 +1,43 @@
from tokopedia_logger import logger
from tokopedia_db_writer import DB
from tokopedia_config import Config
from tokopedia_sub_categories import TokopediaSubCategories
from tokopedia_db_migrations import db_migrations
from tokopedia_product_list import ProductList
from tokopedia_products import Products
def checkSource():
config = Config().get()
table = config.get("crawler_schema") + "." + config.get("source_tab")
query = "select id from " + table + " where source_name='Tokopedia'"
data = DB().fetchone(query)
if not data:
logger.error("Please create source in " + table)
exit(1)
def runMainCrawler():
db_migrations()
checkSource()
TokopediaSubCategories()
ProductList()
def runSlaveCrawler():
config = Config().get()
try:
int(config.get('crawler_slave_no'))
except:
logger.error("Please set slave number")
exit(1)
Products()
def main():
config = Config().get()
isMainCrawler = bool(int(config.get('crawler_main')))
if isMainCrawler:
runMainCrawler()
else:
runSlaveCrawler()
if __name__ == "__main__":
main()

View File

@ -0,0 +1,109 @@
from tokopedia_logger import logger
from tokopedia_db_writer import DB
from tokopedia_config import Config
class db_migrations():
config = Config().get()
def __init__(self):
logger.info('Running database migrations')
self.updateSource()
self.updateCategoryColumn()
self.alterCrawlerTracker()
self.alterProductTab()
self.alterResellerStoreTab()
logger.info('Database migrations completed')
def updateSource(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("source_tab")}'
target = self.config.get("crawler_target")
target_url = self.config.get("crawler_target_url")
query = f'''INSERT INTO {table} (source_name, source_main_url)
SELECT '{target}', '{target_url}'
WHERE
NOT EXISTS (
SELECT id FROM {table} WHERE source_name = '{target}'
);'''
try:
DB().execute_query(query)
except:
logger.error(f'Problem while creating source in {table}')
exit(1)
def updateCategoryColumn(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("category_tab")}'
query = f'Alter table {table} ADD COLUMN IF NOT EXISTS category_slug character varying UNIQUE'
aud_query = f'Alter table {aud_table} ADD COLUMN IF NOT EXISTS category_slug character varying UNIQUE'
try:
DB().execute_query(query)
DB().execute_query(aud_query)
except:
logger.error(f'Problem while updating column in {table}')
exit(1)
def alterCrawlerTracker(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}'
query = f'''
ALTER TABLE {table}
ADD CONSTRAINT unique_product_page_url UNIQUE (product_page_url);
'''
try:
DB().execute_query(query)
except:
            # The unique constraint may already exist; the error is intentionally ignored
pass
def alterProductTab(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("product_tab")}'
aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("product_tab")}'
query = f'Alter table {table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;'
aud_query = f'Alter table {aud_table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;'
constraint_query = f'''
ALTER TABLE {table}
ADD CONSTRAINT product_source_id_ukey UNIQUE (rce_source_product_id, rce_source_id);
'''
try:
DB().execute_query(query + aud_query)
except:
logger.error(f'Problem while updating column in {table}')
exit(1)
try:
DB().execute_query(constraint_query)
except:
pass
def alterResellerStoreTab(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("reseller_store_tab")}'
aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("reseller_store_tab")}'
query = f'Alter table {table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;'
aud_query = f'Alter table {aud_table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;'
constraint_query = f'''
ALTER TABLE {table}
ADD CONSTRAINT store_source_id_ukey UNIQUE (rce_source_store_id, rce_source_id);
'''
aud_constraint_query = f'''
ALTER TABLE {aud_table}
ADD CONSTRAINT aud_store_source_id_ukey UNIQUE (rce_source_store_id, rce_source_id);
'''
try:
DB().execute_query(query + aud_query)
except:
logger.error(f'Problem while updating column in {table}')
exit(1)
try:
DB().execute_query(constraint_query)
except:
pass
try:
DB().execute_query(aud_constraint_query)
except:
pass

View File

@ -0,0 +1,60 @@
from tokopedia_config import Config
from tokopedia_logger import logger
import psycopg2
class DBConnector:
def __init__(self):
config = Config().get()
self.host = config.get('db_host')
self.database = config.get('database')
self.user = config.get('db_user')
self.password = config.get('db_pass')
self.port = config.get('db_port')
self.dbconn = None
def create_connection(self):
return psycopg2.connect(
database=self.database,
user=self.user,
password=self.password,
host=self.host,
port=self.port
)
def __enter__(self):
self.dbconn = self.create_connection()
return self.dbconn
def __exit__(self, exc_type, exc_val, exc_tb):
self.dbconn.close()
class DB(object):
connection = None
def __new__(cls, *args, **kw):
if not hasattr(cls, '_instance'):
orig = super(DB, cls)
cls._instance = orig.__new__(cls, *args, **kw)
return cls._instance
def get_connection(self):
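        # Lazily opens one shared psycopg2 connection and reuses it for every query.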
if not self.connection:
self.connection = DBConnector().create_connection()
return self.connection
def execute_query(self, query):
connection = self.get_connection()
connection.autocommit = True
try:
cursor = connection.cursor()
        except psycopg2.ProgrammingError:
            # Reset the cached connection so a fresh one is opened before retrying
            self.connection = None
            connection = self.get_connection()
            cursor = connection.cursor()
cursor.execute(query)
return cursor
def fetchone(self, query):
return self.execute_query(query).fetchone()
def fetchall(self, query):
return self.execute_query(query).fetchall()

View File

@ -0,0 +1,7 @@
import logging
###### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
logger = logging.getLogger("tokopedia")

View File

@ -0,0 +1,108 @@
import json
from tokopedia_db_writer import DB
from tokopedia_logger import logger
from tokopedia_config import Config
from tokopedia_api import api
class ProductList():
config = Config().get()
sourceId = None
def __init__(self):
self.sourceId = self.getSourceId()
self.get()
def getSourceId(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("source_tab")}'
query = f'select id from {table} where source_name=\'Tokopedia\''
data = DB().fetchone(query)
return data[0]
    # Fetch one category row that does not yet have rce_source_status set,
    # then make API calls for 120 of its products and store the URLs in crawler_tracker.
def getCategoryIdentifier(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
query = f"""
SELECT category_slug FROM {table}
WHERE rce_source_id = {self.sourceId} and rce_source_status is null
ORDER BY id ASC
Limit 1
"""
try:
data = DB().fetchone(query)
return data[0] if data else None
except:
return None
def getProductList(self, identifier, page):
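        # Calls Tokopedia's SearchProductQuery GraphQL endpoint for one category identifier,
        # 60 rows per page, so two pages cover the 120 products per category.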
url = 'https://gql.tokopedia.com/graphql/SearchProductQuery'
params = f"ob=&page={page}&start={1 + (page-1)*60}&identifier={identifier}&sc=2266&user_id=0&rows=60&source=directory&device=desktop&related=true&st=product&safe_search=false"
payload = json.dumps([{
"operationName": "SearchProductQuery",
"variables": {
"params": params
},
"query": "query SearchProductQuery($params: String) {\n CategoryProducts: searchProduct(params: $params) {\n data: products {\n id\n url\n }\n }\n }\n"
}])
data = api().post(url, payload)
return data
def processData(self, data1, data2):
crawler_name = self.config.get("crawler_name")
data = None
try:
rootData1 = data1[0]["data"]["CategoryProducts"]["data"]
rootData2 = data2[0]["data"]["CategoryProducts"]["data"]
data = rootData1 + rootData2
except:
data = []
proccessedData = list(map(lambda x: (f"'{crawler_name}'", f'\'{x["url"]}\''), data))
return proccessedData
@staticmethod
    def convertToString(n, delimiter=','):
        return delimiter.join(n)
def updateTracker(self, rawData):
table = f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}'
data = f"({self.convertToString(map(self.convertToString, rawData), '),(')})"
query = f"""
INSERT INTO {table}(crawler_name, product_page_url)
VALUES {data}
ON CONFLICT (product_page_url) DO Nothing;
"""
try:
DB().execute_query(query)
return True
except:
logger.info(f'Error while inserting data in {table}')
return False
def updateCategoryTableRow(self, identifier):
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
query = f"""
Update {table}
SET rce_source_status = 1
WHERE category_slug='{identifier}'
"""
try:
data = DB().execute_query(query)
except:
logger.error(f'Something went wrong while updating {table}')
def get(self):
identifier = self.getCategoryIdentifier()
if not identifier:
logger.info("All the categories are processed, no task left for master")
return
data1 = self.getProductList(identifier, 1)
data2 = self.getProductList(identifier, 2)
processedData = self.processData(data1, data2)
isDataInserted = self.updateTracker(processedData)
if isDataInserted:
self.updateCategoryTableRow(identifier)
logger.info(f'All the URLs are fetched for the following category identifier - {identifier}')

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,142 @@
import json
from datetime import datetime
from tokopedia_logger import logger
from tokopedia_config import Config
from tokopedia_api import api
from tokopedia_db_writer import DB
class TokopediaSubCategories:
config = Config().get()
sourceCategoryId = int(config.get("source_category"))
sourceId = None
def __init__(self):
self.sourceId = self.getSourceId()
self.populate()
def getSourceId(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("source_tab")}'
query = f'select id from {table} where source_name=\'Tokopedia\''
data = DB().fetchone(query)
return data[0]
def getSourceCategoryUpdatedTime(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
where = f'rce_source_category_id={self.sourceCategoryId} and rce_source_id={self.sourceId}'
query = f'select updatedat from {table} where {where}'
data = DB().fetchone(query)
return data[0] if data else None
def fetchCategories(self):
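        # Calls Tokopedia's categoryAllList GraphQL endpoint for the configured source
        # category and returns the raw JSON response.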
url = 'https://gql.tokopedia.com/graphql/categoryAllList'
payload = json.dumps([{
"operationName": "categoryAllList",
"variables": {
"categoryID": self.sourceCategoryId
},
"query": "query categoryAllList($categoryID: Int, $type: String) {\n CategoryAllList: categoryAllList(categoryID: $categoryID, type: $type) {\n categories {\n identifier\n url\n name\n id\n child {\n id\n identifier\n name\n url\n child {\n name\n identifier\n url\n id\n }\n }\n }\n }\n }\n"
}])
data = api().post(url, payload)
return data
def processData(self, rawData):
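        # Flattens the three-level category tree (root -> child -> grandchild) into tuples
        # ready to be joined into a multi-row INSERT.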
sourceId = self.sourceId
data = rawData[0]['data']['CategoryAllList']['categories'][0]
values = [(str(sourceId), str(0), data['id'], f"'{data['url']}'", f"'{data['name']}'", f"'{data['identifier']}'")]
        for fc in data['child']:
            values.append((str(sourceId), data['id'], fc['id'], f"'{fc['url']}'", f"'{fc['name']}'", f"'{fc['identifier']}'"))
            for sc in fc['child']:
                values.append((str(sourceId), fc['id'], sc['id'], f"'{sc['url']}'", f"'{sc['name']}'", f"'{sc['identifier']}'"))
return values
@staticmethod
    def convertToString(n, delimiter=','):
        return delimiter.join(n)
def upsertData(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
categories = self.fetchCategories()
rawData = self.processData(categories)
data = f"({self.convertToString(map(self.convertToString, rawData), '),(')})"
query = f'''
INSERT INTO {table} (rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug)
VALUES {data}
ON CONFLICT (category_slug) DO UPDATE SET updatedat = now();
'''
try:
DB().execute_query(query)
except:
logger.error('Issue while inserting categories')
exit(1)
def deleteTokoCategories(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
query = f'Delete from {table} where rce_source_id={self.sourceId};'
try:
DB().execute_query(query)
except:
logger.error(f'Tokopedia categories were not deleted from {table}')
def fetchCategoriesFromDB(self):
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
query = f'Select id, rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug, createdat, updatedat from {table} where rce_source_id={self.sourceId};'
# query = f'Select (id, rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug, updatedat) from {table} where rce_source_id={self.sourceId};'
try:
return DB().fetchall(query)
except:
logger.error(f'Issue while fetching data from {table}')
exit(1)
def processAudData(self, data):
processedData = []
for x in data:
t = list(x)
t[0] = str(t[0])
t[1] = str(t[1])
t[2] = str(t[2])
t[3] = str(t[3])
t[4] = f"'{t[4]}'"
t[5] = f"'{t[5]}'"
t[6] = f"'{t[6]}'"
t[7] = f'\'{t[7].strftime("%Y-%m-%d %H:%M:%S.%f")}\''
t[8] = f'\'{t[8].strftime("%Y-%m-%d %H:%M:%S.%f")}\''
            processedData.append(tuple(t))
return processedData
def updateAudTable(self):
dbData = self.fetchCategoriesFromDB()
rawData = self.processAudData(dbData)
table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("category_tab")}'
data = f"({self.convertToString(map(self.convertToString, rawData), '),(')})"
query = f'''
Insert into {table}
(id, rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug, createdat, updatedat)
values {data}
ON CONFLICT (category_slug) DO UPDATE SET updatedat = now(), id=EXCLUDED.id;
'''
try:
return DB().execute_query(query)
except Exception as e:
logger.error(f'Issue while updating {table} {str(e)}')
exit(1)
def populate(self):
sourceCategoryUpdatedTime = self.getSourceCategoryUpdatedTime()
if sourceCategoryUpdatedTime:
diffDays = (datetime.now() - sourceCategoryUpdatedTime).days
# Let's keep a frequency of 1 day to fetch/update categories
if diffDays < 1:
logger.info('Categories were populated recently, so skipping this step')
return
# delete data from main table
logger.info('Deleting categories from main table')
self.deleteTokoCategories()
# insert fresh data
logger.info('Inserting categories in main table')
self.upsertData()
# update audit table, if required
logger.info('Inserting/Updating categories in audit table')
self.updateAudTable()

View File

@ -0,0 +1,25 @@
-----BEGIN CERTIFICATE-----
MIIERzCCAy+gAwIBAgIJAN/VCi6U4Y5SMA0GCSqGSIb3DQEBCwUAMIG5MQswCQYD
VQQGEwJJRTEQMA4GA1UECAwHTXVuc3RlcjENMAsGA1UEBwwEQ29yazEUMBIGA1UE
CgwLU2NyYXBpbmdIdWIxNTAzBgNVBAsMLExlYWRpbmcgVGVjaG5vbG9neSBhbmQg
UHJvZmVzc2lvbmFsIFNlcnZpY2VzMRQwEgYDVQQDDAtDcmF3bGVyYSBDQTEmMCQG
CSqGSIb3DQEJARYXc3VwcG9ydEBzY3JhcGluZ2h1Yi5jb20wHhcNMTUwNTE5MTQ1
NjA3WhcNMjUwNTE2MTQ1NjA3WjCBuTELMAkGA1UEBhMCSUUxEDAOBgNVBAgMB011
bnN0ZXIxDTALBgNVBAcMBENvcmsxFDASBgNVBAoMC1NjcmFwaW5nSHViMTUwMwYD
VQQLDCxMZWFkaW5nIFRlY2hub2xvZ3kgYW5kIFByb2Zlc3Npb25hbCBTZXJ2aWNl
czEUMBIGA1UEAwwLQ3Jhd2xlcmEgQ0ExJjAkBgkqhkiG9w0BCQEWF3N1cHBvcnRA
c2NyYXBpbmdodWIuY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA
3I3nDH62M7FHT6HG5ZNS9cBeXmMZaKaxYdr+7ioSiVXzruDkH3uX6CQZLkvR2KpG
icHOnd0FM4S4rHYQoWc82b/UGgwjQdi47ED8fqCPusEcgo/7eY3y2Y/JivEWKk6f
z+gBlvEHjKj2EyzZ7FaExTEMQTTe28EroXTNySUctY9jprtKrs8jjGXd2sR6AHF1
M6O+5CT/5kXhuDO9/Q9Tfym7wxBsU/k+6hhNH+RkYlNEvkv0d8vdku/ZKTCBuL9D
NTqgXFvAmOj0MNEjf5kFrF95g+k5+PxPU04TPUtOwU30GYbCjE+ecYsoTODg6+ju
TQoNk3RFt0A0wZS3ly1rnQIDAQABo1AwTjAdBgNVHQ4EFgQUn6fXHOpDIsaswTMr
K2DwcOHLtZ0wHwYDVR0jBBgwFoAUn6fXHOpDIsaswTMrK2DwcOHLtZ0wDAYDVR0T
BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAQEAOLtBuyHixFblY2BieG3ZCs8D74Xc
Z1usYCUNuVxOzKhuLt/cv49r39SVienqvS2UTr3kmKdyaaRJnYQ06b5FmAP72vdI
4wUAU2F7bFErAVnH1rihB+YMRE/5/6VPLfwuK8yf3rkzdrKcV2DlRQwsnwroSIR8
iON6JK2HOI0/LsKxPXUk9cHrli7e99yazS5+jBhRFGx8AVfoJg/6uLe6IKuw5xEZ
xAzDdjEIB/tf1cE0SQ+5sdmepO1cIjQYVSL7U+br+y9A1J9N+FYkBKVevM/W25tb
iGWBe46djkdm/6eyQ7gtuxhby5lwtRl5sIm9/ID/vWWDMf8O4GPPnW/Xug==
-----END CERTIFICATE-----