first commit
commit 3154eec5ab

@@ -0,0 +1,29 @@
# README #

This README would normally document whatever steps are necessary to get your application up and running.

### What is this repository for? ###

* Quick summary
* Version
* [Learn Markdown](https://bitbucket.org/tutorials/markdowndemo)

### How do I get set up? ###

* Summary of set up
* Configuration (an example `conf.json` sketch follows after this list)
* Dependencies
* Database configuration
* How to run tests
* Deployment instructions
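The entry-point script in this commit loads its settings with `json.load` from a `conf.json` file in the working directory. As a minimal sketch, the block below simply mirrors the commented-out sample config that appears at the bottom of `amazon_category_products`; every value shown (schema, table names, database credentials, host, port) is a placeholder from that sample and must be replaced with your own environment's settings.

```json
{
    "crawler_name": "raena_crawler_enginer_amazon",
    "crawler_schema": "raena_spider_management",
    "category_tab": "rce_category",
    "tracker_tab": "crawler_tracker",
    "product_tab": "rce_product",
    "variant_tab": "rce_product_variant",
    "brand_tab": "rce_brand",
    "reseller_tab": "rce_reseller",
    "reseller_store_tab": "rce_reseller_store",
    "review_tab": "rce_ratings_reviews",
    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
    "review_producttags_tab": "rce_ratings_reviews_producttags",
    "review_tags": "rce_tags",
    "source_tab": "rce_source",
    "product_per_category": "1000",
    "source_category": "11043145",
    "db_user": "postgres",
    "db_pass": "postgres",
    "database": "postgres",
    "db_host": "localhost",
    "db_port": "5444",
    "crawler_main": "1",
    "crawler_slave_no": ""
}
```

Note one key-name mismatch carried over verbatim from the sample: `amazon_db_writer.rce_tags` looks up `review_tags_tab`, while the sample names the key `review_tags`; use whichever key the code you deploy actually reads.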
### Contribution guidelines ###

* Writing tests
* Code review
* Other guidelines

### Who do I talk to? ###

* Repo owner or admin
* Other community or team contact
@@ -0,0 +1,194 @@

import hashlib
import logging
import undetected_chromedriver as webdriver
import psycopg2
from selenium.webdriver.common.by import By
from pyvirtualdisplay import Display

from amazon_db_writer import amazon_db_writer
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


class amazon_categories:
    def __init__(self, config):
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.url = "https://www.amazon.ae/s?rh=n%3A11497859031&ref=lp_11497860031_sar"
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Amazon'")
        try:
            self.rce_source_id = self.cur.fetchone()[0]
        except:
            logging.info("Source tab is empty. Please check. Exiting.....")
            exit(1)
        self.db_writer = amazon_db_writer(config)

        #self.display = Display(visible=0, size=(800, 600))
        #self.display.start()

    def __del__(self):
        print("Closing connection.....")
        self.conn.close()
        #self.display.stop()

    def start_processing(self):
        op = webdriver.ChromeOptions()
        op.add_argument('--no-sandbox')
        op.add_argument('--disable-notifications')
        op.add_argument("--lang=en-GB")
        #op.headless = True
        #driver=webdriver.Chrome(version_main = 113, options=op)
        driver = webdriver.Chrome(options=op)

        driver.get(self.url)

        driver.implicitly_wait(10)

        self.get_categories(driver)

        driver.close()

    def get_categories(self, driver):

        #element = driver.find_elements(By.CSS_SELECTOR,'.bxc-grid__container.bxc-grid__container--width-1500.bxc-grid__mp-gutter-layout')
        #sub_cats = element[0].find_elements(By.CSS_SELECTOR,'.bxc-grid__image.bxc-grid__image--light')
        sub_cats = driver.find_elements(By.CSS_SELECTOR, '.bxc-grid__image.bxc-grid__image--light')

        names = ['Perfumes', 'Skin care', 'Hair care', 'Bath & body', 'Makeup', 'Nail care']

        categories = []
        for sub_cat in sub_cats:
            name = sub_cat.find_element(By.TAG_NAME, 'a').get_attribute('aria-label')
            if name in names:
                link = sub_cat.find_element(By.TAG_NAME, 'a').get_attribute('href')

                category = {
                    "name": name,
                    "link": link
                }

                categories.append(category)

        print(categories)
        self.get_sub_categories(driver, categories)

    def get_sub_categories(self, driver, categories):

        sub_categories = []
        for category in categories:
            print("=============== {} ===============".format(category["name"]))

            data = {}
            data['parent_category_id'] = 0
            data['rce_source_id'] = self.rce_source_id
            data['rce_source_category_id'] = 0
            data['rce_source_status'] = 1
            data['category_name'] = category["name"]
            data['category_page_url'] = category["link"]
            data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
            self.db_writer.rce_category(data)

            driver.get(category["link"])

            ##### Feature Categories
            try:
                f_cat = driver.find_element(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-title .a-size-extra-large')
                if f_cat:
                    cats_c = driver.find_element(By.CSS_SELECTOR, '.a-section.octopus-pc-category-card-v2-content')
                    cats = cats_c.find_elements(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-item')
                    for cat in cats:
                        cat_name = cat.find_element(By.CSS_SELECTOR, '.a-size-medium.a-color-base.a-text-bold').text
                        url = cat.find_element(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-category-link').get_attribute("href")
                        # print('Name: {}, URL: {}'.format(cat_name,url))
                        # s_cat = {
                        #     "name": cat_name,
                        #     "link": url
                        # }
                        # sub_categories.append(s_cat)

                        data = {}
                        data['parent_category_id'] = 0
                        data['rce_source_id'] = self.rce_source_id
                        data['rce_source_category_id'] = 0
                        data['rce_source_status'] = 1
                        data['category_name'] = cat_name
                        data['category_page_url'] = url
                        data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
                        self.db_writer.rce_category(data)

                        try:
                            sub_cats = cat.find_elements(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-subcategory-link')

                            for sub_cat in sub_cats:
                                s_url = sub_cat.get_attribute('href')
                                s_title = sub_cat.get_attribute('title')
                                # print('Title: {}, URL: {}'.format(s_title, s_url))
                                # s_cat = {
                                #     "name": s_title,
                                #     "link": s_url
                                # }
                                # sub_categories.append(s_cat)
                                data = {}
                                data['parent_category_id'] = 0
                                data['rce_source_id'] = self.rce_source_id
                                data['rce_source_category_id'] = 0
                                data['rce_source_status'] = 1
                                data['category_name'] = s_title
                                data['category_page_url'] = s_url
                                data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
                                self.db_writer.rce_category(data)
                        except:
                            pass
            except:
                print("Feature Cat not available.")
                pass

            ##### Shop by categories
            try:
                try:
                    cat_h = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-header')
                except:
                    cat_h = driver.find_element(By.CSS_SELECTOR, '#contentGrid_292470')
                    pass
                if cat_h:
                    cats_c = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-viewport-row-inner')
                    cats = cats_c.find_elements(By.TAG_NAME, 'li')
                    for cat in cats:
                        cat_name = cat.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-sub-card-title').text
                        url = cat.find_element(By.TAG_NAME, 'a').get_attribute('href')
                        # print('Name: {}, URL: {}'.format(cat_name,url))
                        # s_cat = {
                        #     "name": cat_name,
                        #     "link": url
                        # }
                        # sub_categories.append(s_cat)
                        data = {}
                        data['parent_category_id'] = 0
                        data['rce_source_id'] = self.rce_source_id
                        data['rce_source_category_id'] = 0
                        data['rce_source_status'] = 1
                        data['category_name'] = cat_name
                        data['category_page_url'] = url
                        data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
                        self.db_writer.rce_category(data)
            except Exception as e:
                print('Cat not available')
                pass

        print(sub_categories)


# categories = amazon_categories()
# categories.start_processing()

@@ -0,0 +1,186 @@

import hashlib
import logging
import undetected_chromedriver as webdriver
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
import psycopg2
from selenium.webdriver.common.by import By
from amazon_db_writer import amazon_db_writer
from pyvirtualdisplay import Display
from scroller.scroller import smartScroll


import ssl
ssl._create_default_https_context = ssl._create_unverified_context


class amazon_category_products:
    def __init__(self, config):
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        #self.url = "https://www.amazon.ae/gp/browse.html?node=11497860031&ref_=nav_em_by_all_0_2_11_2"
        self.product_limit = int(self.config.get("product_per_category"))
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        sql = "delete from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='"+str(self.crawler_name)+"'"
        self.cur.execute(sql)
        sql = "select id, category_page_url from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where rce_source_id = 66"
        self.cur.execute(sql)
        self.categories = self.cur.fetchall()
        #self.display = Display(visible=0, size=(800, 600))
        #self.display.start()

    def __del__(self):
        print("Closing connection.....")
        self.conn.close()
        #self.display.stop()

    def start_processing(self):
        for category in self.categories:
            logging.info("======= Fetching products of {}".format(category))
            self.browse_category_page(category)

    def browse_category_page(self, catagory):
        try:
            op = webdriver.ChromeOptions()
            op.add_argument('--no-sandbox')
            op.add_argument('--disable-notifications')
            op.add_argument("--lang=en-GB")
            #op.headless = True
            #driver=webdriver.Chrome(version_main = 113, options=op)
            driver = webdriver.Chrome(options=op)

            driver.get(catagory[1])

            driver.implicitly_wait(10)

            #### Collect section name and section products ####
            section_products = self.section_products(driver, catagory[0])
            self.insert_tracker_tab(section_products)

            #### Collect All products ####
            self.base_products(driver, catagory[0])

            driver.close()
        except Exception as e:
            print(e)

    def section_products(self, driver, catagory):

        elements = driver.find_elements(By.CSS_SELECTOR, ".a-size-extra-large.a-color-base.a-text-bold")
        section_name = []
        for element in elements:
            section_name.append(element.text)

        elements = driver.find_elements(By.CSS_SELECTOR, ".a-section.octopus-pc-card-content")
        section_products = []
        for element in elements:
            objs = element.find_elements(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-item-link')

            urls = []
            for obj in objs:
                url = obj.get_attribute("href")
                urls.append(url)
            section_products.append(urls)

        result = []
        for i in range(len(section_name)):
            result.append({
                "catagory": catagory,
                "key": section_name[i],
                "value": section_products[i]
            })

        return result

    def insert_tracker_tab(self, objs):

        for obj in objs:
            category = obj['catagory']
            key = obj['key']
            items = obj['value']
            for item in items:
                product_page_url = item
                product_page_url_hash = hashlib.md5(product_page_url.encode('utf-8')).hexdigest()
                flag = 0

                sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where product_page_url = '"+product_page_url+"'"
                self.cur.execute(sql)
                res = self.cur.fetchall()

                if not res:
                    sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+"(crawler_name,category,keyword,product_page_url,product_page_url_hash,flag) values('"+str(self.crawler_name)+"','"+str(category)+"','"+str(key)+"','"+product_page_url+"','"+product_page_url_hash+"',"+str(flag)+")"
                    self.cur.execute(sql)

    def base_products(self, driver, catagory):

        try:
            smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True)
            all_res = driver.find_element(By.CSS_SELECTOR, '#apb-desktop-browse-search-see-all')
            all_res.click()

            driver.implicitly_wait(5)

            for i in range(1, 16):
                items = driver.find_elements(By.CSS_SELECTOR, '.a-size-mini.a-spacing-none.a-color-base.s-line-clamp-4')

                smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True)

                urls = []
                for item in items:
                    url = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
                    urls.append(url)

                result = [{
                    "catagory": catagory,
                    "key": "Base Product Page {}".format(str(i)),
                    "value": urls
                }]

                self.insert_tracker_tab(result)

                try:
                    driver.find_element(By.CSS_SELECTOR, '.s-pagination-next').click()
                    driver.implicitly_wait(5)
                except:
                    logging.info("No more page to navigate......")
        except:
            pass


# config = {
#     "crawler_name": "raena_crawler_enginer_amazon",
#     "crawler_schema": "raena_spider_management",
#     "category_tab": "rce_category",
#     "tracker_tab": "crawler_tracker",
#     "product_tab": "rce_product",
#     "variant_tab": "rce_product_variant",
#     "brand_tab": "rce_brand",
#     "reseller_tab": "rce_reseller",
#     "reseller_store_tab": "rce_reseller_store",
#     "review_tab": "rce_ratings_reviews",
#     "review_productmodels_tab": "rce_ratings_reviews_productmodels",
#     "review_producttags_tab": "rce_ratings_reviews_producttags",
#     "review_tags": "rce_tags",
#     "source_tab": "rce_source",
#     "product_per_category": "1000",
#     "source_category": "11043145",
#     "db_user": "postgres",
#     "db_pass": "postgres",
#     "database": "postgres",
#     "db_host": "localhost",
#     "db_port": "5444",
#     "crawler_main": "1",
#     "crawler_slave_no": ""
# }
# amazon_category_products = amazon_category_products(config)
# amazon_category_products.start_processing()

@@ -0,0 +1,98 @@

import logging
import psycopg2
import json
from datetime import datetime
import smtplib
from email.message import EmailMessage

from amazon_categories import amazon_categories
from amazon_category_products import amazon_category_products
from amazon_products import amazon_products


##### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")

config = {}

def send_mail():

    try:
        EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
        EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
        From = 'data_reporting@raenabeauty.com'
        To = 'shariar@raenabeauty.com'
        #To = 'shariar@raenabeauty.com'

        html = f'''
        <!DOCTYPE html>
        <html>
            <body>
                <div style="background-color:#eee;padding:10px 20px;">
                    <h2 style="font-family:Georgia, 'Times New Roman', Times, serif;color:#454349;">Amazon Crawler Status</h2>
                </div>
                <div style="padding:20px 0px">
                    <div style="height: 800px;width:800px">
                        Error occurred. Please check Amazon Pipeline.
                        <div style="text-align:Left;">
                            <p>This is a system generated mail. Please do not reply.</p>
                        </div>
                    </div>
                </div>
            </body>
        </html>
        '''

        msg = EmailMessage()
        msg['Subject'] = 'Amazon Crawler Status'
        msg['From'] = From
        msg['To'] = To
        msg.set_content(html, subtype='html')


        with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp:
            smtp.ehlo()
            smtp.starttls()
            smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
            smtp.send_message(msg)
    except Exception as e:
        logging.info("Error while sending mail: {}".format(e))

def main():
    # start = datetime.now()
    # categories = amazon_categories(config)
    # categories.start_processing()
    # end = datetime.now()
    # logging.info('Total time taken to fetch the categories: {}'.format(str(end-start)))
    #
    # start = datetime.now()
    # products = amazon_category_products(config)
    # products.start_processing()
    # end = datetime.now()
    # logging.info('Total time taken to fetch the category products: {}'.format(str(end-start)))


    product_info = amazon_products(config)
    product_info.start_processing()

    # ###### For test
    # item = (100, 'raena_crawler_enginer_amazon', '3066', 'Up to 25 AED', 'https://www.amazon.ae/Ross-Massager-Shampoo-Silicone-Bristles/dp/B09JGH1WM3?ref_=Oct_d_oup_d_12149480031_0&pd_rd_w=lfMTW&content-id=amzn1.sym.d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_p=d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_r=C1QM2XCSJDBVMS27JV7E&pd_rd_wg=gkRZv&pd_rd_r=f5af13ee-c6c4-4d8a-8677-cba9cbacdace&pd_rd_i=B09JGH1WM3', '8f0540b5919e176303cf24a1d46b0e1c', 0)
    # product_info.get_product_info(item)


if __name__ == "__main__":
    logging.info("Starting Amazon Crawler.......")
    try:
        logging.info("Loading config file.......")
        with open("conf.json", "r") as jsonfile:
            config = json.load(jsonfile)
        logging.info("Config file loaded.......")
        print(config)

        main()

    except Exception as e:
        logging.info("Error: {}".format(e))
        #logging.info("Cannot load config file. Please check. Exiting......")
        send_mail()
        exit(1)

@@ -0,0 +1,589 @@

import logging
import psycopg2

###### Logger ######
format = "%(asctime)s: %(message)s"
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")

class amazon_db_writer:
    def __init__(self, config):
        self.config = config
        self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

    def __del__(self):
        logging.info("Closing connection.....")
        self.conn.close()

    def rce_category(self, data):
        sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where category_name = '"+str(data['category_name'])+"'"
        self.cur.execute(sql)
        res = self.cur.fetchone()

        cat_name = data['category_name'].replace("'","''")
        cat_url = data['category_page_url'].replace("'","''")

        if not res:

            sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" (parent_category_id,rce_source_id," \
                  "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name) values (" \
                  +str(data['parent_category_id'])+","+str(data['rce_source_id'])+", "+str(data['rce_source_category_id'])+", "+str(data['rce_source_status'])+", " \
                  "'"+str(cat_url)+"', '"+str(data['category_page_url_hash'])+"', '"+str(cat_name)+"')"
            #logging.info(sql)

            self.cur.execute(sql)

            sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
                  "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
                  "select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
                  "category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
                  "where rce_source_category_id = "+ str(data['rce_source_category_id'])
            #logging.info(sql)

            self.cur.execute(sql)

        else:
            if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \
                    str(data['category_page_url'])==str(res[5]):
                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set updatedat=now() " \
                      "where category_name = '"+ str(res[7])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \
                      "from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0])
                #logging.info(sql)
                self.cur.execute(sql)
            else:
                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set parent_category_id = " \
                      ""+str(data['parent_category_id'])+", rce_source_category_id = "+str(data['rce_source_category_id'])+", " \
                      "category_name='"+str(cat_name)+"', category_page_url='"+str(cat_url)+"', " \
                      "category_page_url_hash='"+str(data['category_page_url_hash'])+"', updatedat=now() where " \
                      "category_name = '"+ str(res[7])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
                      "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
                      "select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
                      "category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
                      "where category_name = '"+ str(res[7])+"'"
                #logging.info(sql)

                self.cur.execute(sql)

    def rce_product(self, data):

        data['product_page_url'] = data['product_page_url'].replace("'","''")
        data['rce_source_product_name'] = data['rce_source_product_name'].replace("'","''")
        data['product_description'] = data['product_description'].replace("'","''")

        sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url = '"+str(data['product_page_url'])+"'"
        self.cur.execute(sql)
        res = self.cur.fetchone()

        if not res:

            sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" (rce_source_product_id," \
                  "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
                  "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
                  "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
                  "product_section,rce_source_id) values("+str(data['rce_source_product_id'])+","+str(data['rce_source_product_status'])+",'"+str(data['product_page_url'])+"'," \
                  "'"+str(data['product_page_url_hash'])+"',"+str(data['rce_category_id'])+","+str(data['rce_brand_id'])+","+str(data['rce_store_id'])+"," \
                  "'"+str(data['rce_source_product_name'])+"','"+str(data['product_images'])+"','"+str(data['product_description'])+"',"+str(data['product_sold_total'])+"," \
                  ""+str(data['product_sold'])+",'"+str(data['product_price_min'])+"','"+str(data['product_price_min_before_discount'])+"','"+str(data['product_price_max'])+"'," \
                  "'"+str(data['product_price_max_before_discount'])+"','"+str(data['ratings'])+"','"+str(data['product_section'])+"',"+str(data['rce_source_id'])+")"
            #logging.info(sql)

            self.cur.execute(sql)

            sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
                  "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
                  "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
                  "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
                  "product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \
                  "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
                  "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
                  "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
                  "product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
                  "product_page_url='"+str(data['product_page_url'])+"'"
            #logging.info(sql)
            self.cur.execute(sql)
        else:

            if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \
                    str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \
                    str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \
                    str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \
                    str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \
                    str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) \
                    and str(data['rce_source_id'])==str(res[21]) \
                    and str(data['product_section'])==str(res[22]):

                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set updatedat=now() " \
                      "where product_page_url = '"+ str(res[3])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \
                      "from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0])
                #logging.info(sql)
                self.cur.execute(sql)
            else:
                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set rce_source_product_id="+str(data['rce_source_product_id'])+"," \
                      "rce_source_product_status="+str(data['rce_source_product_status'])+",product_page_url='"+str(data['product_page_url'])+"',product_page_url_hash= " \
                      "'"+str(data['product_page_url_hash'])+"',rce_category_id="+str(data['rce_category_id'])+",rce_brand_id="+str(data['rce_brand_id'])+"," \
                      "rce_store_id="+str(data['rce_store_id'])+",rce_source_product_name='"+str(data['rce_source_product_name'])+"',product_images='"+str(data['product_images'])+"'" \
                      ",product_description='"+str(data['product_description'])+"',product_sold_total="+str(data['product_sold_total'])+",product_sold="+str(data['product_sold'])+"," \
                      "product_price_min='"+str(data['product_price_min'])+"',product_price_min_before_discount='"+str(data['product_price_min_before_discount'])+"'," \
                      "product_price_max='"+str(data['product_price_max'])+"',product_price_max_before_discount='"+str(data['product_price_max_before_discount'])+"',ratings='"+str(data['ratings'])+"'," \
                      "product_section='"+str(data['product_section'])+"', updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where product_page_url = '"+ str(res[3])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
                      "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
                      "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
                      "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
                      "product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \
                      "rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
                      "rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
                      "product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
                      "product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
                      "product_page_url='"+str(res[3])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

def rce_product_variant(self, data):
|
||||||
|
data['product_variant_name'] = data['product_variant_name'].replace("'","''")
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name = '"+str(data['product_variant_name'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" (rce_source_variant_id,rce_product_id," \
|
||||||
|
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock) values("+str(data['rce_source_variant_id'])+"," \
|
||||||
|
""+str(data['rce_product_id'])+",'"+str(data['product_variant_name'])+"','"+str(data['product_variant_price'])+"'," \
|
||||||
|
"'"+str(data['product_variant_price_before_discount'])+"',"+str(data['product_variant_stock'])+")"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
|
||||||
|
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(data['product_variant_name'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \
|
||||||
|
str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set updatedat=now() " \
|
||||||
|
"where product_variant_name = '"+ str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set rce_source_variant_id="+str(data['rce_source_variant_id'])+", " \
|
||||||
|
"rce_product_id="+str(data['rce_product_id'])+", product_variant_name='"+str(data['product_variant_name'])+"', product_variant_price=" \
|
||||||
|
"'"+str(data['product_variant_price'])+"',product_variant_price_before_discount='"+str(data['product_variant_price_before_discount'])+"'," \
|
||||||
|
"product_variant_stock="+str(data['product_variant_stock'])+", updatedat=now() where product_variant_name = '"+ str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
|
||||||
|
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def rce_brand(self, data):
|
||||||
|
data['brand_page_url'] = data['brand_page_url'].replace("'","''")
|
||||||
|
data['brand_name'] = data['brand_name'].replace("'","''")
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url = '"+str(data['brand_page_url'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" (rce_source_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name) values("+str(data['rce_source_id'])+"," \
|
||||||
|
""+str(data['rce_source_brand_status'])+",'"+str(data['brand_page_url'])+"','"+str(data['brand_page_url_hash'])+"'," \
|
||||||
|
"'"+str(data['brand_name'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(data['brand_page_url'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \
|
||||||
|
str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set updatedat=now() " \
|
||||||
|
"where brand_page_url = '"+ str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
|
||||||
|
"rce_source_brand_status="+str(data['rce_source_brand_status'])+", brand_page_url='"+str(data['brand_page_url'])+"', brand_page_url_hash=" \
|
||||||
|
"'"+str(data['brand_page_url_hash'])+"',brand_name='"+str(data['brand_name'])+"', updatedat=now() where brand_page_url = '"+ str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status, " \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_reseller(self, data):
|
||||||
|
data['reseller_name'] = data['reseller_name'].replace("'","''")
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" (rce_source_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_description) values("+str(data['rce_source_id'])+"," \
|
||||||
|
""+str(data['rce_source_reseller_status'])+",'"+str(data['reseller_name'])+"','"+str(data['reseller_average_rating'])+"'," \
|
||||||
|
"'"+str(data['reseller_description'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(data['reseller_name'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]) and \
|
||||||
|
str(data['reseller_average_rating'])==str(res[5]) and str(data['reseller_description'])==str(res[6]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=now() " \
|
||||||
|
"where reseller_name = '"+ str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
|
||||||
|
"rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \
|
||||||
|
"'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=now() where reseller_name = '"+ str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_reseller_store(self, data):
|
||||||
|
|
||||||
|
data['store_page_url'] = data['store_page_url'].replace("'","''")
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" (rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,rce_source_id) values(" \
|
||||||
|
""+str(data['rce_source_store_status'])+",'"+str(data['store_page_url'])+"','"+str(data['store_page_url_hash'])+"'," \
|
||||||
|
"'"+str(data['store_location'])+"', "+str(data['rce_reseller_id'])+", "+str(data['rce_source_id'])+")"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(data['store_page_url'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \
|
||||||
|
str(data['store_page_url_hash'])==str(res[4]) and str(data['store_location'])==str(res[5]) and \
|
||||||
|
str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=now() " \
|
||||||
|
"where store_page_url = '"+ str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \
|
||||||
|
"rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \
|
||||||
|
"'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \
|
||||||
|
"updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_ratings_reviews(self, data):
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id = "+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
data['username'] = data['username'].replace("'","''")
|
||||||
|
data['img_url'] = data['img_url'].replace("'","''")
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" (id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating) values("+str(data['id'])+","+str(data['rce_product_id'])+"," \
|
||||||
|
"'"+str(data['username'])+"','"+str(data['review'])+"','"+str(data['img_url'])+"',"+str(data['review_like_count'])+",'"+str(data['user_tier'])+"'," \
|
||||||
|
""+str(data['shop_id'])+", '"+str(data['video_url'])+"', '"+str(data['rating'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \
|
||||||
|
str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \
|
||||||
|
str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]):
|
||||||
|
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \
|
||||||
|
"username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \
|
||||||
|
"'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \
|
||||||
|
"shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=now() " \
|
||||||
|
"where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(res[1])+" and username ='"+str(data['username'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_ratings_reviews_productmodels(self,data):
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \
|
||||||
|
"values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
|
||||||
|
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=now() " \
|
                      "where rce_rating_id = "+ str(res[1])
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \
                      "from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0])
                #logging.info(sql)
                self.cur.execute(sql)
            else:
                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \
                      "updatedat=now() where rce_rating_id = "+ str(res[1])
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
                      "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
                      ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+""
                #logging.info(sql)
                self.cur.execute(sql)

    def rce_tags(self,data):

        sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'"
        self.cur.execute(sql)
        res = self.cur.fetchone()

        if not res:
            sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \
                  "values("+str(data['id'])+",'"+str(data['description'])+"')"
            #logging.info(sql)
            self.cur.execute(sql)

            sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
                  "createdat,updatedat) select id,description,createdat,updatedat from " \
                  ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
            #logging.info(sql)
            self.cur.execute(sql)

        else:
            if str(data['description'])==str(res[1]):
                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=now() " \
                      "where description = '"+ str(res[1])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \
                      "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
                #logging.info(sql)
                self.cur.execute(sql)
            else:
                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \
                      "updatedat=now() where description = '"+ str(res[1])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
                      "createdat,updatedat) select id,description,createdat,updatedat from " \
                      ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

    def rce_ratings_reviews_producttags(self,data):

        sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'"
        self.cur.execute(sql)
        res = self.cur.fetchone()

        if not res:
            sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \
                  "values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')"
            #logging.info(sql)
            self.cur.execute(sql)

            sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
                  "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
                  ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
            #logging.info(sql)
            self.cur.execute(sql)

        else:
            if str(data['rce_rating_id'])==str(res[1]):
                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=now() " \
                      "where rce_rating_id = '"+ str(res[1])+"'"
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \
                      "from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
                #logging.info(sql)
                self.cur.execute(sql)
            else:
                sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \
                      "updatedat=now() where rce_rating_id = "+ str(res[1])
                #logging.info(sql)
                self.cur.execute(sql)

                sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
                      "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
                      ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
                #logging.info(sql)
                self.cur.execute(sql)
|
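The insert-or-update methods above build every statement by string concatenation, which is easy to break with quotes in scraped text. A minimal sketch of the same rce_tags upsert using psycopg2 parameter binding; the function name rce_tags_parameterized is hypothetical, the cursor and config dict are assumed to be the same objects the class already holds, and only values are bound while schema/table names still come from the trusted config:

def rce_tags_parameterized(cur, config, data):
    # Sketch only: same upsert-with-audit flow as rce_tags, but values are passed as bind parameters.
    tags_tab = config.get('crawler_schema') + "." + config.get('review_tags_tab')
    aud_tab = config.get('crawler_schema') + ".aud_" + config.get('review_tags_tab')

    cur.execute("select id, description from " + tags_tab + " where description = %s",
                (data['description'],))
    res = cur.fetchone()

    if not res:
        cur.execute("insert into " + tags_tab + " (id, description) values (%s, %s)",
                    (data['id'], data['description']))
        cur.execute("insert into " + aud_tab + " (id, description, createdat, updatedat) "
                    "select id, description, createdat, updatedat from " + tags_tab +
                    " where description = %s", (data['description'],))
    else:
        cur.execute("update " + tags_tab + " set updatedat = now() where id = %s", (res[0],))
        cur.execute("update " + aud_tab + " a set updatedat = b.updatedat from " + tags_tab +
                    " b where a.id = b.id and b.id = %s", (res[0],))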
@ -0,0 +1,174 @@
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
import string
|
||||||
|
#from selenium import webdriver
|
||||||
|
import undetected_chromedriver as webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
import psycopg2
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
from amazon_db_writer import amazon_db_writer
|
||||||
|
from datetime import datetime
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
|
|
||||||
|
import ssl
|
||||||
|
ssl._create_default_https_context = ssl._create_unverified_context
|
||||||
|
|
||||||
|
class amazon_products_adhoc:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.crawler_name = self.config.get("crawler_name")
|
||||||
|
self.pattern = r'[' + string.punctuation + ']'
|
||||||
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
|
self.conn.autocommit = True
|
||||||
|
self.cur = self.conn.cursor()
|
||||||
|
sql = f"""select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where rce_source_id=66 and product_price_min= '' order by id desc"""
|
||||||
|
self.cur.execute(sql)
|
||||||
|
self.items = self.cur.fetchall()
|
||||||
|
self.db_writer = amazon_db_writer(config)
|
||||||
|
#self.display = Display(visible=0, size=(800, 600))
|
||||||
|
#self.display.start()
|
||||||
|
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
print("Closing connection.....")
|
||||||
|
self.conn.close()
|
||||||
|
#self.display.stop()
|
||||||
|
|
||||||
|
def start_processing(self):
|
||||||
|
op = webdriver.ChromeOptions()
|
||||||
|
op.add_argument('--no-sandbox')
|
||||||
|
op.add_argument('--disable-notifications')
|
||||||
|
op.add_argument("--lang=en-GB")
|
||||||
|
op.add_argument('--user-data-dir=/home/ec2-user/chrome_cache/')
|
||||||
|
driver=webdriver.Chrome(options=op)
|
||||||
|
count = 0
|
||||||
|
for item in self.items:
|
||||||
|
count += 1
|
||||||
|
try:
|
||||||
|
logging.info("============== Getting info for {}/{}: {} ================".format(str(count),str(len(self.items)),str(item)))
|
||||||
|
start = datetime.now()
|
||||||
|
|
||||||
|
driver.get(item[3])
|
||||||
|
self.product_info(driver, item)
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1 where product_page_url_hash='{item[4]}'
|
||||||
|
"""
|
||||||
|
self.cur.execute(sql)
|
||||||
|
end = datetime.now()
|
||||||
|
logging.info('Total time taken to fetch the product: {}'.format(str(end-start)))
|
||||||
|
time.sleep(5)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
driver.close()
|
||||||
|
|
||||||
|
def product_info(self, driver, item):
|
||||||
|
|
||||||
|
data_product = {}
|
||||||
|
|
||||||
|
data_product['rce_source_product_id'] = item[1]
|
||||||
|
data_product['rce_source_id'] = item[21]
|
||||||
|
data_product['rce_source_product_status'] = item[2]
|
||||||
|
data_product['product_page_url'] = item[3]
|
||||||
|
data_product['product_page_url_hash'] = item[4]
|
||||||
|
data_product['rce_category_id'] = item[5]
|
||||||
|
data_product['rce_brand_id'] = item[6]
|
||||||
|
data_product['rce_store_id'] = item[7]
|
||||||
|
data_product['rce_source_product_name'] = item[8]
|
||||||
|
data_product['product_images'] = item[9]
|
||||||
|
data_product['product_description'] = item[10]
|
||||||
|
data_product['product_sold_total'] = item[11]
|
||||||
|
data_product['product_sold'] = item[12]
|
||||||
|
data_product['product_price_min'] = item[13]
|
||||||
|
data_product['product_price_min_before_discount'] =item[14]
|
||||||
|
data_product['product_price_max'] = item[15]
|
||||||
|
data_product['product_price_max_before_discount'] = item[16]
|
||||||
|
data_product['ratings'] = item[17]
|
||||||
|
data_product['product_section'] = item[22]
|
||||||
|
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# data_product['product_price_min'] = (driver.find_element(By.CSS_SELECTOR, '#corePrice_desktop > div > table > tbody > tr:nth-child(2) > td.a-span12 > span.a-price.a-text-price.a-size-medium.apexPriceToPay > span:nth-child(2)').text).replace('AED', '')
|
||||||
|
# data_product['product_price_max'] = data_product['product_price_min']
|
||||||
|
#
|
||||||
|
# except:
|
||||||
|
#
|
||||||
|
# try:
|
||||||
|
# price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
|
||||||
|
# price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
|
||||||
|
#
|
||||||
|
# price = price_whole+"."+price_fraction
|
||||||
|
# data_product['product_price_min'] = price
|
||||||
|
# data_product['product_price_max'] = price
|
||||||
|
# except:
|
||||||
|
# try:
|
||||||
|
# data_product['product_price_min'] =(driver.find_element(By.CSS_SELECTOR, '#sns-base-price > div > span.a-price.a-text-price.a-size-medium.apexPriceToPay > span:nth-child(2)').text).replace('AED','')
|
||||||
|
# data_product['product_price_max'] = data_product['product_price_min']
|
||||||
|
# except:
|
||||||
|
# data_product['product_price_min'] = (driver.find_element(By.CSS_SELECTOR, '#sns-base-price').text).replace('AED','')
|
||||||
|
# data_product['product_price_max'] = data_product['product_price_min']
|
||||||
|
# pass
|
||||||
|
# pass
|
||||||
|
#
|
||||||
|
# pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_product['product_price_min'] = (driver.find_element(By.CSS_SELECTOR, '#sns-base-price').text).replace('AED', '')
|
||||||
|
data_product['product_price_max'] = data_product['product_price_min']
|
||||||
|
|
||||||
|
except:
|
||||||
|
price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
|
||||||
|
price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
|
||||||
|
|
||||||
|
price = price_whole+"."+price_fraction
|
||||||
|
data_product['product_price_min'] = price
|
||||||
|
data_product['product_price_max'] = price
|
||||||
|
pass
|
||||||
|
|
||||||
|
print("product_price_min: {}".format(data_product['product_price_min']))
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_product['product_price_min_before_discount'] = (driver.find_element(By.CSS_SELECTOR, '.a-text-price').text).replace('AED', '')
|
||||||
|
data_product['product_price_max_before_discount'] = data_product['product_price_min_before_discount']
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_product(data_product)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
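# Reviewer sketch (not part of the original file): the try/except chain above falls back from the
# '#sns-base-price' selector to the whole/fraction price spans. A hypothetical helper that makes the
# fallback order explicit; the selectors are taken from the code above, everything else is illustrative.
def extract_price(driver):
    from selenium.webdriver.common.by import By  # already imported at the top of this file
    try:
        # First selector used by the adhoc script above.
        return driver.find_element(By.CSS_SELECTOR, '#sns-base-price').text.replace('AED', '').strip()
    except Exception:
        pass
    try:
        # Fallback: assemble the price from the whole and fraction spans.
        whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
        fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
        return whole + "." + fraction
    except Exception:
        return ""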
config = {
|
||||||
|
"crawler_name": "raena_crawler_enginer_amazon",
|
||||||
|
"crawler_schema": "raena_spider_management",
|
||||||
|
"category_tab": "rce_category",
|
||||||
|
"tracker_tab": "crawler_tracker",
|
||||||
|
"product_tab": "rce_product",
|
||||||
|
"variant_tab": "rce_product_variant",
|
||||||
|
"brand_tab": "rce_brand",
|
||||||
|
"reseller_tab": "rce_reseller",
|
||||||
|
"reseller_store_tab": "rce_reseller_store",
|
||||||
|
"review_tab": "rce_ratings_reviews",
|
||||||
|
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
|
||||||
|
"review_producttags_tab": "rce_ratings_reviews_producttags",
|
||||||
|
"review_tags": "rce_tags",
|
||||||
|
"source_tab": "rce_source",
|
||||||
|
"product_per_category": "1000",
|
||||||
|
"source_category": "11043145",
|
||||||
|
"db_user": "dbadmin",
|
||||||
|
"db_pass": "5qCif6eyY3Kmg4z",
|
||||||
|
"database": "analytics",
|
||||||
|
"db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
|
||||||
|
"db_port": "5432",
|
||||||
|
"crawler_main": "1",
|
||||||
|
"crawler_slave_no": ""
|
||||||
|
}
|
||||||
|
|
||||||
|
amazon_products_adhoc = amazon_products_adhoc(config)
|
||||||
|
amazon_products_adhoc.start_processing()
|
|
@ -0,0 +1,516 @@
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
import string
|
||||||
|
import undetected_chromedriver as webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
import psycopg2
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
from amazon_db_writer import amazon_db_writer
|
||||||
|
from datetime import datetime
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
|
|
||||||
|
import ssl
|
||||||
|
ssl._create_default_https_context = ssl._create_unverified_context
|
||||||
|
|
||||||
|
class amazon_products:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.crawler_name = self.config.get("crawler_name")
|
||||||
|
self.pattern = r'[' + string.punctuation + ']'
|
||||||
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
|
self.conn.autocommit = True
|
||||||
|
self.cur = self.conn.cursor()
|
||||||
|
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Amazon'")
|
||||||
|
self.rce_source_id = self.cur.fetchone()[0]
|
||||||
|
self.cur.execute("select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='raena_crawler_enginer_amazon' and flag=0 order by id")
|
||||||
|
self.items = self.cur.fetchall()
|
||||||
|
self.db_writer = amazon_db_writer(config)
|
||||||
|
#self.display = Display(visible=0, size=(800, 600))
|
||||||
|
#self.display.start()
|
||||||
|
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
print("Closing connection.....")
|
||||||
|
self.conn.close()
|
||||||
|
#self.display.stop()
|
||||||
|
|
||||||
|
def start_processing(self):
|
||||||
|
count = 0
|
||||||
|
for item in self.items:
|
||||||
|
count += 1
|
||||||
|
try:
|
||||||
|
logging.info("============== Getting info for {}/{}: {} ================".format(str(count),str(len(self.items)),str(item)))
|
||||||
|
start = datetime.now()
|
||||||
|
self.get_product_info(item)
|
||||||
|
end = datetime.now()
|
||||||
|
logging.info('Total time taken to fetch the product: {}'.format(str(end-start)))
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
def reseller_info(self, driver):
|
||||||
|
try:
|
||||||
|
store_urls = []
|
||||||
|
try:
|
||||||
|
driver.find_element(By.CSS_SELECTOR, '.a-icon.a-icon-arrow.a-icon-small.arrow-icon').click()
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
offers = driver.find_elements(By.CSS_SELECTOR, '#aod-offer-soldBy')
|
||||||
|
|
||||||
|
for offer in offers:
|
||||||
|
try:
|
||||||
|
store_url = offer.find_element(By.CSS_SELECTOR, '.a-fixed-left-grid-col.a-col-right').find_element(By.TAG_NAME, 'a').get_attribute('href')
|
||||||
|
store_urls.append(store_url)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
except:
|
||||||
|
try:
|
||||||
|
store_url = driver.find_element(By.CSS_SELECTOR, '#sellerProfileTriggerId').get_attribute('href')
|
||||||
|
store_urls.append(store_url)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
pass
|
||||||
|
|
||||||
|
if store_urls:
|
||||||
|
|
||||||
|
store_urls = list(set(store_urls))
|
||||||
|
|
||||||
|
return_item = ""
|
||||||
|
flag = 0
|
||||||
|
|
||||||
|
for store_url in store_urls:
|
||||||
|
driver.get(store_url)
|
||||||
|
driver.implicitly_wait(5)
|
||||||
|
|
||||||
|
##### reseller info
|
||||||
|
|
||||||
|
data_reseller = {}
|
||||||
|
data_reseller['rce_source_id'] = self.rce_source_id
|
||||||
|
data_reseller['rce_source_reseller_status'] = 1
|
||||||
|
data_reseller['reseller_name'] = ""
|
||||||
|
data_reseller['reseller_average_rating'] = 0.0
|
||||||
|
data_reseller['reseller_description'] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_reseller['reseller_name'] = driver.find_element(By.CSS_SELECTOR,'#seller-name').text
|
||||||
|
data_reseller['reseller_name'] = data_reseller['reseller_name'].replace("'","")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_reseller['reseller_average_rating'] = float(driver.find_element(By.CSS_SELECTOR,'#effective-timeperiod-rating-year-description').text)
|
||||||
|
except:
|
||||||
|
try:
|
||||||
|
data_reseller['reseller_average_rating'] = float(driver.find_element(By.CSS_SELECTOR,'#effective-timeperiod-rating-year-description').text)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_reseller['reseller_description'] = driver.find_element(By.CSS_SELECTOR, '#spp-expander-about-seller .a-row').text
|
||||||
|
data_reseller['reseller_description'] = data_reseller['reseller_description'].replace("'","")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_reseller(data_reseller)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
##### Store info
|
||||||
|
|
||||||
|
data_reseller_store = {}
|
||||||
|
data_reseller_store['rce_source_store_status'] = 1
|
||||||
|
data_reseller_store['store_page_url'] = store_url
|
||||||
|
data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
data_reseller_store['store_location'] = ""
|
||||||
|
data_reseller_store['rce_reseller_id'] = ""
|
||||||
|
data_reseller_store['rce_source_id'] = self.rce_source_id
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data_reseller['reseller_name'])+"'")
|
||||||
|
rce_reseller_id = self.cur.fetchone()
|
||||||
|
data_reseller_store['rce_reseller_id'] = rce_reseller_id[0]
|
||||||
|
if flag == 0:
|
||||||
|
return_item = data_reseller_store['rce_reseller_id']
|
||||||
|
flag = 1
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_reseller_store(data_reseller_store)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
time.sleep(2)
|
||||||
|
else:
|
||||||
|
|
||||||
|
##### reseller info
|
||||||
|
|
||||||
|
data_reseller = {}
|
||||||
|
data_reseller['rce_source_id'] = self.rce_source_id
|
||||||
|
data_reseller['rce_source_reseller_status'] = 1
|
||||||
|
data_reseller['reseller_name'] = "Amazon.ae"
|
||||||
|
data_reseller['reseller_average_rating'] = 0.0
|
||||||
|
data_reseller['reseller_description'] = ""
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_reseller(data_reseller)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
##### Store info
|
||||||
|
|
||||||
|
data_reseller_store = {}
|
||||||
|
data_reseller_store['rce_source_store_status'] = 1
|
||||||
|
data_reseller_store['store_page_url'] = "amazon.ae"
|
||||||
|
data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
data_reseller_store['store_location'] = ""
|
||||||
|
data_reseller_store['rce_reseller_id'] = ""
|
||||||
|
data_reseller_store['rce_source_id'] = self.rce_source_id
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data_reseller['reseller_name'])+"'")
|
||||||
|
rce_reseller_id = self.cur.fetchone()
|
||||||
|
data_reseller_store['rce_reseller_id'] = rce_reseller_id[0]
|
||||||
|
return_item = data_reseller_store['rce_reseller_id']
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_reseller_store(data_reseller_store)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
return return_item
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
def brand_info(self, driver):
|
||||||
|
data_brand = {}
|
||||||
|
|
||||||
|
data_brand['rce_source_id'] = self.rce_source_id
|
||||||
|
data_brand['rce_source_brand_status'] = 1
|
||||||
|
data_brand['brand_page_url'] = ""
|
||||||
|
data_brand['brand_page_url_hash'] = ""
|
||||||
|
data_brand['brand_name'] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_brand['brand_page_url'] = driver.find_element(By.CSS_SELECTOR, '#bylineInfo').get_attribute('href')
|
||||||
|
data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_brand['brand_name'] = driver.find_element(By.CSS_SELECTOR, '.po-brand .po-break-word').text
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_brand(data_brand)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
return data_brand['brand_name']
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def product_info(self, driver, category, keyword, url, url_hash, brand_name, rce_reseller_id):
|
||||||
|
data_product = {}
|
||||||
|
|
||||||
|
data_product['rce_source_product_id'] = 0
|
||||||
|
data_product['rce_source_id'] = self.rce_source_id
|
||||||
|
data_product['rce_source_product_status'] = 1
|
||||||
|
data_product['product_page_url'] = url.replace("'","''")
|
||||||
|
data_product['product_page_url_hash'] = url_hash
|
||||||
|
data_product['rce_category_id'] = category
|
||||||
|
data_product['rce_brand_id'] = ""
|
||||||
|
data_product['rce_store_id'] = ""
|
||||||
|
data_product['rce_source_product_name'] = ""
|
||||||
|
data_product['product_images'] = ""
|
||||||
|
data_product['product_description'] = ""
|
||||||
|
data_product['product_sold_total'] = 0
|
||||||
|
data_product['product_sold'] = 0
|
||||||
|
data_product['product_price_min'] = ""
|
||||||
|
data_product['product_price_min_before_discount'] =""
|
||||||
|
data_product['product_price_max'] = ""
|
||||||
|
data_product['product_price_max_before_discount'] = ""
|
||||||
|
data_product['ratings'] = 0.0
|
||||||
|
data_product['product_section'] = keyword
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_name = '"+str(brand_name)+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_product['rce_brand_id'] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+""
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_product['rce_store_id'] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
rce_source_product_name = driver.find_element(By.CSS_SELECTOR,'#productTitle').text
|
||||||
|
data_product['rce_source_product_name'] = str(re.sub(self.pattern, '', rce_source_product_name)).replace("'","''")
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
product_images_element = driver.find_element(By.CSS_SELECTOR, '#magnifierLens')
|
||||||
|
product_images_raw = product_images_element.find_elements(By.TAG_NAME, 'img')
|
||||||
|
|
||||||
|
product_images = []
|
||||||
|
for product_image in product_images_raw:
|
||||||
|
url = product_image.get_attribute('src')
|
||||||
|
product_images.append(url)
|
||||||
|
|
||||||
|
data_product['product_images'] = str(product_images)
|
||||||
|
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
description = ""
|
||||||
|
des_rank = ""
|
||||||
|
try:
|
||||||
|
des_raws = driver.find_element(By.CSS_SELECTOR, '.a-unordered-list.a-vertical.a-spacing-mini').find_elements(By.CSS_SELECTOR, '.a-list-item')
|
||||||
|
|
||||||
|
for des_raw in des_raws:
|
||||||
|
try:
|
||||||
|
des = des_raw.text
|
||||||
|
description += des
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
des_rank = driver.find_element(By.XPATH, '/html/body/div[2]/div/div[6]/div[24]/div/ul[1]').find_element(By.CSS_SELECTOR, '.a-list-item').text
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
data_product['product_description'] = description+des_rank
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
|
||||||
|
price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
|
||||||
|
|
||||||
|
price = price_whole+"."+price_fraction
|
||||||
|
|
||||||
|
data_product['product_price_min'] = price
|
||||||
|
data_product['product_price_max'] = price
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
d_price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
|
||||||
|
d_price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
|
||||||
|
|
||||||
|
price = d_price_whole+"."+d_price_fraction
|
||||||
|
|
||||||
|
data_product['product_price_min'] = price
|
||||||
|
data_product['product_price_max'] = price
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_product['product_price_min_before_discount'] = (driver.find_element(By.CSS_SELECTOR, '.a-text-price').text).replace('AED', '')
|
||||||
|
data_product['product_price_max_before_discount'] = data_product['product_price_min_before_discount']
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_product['ratings'] = driver.find_element(By.CSS_SELECTOR, '#averageCustomerReviews .a-color-base').text
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_product(data_product)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
### rce_product_variant
|
||||||
|
try:
|
||||||
|
is_variant = driver.find_element(By.CSS_SELECTOR, '.a-unordered-list.a-nostyle.a-button-list.a-declarative.a-button-toggle-group.a-horizontal.a-spacing-top-micro.swatches.swatchesSquare.imageSwatches')
|
||||||
|
if is_variant:
|
||||||
|
variants = is_variant.find_elements(By.TAG_NAME, 'li')
|
||||||
|
#random.shuffle(variants)
|
||||||
|
|
||||||
|
for variant in variants:
|
||||||
|
variant.click()
|
||||||
|
data_variant = {}
|
||||||
|
|
||||||
|
data_variant['rce_source_variant_id'] = 0
|
||||||
|
data_variant['rce_product_id'] = ""
|
||||||
|
data_variant['product_variant_name'] = ""
|
||||||
|
data_variant['product_variant_price'] = ""
|
||||||
|
data_variant['product_variant_price_before_discount'] = ""
|
||||||
|
data_variant['product_variant_stock'] = 0
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_name = '"+str(data_product['rce_source_product_name'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_variant['rce_product_id'] = self.cur.fetchone()[0]
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
product_variant_name = driver.find_element(By.CSS_SELECTOR,'#productTitle').text
|
||||||
|
data_variant['product_variant_name'] = str(re.sub(self.pattern, '', product_variant_name)).replace("'","''")
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
d_price_whole = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-whole').text
|
||||||
|
d_price_fraction = driver.find_element(By.CSS_SELECTOR, '.reinventPricePriceToPayMargin .a-price-fraction').text
|
||||||
|
|
||||||
|
price = d_price_whole+"."+d_price_fraction
|
||||||
|
|
||||||
|
data_variant['product_variant_price'] = price
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_variant['product_variant_price_before_discount'] = (driver.find_element(By.CSS_SELECTOR, '.a-text-price').text).replace('AED', '')
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_product_variant(data_variant)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
time.sleep(random.randint(2,5))
|
||||||
|
|
||||||
|
else:
|
||||||
|
logging.info('No variant found')
|
||||||
|
except:
|
||||||
|
logging.info('No variant found')
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def rating_info(self, driver, rce_reseller_id, url_hash):
|
||||||
|
|
||||||
|
try:
|
||||||
|
driver.find_element(By.CSS_SELECTOR, '#reviews-medley-footer .a-link-emphasis').click()
|
||||||
|
driver.implicitly_wait(5)
|
||||||
|
|
||||||
|
data_reviews = driver.find_elements(By.CSS_SELECTOR, '.a-section.review.aok-relative')
|
||||||
|
|
||||||
|
|
||||||
|
for data in data_reviews:
|
||||||
|
|
||||||
|
data_review = {}
|
||||||
|
|
||||||
|
data_review["id"] = ""
|
||||||
|
data_review["rce_product_id"] = ""
|
||||||
|
data_review["username"] = ""
|
||||||
|
data_review["review"] = ""
|
||||||
|
data_review["img_url"] = ""
|
||||||
|
data_review["review_like_count"] = 0
|
||||||
|
data_review["user_tier"] = ""
|
||||||
|
data_review["shop_id"] = 0
|
||||||
|
data_review["video_url"] = ""
|
||||||
|
data_review["rating"] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')
|
||||||
|
self.cur.execute(sql)
|
||||||
|
rating_id = self.cur.fetchone()
|
||||||
|
|
||||||
|
if rating_id[0] is None:
|
||||||
|
rating_id = 1
|
||||||
|
else:
|
||||||
|
rating_id = int(rating_id[0]) + 1
|
||||||
|
|
||||||
|
data_review["id"] = rating_id
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url_hash = '"+str(url_hash)+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_review["rce_product_id"] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_review["username"] = data.find_element(By.CSS_SELECTOR, '.a-profile-name').text
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_review["review"] = data.find_element(By.CSS_SELECTOR, '.a-size-base.review-text.review-text-content').text
|
||||||
|
data_review["review"] = data_review["review"].replace("'","")
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
rating = data.find_element(By.CSS_SELECTOR, '.a-icon.a-icon-star.review-rating .a-icon-alt').get_attribute("textContent")
|
||||||
|
data_review["rating"] = rating.replace(' out of 5 stars', '')
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+""
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_review["shop_id"] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_ratings_reviews(data_review)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_product_info(self,item):
|
||||||
|
try:
|
||||||
|
op = webdriver.ChromeOptions()
|
||||||
|
op.add_argument('--no-sandbox')
|
||||||
|
op.add_argument('--disable-notifications')
|
||||||
|
op.add_argument("--lang=en-GB")
|
||||||
|
op.add_argument('--user-data-dir=/home/ec2-user/chrome_cache/')
|
||||||
|
#op.headless = True
|
||||||
|
driver=webdriver.Chrome(options=op)
|
||||||
|
|
||||||
|
try:
|
||||||
|
driver.get('https://www.amazon.ae')
|
||||||
|
time.sleep(3)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
##### Reseller info #####
|
||||||
|
driver.get(item[4])
|
||||||
|
driver.implicitly_wait(5)
|
||||||
|
rce_reseller_id = self.reseller_info(driver)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
##### Product Info #####
|
||||||
|
driver.get(item[4])
|
||||||
|
driver.implicitly_wait(5)
|
||||||
|
##### Brand Info
|
||||||
|
brand_name = self.brand_info(driver)
|
||||||
|
##### Product info
|
||||||
|
self.product_info(driver, item[2], item[3], item[4], item[5], brand_name, rce_reseller_id)
|
||||||
|
|
||||||
|
|
||||||
|
##### Rating Info #####
|
||||||
|
driver.get(item[4])
|
||||||
|
driver.implicitly_wait(5)
|
||||||
|
self.rating_info(driver, rce_reseller_id, item[5])
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1 where product_page_url_hash='{item[5]}'
|
||||||
|
"""
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
driver.close()
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
driver.close()
|
||||||
|
|
|
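The amazon_products class above relies on implicitly_wait and fixed sleeps before reading elements. A hedged sketch of an explicit wait for the same product title selector, using Selenium's WebDriverWait; the helper name and timeout are illustrative, only the '#productTitle' selector is taken from the code above:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_title(driver, timeout=10):
    # Block until the product title is present instead of sleeping a fixed number of seconds.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '#productTitle'))
    )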
@ -0,0 +1,25 @@
{
    "crawler_name": "raena_crawler_enginer_amazon",
    "crawler_schema": "raena_spider_management",
    "category_tab": "rce_category",
    "tracker_tab": "crawler_tracker",
    "product_tab": "rce_product",
    "variant_tab": "rce_product_variant",
    "brand_tab": "rce_brand",
    "reseller_tab": "rce_reseller",
    "reseller_store_tab": "rce_reseller_store",
    "review_tab": "rce_ratings_reviews",
    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
    "review_producttags_tab": "rce_ratings_reviews_producttags",
    "review_tags": "rce_tags",
    "source_tab": "rce_source",
    "product_per_category": "1000",
    "source_category": "11043145",
    "db_user": "dbadmin",
    "db_pass": "5qCif6eyY3Kmg4z",
    "database": "analytics",
    "db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
    "db_port": "5432",
    "crawler_main": "1",
    "crawler_slave_no": ""
}
|
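The block above is a standalone copy of the crawler configuration that the scripts also hard-code as a Python dict. A minimal sketch of loading it from disk instead; the filename amazon_crawler_config.json is an assumption, not taken from the repository:

import json

def load_config(path="amazon_crawler_config.json"):  # filename is assumed
    with open(path) as f:
        return json.load(f)

# config = load_config()
# crawler = amazon_products(config)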
@ -0,0 +1,44 @@

from selenium import webdriver
from selenium.webdriver.common.by import By
import time

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

op = webdriver.ChromeOptions()
op.add_argument('--no-sandbox')
op.add_argument('--disable-notifications')
op.add_argument("--lang=en-GB")
#op.headless = True
driver = webdriver.Chrome(options=op)


driver.get('https://www.noon.com/uae-en/beauty/')

time.sleep(10)

element = driver.find_element(By.CSS_SELECTOR, '.componentArea-9')

title = element.find_element(By.CSS_SELECTOR, '.truncate-title-header').text
products = element.find_elements(By.CSS_SELECTOR, '.sc-kCMKrZ.ealOXE')

urls = []
for product in products:
    url = product.find_element(By.TAG_NAME, 'a').get_attribute('href')
    urls.append(url)

data = {
    "title": title,
    "products": urls
}

print(data)

driver.close()
|
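The styled-components class names used above (for example .sc-kCMKrZ.ealOXE) are generated and tend to change between deployments, so cards without the expected link will raise. A small hedged sketch that collects the same data while skipping such cards; collect_section is a hypothetical helper, the selectors are reused from the snippet above:

from selenium.webdriver.common.by import By

def collect_section(element):
    # element: a carousel container located as in the snippet above
    title = element.find_element(By.CSS_SELECTOR, '.truncate-title-header').text
    urls = []
    for card in element.find_elements(By.CSS_SELECTOR, '.sc-kCMKrZ.ealOXE'):
        try:
            urls.append(card.find_element(By.TAG_NAME, 'a').get_attribute('href'))
        except Exception:
            continue  # skip cards without a link
    return {"title": title, "products": urls}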
@ -0,0 +1,83 @@
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import string
|
||||||
|
import undetected_chromedriver as webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import psycopg2
|
||||||
|
import bs4
|
||||||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
import random
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import gzip
|
||||||
|
import re
|
||||||
|
import random
|
||||||
|
from amazon_db_writer import amazon_db_writer
|
||||||
|
|
||||||
|
import ssl
|
||||||
|
ssl._create_default_https_context = ssl._create_unverified_context
|
||||||
|
|
||||||
|
|
||||||
|
def reseller_info(store_url):
|
||||||
|
|
||||||
|
op = webdriver.ChromeOptions()
|
||||||
|
op.add_argument('--no-sandbox')
|
||||||
|
op.add_argument('--disable-notifications')
|
||||||
|
op.add_argument("--lang=en-GB")
|
||||||
|
#op.headless = True
|
||||||
|
driver=webdriver.Chrome( options=op)
|
||||||
|
|
||||||
|
driver.get(store_url)
|
||||||
|
|
||||||
|
driver.implicitly_wait(5)
|
||||||
|
|
||||||
|
try:
|
||||||
|
driver.get(store_url)
|
||||||
|
driver.implicitly_wait(5)
|
||||||
|
|
||||||
|
##### reseller info
|
||||||
|
|
||||||
|
avg_rating = driver.find_element(By.CSS_SELECTOR,'#effective-timeperiod-rating-year-description.ratings-reviews').text
|
||||||
|
|
||||||
|
print(avg_rating)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"crawler_name": "raena_crawler_enginer_amazon",
|
||||||
|
"crawler_schema": "raena_spider_management",
|
||||||
|
"category_tab": "rce_category",
|
||||||
|
"tracker_tab": "crawler_tracker",
|
||||||
|
"product_tab": "rce_product",
|
||||||
|
"variant_tab": "rce_product_variant",
|
||||||
|
"brand_tab": "rce_brand",
|
||||||
|
"reseller_tab": "rce_reseller",
|
||||||
|
"reseller_store_tab": "rce_reseller_store",
|
||||||
|
"review_tab": "rce_ratings_reviews",
|
||||||
|
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
|
||||||
|
"review_producttags_tab": "rce_ratings_reviews_producttags",
|
||||||
|
"review_tags": "rce_tags",
|
||||||
|
"source_tab": "rce_source",
|
||||||
|
"product_per_category": "1000",
|
||||||
|
"source_category": "11043145",
|
||||||
|
"db_user": "postgres",
|
||||||
|
"db_pass": "postgres",
|
||||||
|
"database": "postgres",
|
||||||
|
"db_host": "localhost",
|
||||||
|
"db_port": "5444",
|
||||||
|
"crawler_main": "1",
|
||||||
|
"crawler_slave_no": ""
|
||||||
|
}
|
||||||
|
conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
|
||||||
|
conn.autocommit = True
|
||||||
|
cur = conn.cursor()
|
||||||
|
db_writer = amazon_db_writer(config)
|
||||||
|
|
||||||
|
|
||||||
|
reseller_info('https://www.amazon.ae/sp?ie=UTF8&seller=A3TFGX22P341AN&isAmazonFulfilled=0&asin=B09BR31PF9&ref_=olp_merch_name_1')
|
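# Reviewer sketch (not part of the original file): reseller_info above opens a Chrome session but never
# closes it. A hedged variant with an explicit teardown; it reuses the undetected_chromedriver and By
# imports at the top of this file, and the selector from the function above, nothing else is assumed.
def reseller_info_with_cleanup(store_url):
    op = webdriver.ChromeOptions()
    op.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=op)
    try:
        driver.get(store_url)
        driver.implicitly_wait(5)
        print(driver.find_element(By.CSS_SELECTOR, '#effective-timeperiod-rating-year-description.ratings-reviews').text)
    finally:
        driver.quit()  # always release the browser, even if the selector is not found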
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,77 @@
|
||||||
|
import hashlib
|
||||||
|
from amazon_db_writer import amazon_db_writer
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"crawler_name": "raena_crawler_enginer_amazon",
|
||||||
|
"crawler_schema": "raena_spider_management",
|
||||||
|
"category_tab": "rce_category",
|
||||||
|
"tracker_tab": "crawler_tracker",
|
||||||
|
"product_tab": "rce_product",
|
||||||
|
"variant_tab": "rce_product_variant",
|
||||||
|
"brand_tab": "rce_brand",
|
||||||
|
"reseller_tab": "rce_reseller",
|
||||||
|
"reseller_store_tab": "rce_reseller_store",
|
||||||
|
"review_tab": "rce_ratings_reviews",
|
||||||
|
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
|
||||||
|
"review_producttags_tab": "rce_ratings_reviews_producttags",
|
||||||
|
"review_tags": "rce_tags",
|
||||||
|
"source_tab": "rce_source",
|
||||||
|
"product_per_category": "1000",
|
||||||
|
"source_category": "11043145",
|
||||||
|
"db_user": "postgres",
|
||||||
|
"db_pass": "postgres",
|
||||||
|
"database": "postgres",
|
||||||
|
"db_host": "localhost",
|
||||||
|
"db_port": "5444",
|
||||||
|
"crawler_main": "1",
|
||||||
|
"crawler_slave_no": ""
|
||||||
|
}
|
||||||
|
|
||||||
|
db_writer = amazon_db_writer(config)
|
||||||
|
|
||||||
|
data_product = {}
|
||||||
|
|
||||||
|
data_product['rce_source_product_id'] = 0
|
||||||
|
data_product['rce_source_id'] = 1
|
||||||
|
data_product['rce_source_product_status'] = 1
|
||||||
|
data_product['product_page_url'] = 'https://www.amazon.ae/Davidoff-Water-Perfume-Toilette-110ML/dp/B002S8PT8U/?_encoding=UTF8&pd_rd_w=VQ6dh&content-id=amzn1.sym.baa1fbbd-9373-444b-8104-61fa134741c5%3Aamzn1.symc.36bd837a-d66d-47d1-8457-ffe9a9f3ddab&pf_rd_p=baa1fbbd-9373-444b-8104-61fa134741c5&pf_rd_r=6EKKA9QC40Y5MFKGRWYQ&pd_rd_wg=nsmjm&pd_rd_r=6d02ccd2-297c-4b73-8586-a9ac9b355d4a&ref_=pd_gw_ci_mcx_mr_hp_atf_m'
|
||||||
|
data_product['product_page_url_hash'] = 'bjhgfds867ty3iuhbfew'
|
||||||
|
data_product['rce_category_id'] = 3
|
||||||
|
data_product['rce_brand_id'] = 2
|
||||||
|
data_product['rce_store_id'] = 6
|
||||||
|
data_product['rce_source_product_name'] = "Hot Water by Davidoff for Men"
|
||||||
|
data_product['product_images'] = ""
|
||||||
|
data_product['product_description'] = "Davidoff Hot Water hits you first with it’s fresh spicy aroma owing to the vegetal top notes of wormwood and basil. While the o"
|
||||||
|
data_product['product_sold_total'] = 0
|
||||||
|
data_product['product_sold'] = 0
|
||||||
|
data_product['product_price_min'] = "99.00"
|
||||||
|
data_product['product_price_min_before_discount'] ="340.00"
|
||||||
|
data_product['product_price_max'] = "99.00"
|
||||||
|
data_product['product_price_max_before_discount'] = "340.00"
|
||||||
|
data_product['ratings'] = 4.1
|
||||||
|
data_product['product_section'] = "Fragrance"
|
||||||
|
|
||||||
|
data_variant = {}
|
||||||
|
|
||||||
|
data_variant['rce_source_variant_id'] = 0
|
||||||
|
data_variant['rce_product_id'] = 2
|
||||||
|
data_variant['product_variant_name'] = "abc"
|
||||||
|
data_variant['product_variant_price'] = "67.3"
|
||||||
|
data_variant['product_variant_price_before_discount'] = "100.90"
|
||||||
|
data_variant['product_variant_stock'] = 0
|
||||||
|
|
||||||
|
|
||||||
|
data_review = {}
|
||||||
|
|
||||||
|
data_review["id"] = 1
|
||||||
|
data_review["rce_product_id"] = 5
|
||||||
|
data_review["username"] = "adnan"
|
||||||
|
data_review["review"] = "very good product"
|
||||||
|
data_review["img_url"] = ""
|
||||||
|
data_review["review_like_count"] = 0
|
||||||
|
data_review["user_tier"] = ""
|
||||||
|
data_review["shop_id"] = 2
|
||||||
|
data_review["video_url"] = ""
|
||||||
|
data_review["rating"] = "4.9"
|
||||||
|
|
||||||
|
db_writer.rce_ratings_reviews(data_review)
|
|
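In the smoke test above, data_product and data_variant are built but only the review is written. If the other writers were exercised as well, the calls would presumably look like the following (method names taken from the crawler code earlier in this commit):

# db_writer.rce_product(data_product)
# db_writer.rce_product_variant(data_variant)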
@ -0,0 +1,9 @@
1. Log into Facebook and go to the group from which you want to export the members.

2. Navigate to the "Members" tab.

3. Open the Developer console in Chrome and paste the code from "chrome_group_export".

4. Paste the code from "chrome_auto_scroll" so the page auto-scrolls.

5. Download and save the file once the limit (10K members) is reached.
@ -0,0 +1,37 @@
(function() {
    var intervalObj = null;
    var retry = 0;
    var clickHandler = function() {
        console.log("Clicked; stopping autoscroll");
        clearInterval(intervalObj);
        document.body.removeEventListener("click", clickHandler);
    }
    function scrollDown() {
        var scrollHeight = document.body.scrollHeight,
            scrollTop = document.body.scrollTop,
            innerHeight = window.innerHeight,
            difference = (scrollHeight - scrollTop) - innerHeight

        if (difference > 0) {
            window.scrollBy(0, difference);
            if (retry > 0) {
                retry = 0;
            }
            console.log("scrolling down more");
        } else {
            if (retry >= 3) {
                console.log("reached bottom of page; stopping");
                clearInterval(intervalObj);
                document.body.removeEventListener("click", clickHandler);
            } else {
                console.log("[apparently] hit bottom of page; retrying: " + (retry + 1));
                retry++;
            }
        }
    }

    document.body.addEventListener("click", clickHandler);

    intervalObj = setInterval(scrollDown, 1000);

})()
@ -0,0 +1 @@
|
||||||
|
function exportToCsv(e,t){for(var n="",o=0;o<t.length;o++)n+=function(e){for(var t="",n=0;n<e.length;n++){var o=null===e[n]||void 0===e[n]?"":e[n].toString(),o=(o=e[n]instanceof Date?e[n].toLocaleString():o).replace(/"/g,'""');0<n&&(t+=","),t+=o=0<=o.search(/("|,|\n)/g)?'"'+o+'"':o}return t+"\n"}(t[o]);var r=new Blob([n],{type:"text/csv;charset=utf-8;"}),i=document.createElement("a");void 0!==i.download&&(r=URL.createObjectURL(r),i.setAttribute("href",r),i.setAttribute("download",e),document.body.appendChild(i),i.click(),document.body.removeChild(i))}function buildCTABtn(){var e=document.createElement("div"),t=(e.setAttribute("style",["position: fixed;","top: 0;","left: 0;","z-index: 10;","width: 100%;","height: 100%;","pointer-events: none;"].join("")),document.createElement("div")),n=(t.setAttribute("style",["position: absolute;","bottom: 30px;","right: 130px;","color: white;","min-width: 150px;","background: var(--primary-button-background);","border-radius: var(--button-corner-radius);","padding: 0px 12px;","cursor: pointer;","font-weight:600;","font-size:15px;","display: inline-flex;","pointer-events: auto;","height: 36px;","align-items: center;","justify-content: center;"].join("")),document.createTextNode("Download ")),o=document.createElement("span"),r=(o.setAttribute("id","fb-group-scraper-number-tracker"),o.textContent="0",document.createTextNode(" members"));return t.appendChild(n),t.appendChild(o),t.appendChild(r),t.addEventListener("click",function(){var e=(new Date).toISOString();exportToCsv("groupMemberExport-".concat(e,".csv"),window.members_list)}),e.appendChild(t),document.body.appendChild(e),e}function processResponse(e){var t;if(null!==(n=null==e?void 0:e.data)&&void 0!==n&&n.group)o=e.data.group;else{if("Group"!==(null===(n=null===(n=null==e?void 0:e.data)||void 0===n?void 0:n.node)||void 0===n?void 0:n.__typename))return;o=e.data.node}if(null!==(n=null==o?void 0:o.new_members)&&void 0!==n&&n.edges)t=o.new_members.edges;else{if(null===(e=null==o?void 0:o.new_forum_members)||void 0===e||!e.edges)return;t=o.new_forum_members.edges}var n=t.map(function(e){var t=e.node,n=t.id,o=t.name,r=t.bio_text,i=t.url,d=t.profile_picture,t=t.__isProfile,s=(null===(s=null==e?void 0:e.join_status_text)||void 0===s?void 0:s.text)||(null===(s=null===(s=null==e?void 0:e.membership)||void 0===s?void 0:s.join_status_text)||void 0===s?void 0:s.text),e=null===(e=e.node.group_membership)||void 0===e?void 0:e.associated_group.id;return[n,o,i,(null==r?void 0:r.text)||"",(null==d?void 0:d.uri)||"",e,s||"",t]}),o=((e=window.members_list).push.apply(e,n),document.getElementById("fb-group-scraper-number-tracker"));o&&(o.textContent=window.members_list.length.toString())}function parseResponse(e){var n=[];try{n.push(JSON.parse(e))}catch(t){var o=e.split("\n");if(o.length<=1)return void console.error("Fail to parse API response",t);for(var r=0;r<o.length;r++){var i=o[r];try{n.push(JSON.parse(i))}catch(e){console.error("Fail to parse API response",t)}}}for(var t=0;t<n.length;t++)processResponse(n[t])}function main(){buildCTABtn();var e=XMLHttpRequest.prototype.send;XMLHttpRequest.prototype.send=function(){this.addEventListener("readystatechange",function(){this.responseURL.includes("/api/graphql/")&&4===this.readyState&&parseResponse(this.responseText)},!1),e.apply(this,arguments)}}window.members_list=window.members_list||[["ProfileId","FulName","ProfileLink","Bio","ImageSrc","GroupId","GroupJoining","ProfileType"]],main();
|
|
@ -0,0 +1,25 @@
{
    "crawler_name": "raena_crawler_enginer_noon",
    "crawler_schema": "raena_spider_management",
    "category_tab": "rce_category",
    "tracker_tab": "crawler_tracker_noon",
    "product_tab": "rce_product",
    "variant_tab": "rce_product_variant",
    "brand_tab": "rce_brand",
    "reseller_tab": "rce_reseller",
    "reseller_store_tab": "rce_reseller_store",
    "review_tab": "rce_ratings_reviews",
    "review_productmodels_tab": "rce_ratings_reviews_productmodels",
    "review_producttags_tab": "rce_ratings_reviews_producttags",
    "review_tags": "rce_tags",
    "source_tab": "rce_source",
    "product_per_category": "1000",
    "source_category": "11043145",
    "db_user": "dbadmin",
    "db_pass": "5qCif6eyY3Kmg4z",
    "database": "analytics",
    "db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
    "db_port": "5432",
    "crawler_main": "1",
    "crawler_slave_no": ""
}
|
@ -0,0 +1,194 @@
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import undetected_chromedriver as webdriver
|
||||||
|
import psycopg2
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
|
|
||||||
|
from amazon_db_writer import amazon_db_writer
|
||||||
|
import ssl
|
||||||
|
ssl._create_default_https_context = ssl._create_unverified_context
|
||||||
|
|
||||||
|
|
||||||
|
class amazon_categories:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.crawler_name = self.config.get("crawler_name")
|
||||||
|
self.url = "https://www.amazon.ae/s?rh=n%3A11497859031&ref=lp_11497860031_sar"
|
||||||
|
self.product_limit = int(self.config.get("product_per_category"))
|
||||||
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
|
self.conn.autocommit = True
|
||||||
|
self.cur = self.conn.cursor()
|
||||||
|
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Amazon'")
|
||||||
|
try : self.rce_source_id = self.cur.fetchone()[0]
|
||||||
|
except:
|
||||||
|
logging.info("Source tab is empty. Please check. Exiting.....")
|
||||||
|
exit(1)
|
||||||
|
self.db_writer = amazon_db_writer(config)
|
||||||
|
|
||||||
|
#self.display = Display(visible=0, size=(800, 600))
|
||||||
|
#self.display.start()
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
print("Closing connection.....")
|
||||||
|
self.conn.close()
|
||||||
|
#self.display.stop()
|
||||||
|
|
||||||
|
def start_processing(self):
|
||||||
|
op = webdriver.ChromeOptions()
|
||||||
|
op.add_argument('--no-sandbox')
|
||||||
|
op.add_argument('--disable-notifications')
|
||||||
|
op.add_argument("--lang=en-GB")
|
||||||
|
#op.headless = True
|
||||||
|
#driver=webdriver.Chrome(version_main = 113, options=op)
|
||||||
|
driver=webdriver.Chrome(options=op)
|
||||||
|
|
||||||
|
driver.get(self.url)
|
||||||
|
|
||||||
|
driver.implicitly_wait(10)
|
||||||
|
|
||||||
|
self.get_categories(driver)
|
||||||
|
|
||||||
|
driver.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_categories(self, driver):
|
||||||
|
|
||||||
|
#element = driver.find_elements(By.CSS_SELECTOR,'.bxc-grid__container.bxc-grid__container--width-1500.bxc-grid__mp-gutter-layout')
|
||||||
|
#sub_cats = element[0].find_elements(By.CSS_SELECTOR,'.bxc-grid__image.bxc-grid__image--light')
|
||||||
|
sub_cats = driver.find_elements(By.CSS_SELECTOR,'.bxc-grid__image.bxc-grid__image--light')
|
||||||
|
|
||||||
|
|
||||||
|
names = ['Perfumes', 'Skin care', 'Hair care', 'Bath & body', 'Makeup', 'Nail care']
|
||||||
|
|
||||||
|
categories = []
|
||||||
|
for sub_cat in sub_cats:
|
||||||
|
name = sub_cat.find_element(By.TAG_NAME, 'a').get_attribute('aria-label')
|
||||||
|
if name in names:
|
||||||
|
link = sub_cat.find_element(By.TAG_NAME, 'a').get_attribute('href')
|
||||||
|
|
||||||
|
category = {
|
||||||
|
"name": name,
|
||||||
|
"link": link
|
||||||
|
}
|
||||||
|
|
||||||
|
categories.append(category)
|
||||||
|
|
||||||
|
print(categories)
|
||||||
|
self.get_sub_categories(driver, categories)
|
||||||
|
|
||||||
|
def get_sub_categories(self,driver,categories):
|
||||||
|
|
||||||
|
sub_categories = []
|
||||||
|
for category in categories:
|
||||||
|
print("=============== {} ===============".format(category["name"]))
|
||||||
|
|
||||||
|
data = {}
|
||||||
|
data['parent_category_id'] = 0
|
||||||
|
data['rce_source_id'] = self.rce_source_id
|
||||||
|
data['rce_source_category_id'] = 0
|
||||||
|
data['rce_source_status'] = 1
|
||||||
|
data['category_name'] = category["name"]
|
||||||
|
data['category_page_url'] = category["link"]
|
||||||
|
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
self.db_writer.rce_category(data)
|
||||||
|
|
||||||
|
driver.get(category["link"])
|
||||||
|
|
||||||
|
##### Feature Categories
|
||||||
|
try:
|
||||||
|
f_cat = driver.find_element(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-title .a-size-extra-large')
|
||||||
|
if f_cat:
|
||||||
|
cats_c = driver.find_element(By.CSS_SELECTOR, '.a-section.octopus-pc-category-card-v2-content')
|
||||||
|
cats = cats_c.find_elements(By.CSS_SELECTOR, '.octopus-pc-category-card-v2-item')
|
||||||
|
for cat in cats:
|
||||||
|
cat_name = cat.find_element(By.CSS_SELECTOR, '.a-size-medium.a-color-base.a-text-bold').text
|
||||||
|
url = cat.find_element(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-category-link').get_attribute("href")
|
||||||
|
# print('Name: {}, URL: {}'.format(cat_name,url))
|
||||||
|
# s_cat = {
|
||||||
|
# "name": cat_name,
|
||||||
|
# "link": url
|
||||||
|
# }
|
||||||
|
# sub_categories.append(s_cat)
|
||||||
|
|
||||||
|
data = {}
|
||||||
|
data['parent_category_id'] = 0
|
||||||
|
data['rce_source_id'] = self.rce_source_id
|
||||||
|
data['rce_source_category_id'] = 0
|
||||||
|
data['rce_source_status'] = 1
|
||||||
|
data['category_name'] = cat_name
|
||||||
|
data['category_page_url'] = url
|
||||||
|
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
self.db_writer.rce_category(data)
|
||||||
|
|
||||||
|
try:
|
||||||
|
sub_cats = cat.find_elements(By.CSS_SELECTOR, '.a-link-normal.octopus-pc-category-card-v2-subcategory-link')
|
||||||
|
|
||||||
|
for sub_cat in sub_cats:
|
||||||
|
s_url = sub_cat.get_attribute('href')
|
||||||
|
s_title = sub_cat.get_attribute('title')
|
||||||
|
# print('Title: {}, URL: {}'.format(s_title, s_url))
|
||||||
|
# s_cat = {
|
||||||
|
# "name": s_title,
|
||||||
|
# "link": s_url
|
||||||
|
# }
|
||||||
|
# sub_categories.append(s_cat)
|
||||||
|
data = {}
|
||||||
|
data['parent_category_id'] = 0
|
||||||
|
data['rce_source_id'] = self.rce_source_id
|
||||||
|
data['rce_source_category_id'] = 0
|
||||||
|
data['rce_source_status'] = 1
|
||||||
|
data['category_name'] = s_title
|
||||||
|
data['category_page_url'] = s_url
|
||||||
|
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
self.db_writer.rce_category(data)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
except:
|
||||||
|
print("Feature Cat not available.")
|
||||||
|
pass
|
||||||
|
|
||||||
|
##### Shop by categories
|
||||||
|
try:
|
||||||
|
try:
|
||||||
|
cat_h = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-header')
|
||||||
|
except:
|
||||||
|
cat_h = driver.find_element(By.CSS_SELECTOR, '#contentGrid_292470')
|
||||||
|
pass
|
||||||
|
if cat_h:
|
||||||
|
cats_c = driver.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-viewport-row-inner')
|
||||||
|
cats = cats_c.find_elements(By.TAG_NAME, 'li')
|
||||||
|
for cat in cats:
|
||||||
|
cat_name = cat.find_element(By.CSS_SELECTOR, '.sl-sobe-carousel-sub-card-title').text
|
||||||
|
url = cat.find_element(By.TAG_NAME, 'a').get_attribute('href')
|
||||||
|
# print('Name: {}, URL: {}'.format(cat_name,url))
|
||||||
|
# s_cat = {
|
||||||
|
# "name": cat_name,
|
||||||
|
# "link": url
|
||||||
|
# }
|
||||||
|
# sub_categories.append(s_cat)
|
||||||
|
data = {}
|
||||||
|
data['parent_category_id'] = 0
|
||||||
|
data['rce_source_id'] = self.rce_source_id
|
||||||
|
data['rce_source_category_id'] = 0
|
||||||
|
data['rce_source_status'] = 1
|
||||||
|
data['category_name'] = cat_name
|
||||||
|
data['category_page_url'] = url
|
||||||
|
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
self.db_writer.rce_category(data)
|
||||||
|
except Exception as e:
|
||||||
|
print('Cat not available')
|
||||||
|
pass
|
||||||
|
|
||||||
|
print(sub_categories)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# categories = amazon_categories()
|
||||||
|
# categories.start_processing()
|
|
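Every category row written above stores an md5 hash of the page URL. A one-line sketch of that convention, matching the hashlib calls used throughout the file; the helper name url_hash is illustrative:

import hashlib

def url_hash(url):
    # Same convention as category_page_url_hash / product_page_url_hash in this repository
    return hashlib.md5(url.encode('utf-8')).hexdigest()

# url_hash('https://www.amazon.ae/s?rh=n%3A11497859031')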
@ -0,0 +1,255 @@
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
#import undetected_chromedriver as webdriver
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver import ActionChains, Keys
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import psycopg2
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
from noon_db_writer import noon_db_writer
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
|
from scroller.scroller import smartScroll
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
import ssl
|
||||||
|
ssl._create_default_https_context = ssl._create_unverified_context
|
||||||
|
|
||||||
|
|
||||||
|
class noon_category_products:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.crawler_name = self.config.get("crawler_name")
|
||||||
|
#self.url = "https://www.amazon.ae/gp/browse.html?node=11497860031&ref_=nav_em_by_all_0_2_11_2"
|
||||||
|
self.product_limit = int(self.config.get("product_per_category"))
|
||||||
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
|
self.conn.autocommit = True
|
||||||
|
self.cur = self.conn.cursor()
|
||||||
|
sql = "delete from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='"+str(self.crawler_name)+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
sql = f"""
|
||||||
|
select a.id, a.category_page_url from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} a
|
||||||
|
where a.rce_source_id = (
|
||||||
|
select id from {self.config.get('crawler_schema')}.{self.config.get('source_tab')} where source_name = 'Noon')
|
||||||
|
"""
|
||||||
|
self.cur.execute(sql)
|
||||||
|
self.categories = self.cur.fetchall()
|
||||||
|
#self.display = Display(visible=0, size=(800, 600))
|
||||||
|
#self.display.start()
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
print("Closing connection.....")
|
||||||
|
self.conn.close()
|
||||||
|
#self.display.stop()
|
||||||
|
|
||||||
|
def start_processing(self):
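# Entry point: collect product URLs from the landing-page carousels via
# section_products() and record them in the tracker table. Per-category page
# browsing is currently disabled (commented out below).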
|
||||||
|
# #### Collect section name and section products ####
|
||||||
|
section_products = self.section_products()
|
||||||
|
self.insert_tracker_tab(section_products)
|
||||||
|
# if self.categories:
|
||||||
|
# for category in self.categories:
|
||||||
|
# logging.info("======= Fetching products of {}".format(category))
|
||||||
|
# self.browse_category_page(category)
|
||||||
|
# else:
|
||||||
|
# logging.info("No category available. Stopping.......")
|
||||||
|
|
||||||
|
|
||||||
|
def browse_category_page(self, category):
|
||||||
|
try:
|
||||||
|
# op = webdriver.ChromeOptions()
|
||||||
|
# op.add_argument('--no-sandbox')
|
||||||
|
# op.add_argument('--disable-notifications')
|
||||||
|
# op.add_argument("--lang=en-GB")
|
||||||
|
#op.headless = True
|
||||||
|
#driver=webdriver.Chrome(version_main = 113, options=op)
|
||||||
|
# driver=webdriver.Chrome(options=op)
|
||||||
|
|
||||||
|
driver = webdriver.Firefox()
|
||||||
|
|
||||||
|
driver.get(category[1])
|
||||||
|
|
||||||
|
driver.implicitly_wait(10)
|
||||||
|
|
||||||
|
|
||||||
|
### Collect All products ####
|
||||||
|
self.base_products(driver, category[0])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
driver.close()
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
|
||||||
|
def section_products(self):
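# Open the Noon UAE beauty landing page and collect product links from the
# "Bestsellers", "New arrivals" and "Clearance deals" carousels. The CSS selectors
# and the hard-coded category id '3184' are specific to this page layout.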
|
||||||
|
|
||||||
|
driver = webdriver.Firefox()
|
||||||
|
|
||||||
|
driver.get('https://www.noon.com/uae-en/beauty/')
|
||||||
|
driver.implicitly_wait(5)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
|
||||||
|
#Bestsellers
|
||||||
|
elements = driver.find_element(By.CSS_SELECTOR, '.componentArea-4 > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1)').find_elements(By.CSS_SELECTOR,'.sc-kCMKrZ.ealOXE')
|
||||||
|
|
||||||
|
urls = []
|
||||||
|
for element in elements:
|
||||||
|
link = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
|
||||||
|
urls.append(link)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"catagory": '3184',
|
||||||
|
"key": "Bestsellers",
|
||||||
|
"value": urls
|
||||||
|
}
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
# New arrivals
|
||||||
|
|
||||||
|
elements = driver.find_element(By.CSS_SELECTOR, '.componentArea-18 > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1)').find_elements(By.CSS_SELECTOR,'.swiper-slide')
|
||||||
|
urls = []
|
||||||
|
for element in elements:
|
||||||
|
link = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
|
||||||
|
urls.append(link)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"catagory": '3184',
|
||||||
|
"key": "New arrivals",
|
||||||
|
"value": urls
|
||||||
|
}
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
# Clearance deals
|
||||||
|
|
||||||
|
elements = driver.find_element(By.CSS_SELECTOR, '.componentArea-21 > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > div:nth-child(1)').find_elements(By.CSS_SELECTOR,'.swiper-slide')
|
||||||
|
|
||||||
|
urls = []
|
||||||
|
for element in elements:
|
||||||
|
link = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
|
||||||
|
urls.append(link.replace("'",""))
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"catagory": '3184',
|
||||||
|
"key": "Clearance deals",
|
||||||
|
"value": urls
|
||||||
|
}
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
print(results)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def insert_tracker_tab(self, objs):
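# For every collected URL, insert a row into the tracker table (keyed by the URL)
# unless it is already present; flag=0 marks the product as not yet crawled.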
|
||||||
|
|
||||||
|
for obj in objs:
|
||||||
|
category = str(obj['category'])
|
||||||
|
key = str(obj['key'])
|
||||||
|
items = obj['value']
|
||||||
|
for item in items:
|
||||||
|
product_page_url = item
|
||||||
|
product_page_url_hash = hashlib.md5(product_page_url.encode('utf-8')).hexdigest()
|
||||||
|
flag = 0
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where product_page_url = '"+product_page_url+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchall()
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+"(crawler_name,category,keyword,product_page_url,product_page_url_hash,flag) values('"+str(self.crawler_name)+"','"+str(category)+"','"+str(key)+"','"+product_page_url+"','"+product_page_url_hash+"',"+str(flag)+")"
|
||||||
|
print(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def base_products(self, driver, category):
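# Walk up to 15 result pages of a category: scroll to trigger lazy loading, collect
# the product links on the page, push them into the tracker table, then follow the
# "next" arrow; stops early when there is no further page.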
|
||||||
|
|
||||||
|
try:
|
||||||
|
for i in range(1,16):
|
||||||
|
|
||||||
|
smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True)
|
||||||
|
|
||||||
|
|
||||||
|
# ##############
|
||||||
|
# SCROLL_PAUSE_TIME = 0.5
|
||||||
|
#
|
||||||
|
# # Get scroll height
|
||||||
|
# last_height = driver.execute_script("return document.body.scrollHeight")
|
||||||
|
#
|
||||||
|
# while True:
|
||||||
|
# # Scroll down to bottom
|
||||||
|
# driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
||||||
|
#
|
||||||
|
# # Wait to load page
|
||||||
|
# time.sleep(SCROLL_PAUSE_TIME)
|
||||||
|
#
|
||||||
|
# # Calculate new scroll height and compare with last scroll height
|
||||||
|
# new_height = driver.execute_script("return document.body.scrollHeight")
|
||||||
|
# if new_height == last_height:
|
||||||
|
# break
|
||||||
|
# last_height = new_height
|
||||||
|
# #############
|
||||||
|
|
||||||
|
items = driver.find_element(By.CSS_SELECTOR, '.sc-810b5658-7.upghB.grid').find_elements(By.CSS_SELECTOR,'.sc-ff3f80d5-0.iBVDAS.wrapper.productContainer')
|
||||||
|
|
||||||
|
#smartScroll(driver, stopAtBorder=True, distancePerSecond=500, humanBreaks=True)
|
||||||
|
|
||||||
|
|
||||||
|
urls = []
|
||||||
|
for item in items:
|
||||||
|
url = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
|
||||||
|
urls.append(url)
|
||||||
|
|
||||||
|
result = [{
|
||||||
|
"catagory": catagory,
|
||||||
|
"key": "Base Product Page {}".format(str(i)),
|
||||||
|
"value": urls
|
||||||
|
}]
|
||||||
|
|
||||||
|
self.insert_tracker_tab(result)
|
||||||
|
|
||||||
|
try:
|
||||||
|
driver.find_elements(By.CSS_SELECTOR, '.arrowLink')[1].click()
|
||||||
|
html = driver.find_element(By.TAG_NAME, 'html')
|
||||||
|
html.send_keys(Keys.HOME)
|
||||||
|
driver.implicitly_wait(5)
|
||||||
|
except:
|
||||||
|
logging.info("No more page to navigate......")
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"crawler_name": "raena_crawler_enginer_noon",
|
||||||
|
"crawler_schema": "raena_spider_management",
|
||||||
|
"category_tab": "rce_category",
|
||||||
|
"tracker_tab": "crawler_tracker_noon",
|
||||||
|
"product_tab": "rce_product",
|
||||||
|
"variant_tab": "rce_product_variant",
|
||||||
|
"brand_tab": "rce_brand",
|
||||||
|
"reseller_tab": "rce_reseller",
|
||||||
|
"reseller_store_tab": "rce_reseller_store",
|
||||||
|
"review_tab": "rce_ratings_reviews",
|
||||||
|
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
|
||||||
|
"review_producttags_tab": "rce_ratings_reviews_producttags",
|
||||||
|
"review_tags": "rce_tags",
|
||||||
|
"source_tab": "rce_source",
|
||||||
|
"product_per_category": "1000",
|
||||||
|
"source_category": "11043145",
|
||||||
|
"db_user": "dbadmin",
|
||||||
|
"db_pass": "5qCif6eyY3Kmg4z",
|
||||||
|
"database": "analytics",
|
||||||
|
"db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
|
||||||
|
"db_port": "5432",
|
||||||
|
"crawler_main": "1",
|
||||||
|
"crawler_slave_no": ""
|
||||||
|
}
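# The inline config above is only for standalone runs of this module. A minimal
# sketch of the file-based alternative used by the main entry point, assuming a
# conf.json with the same keys (kept as comments so behaviour is unchanged):
#
#   import json
#   with open("conf.json", "r") as jsonfile:
#       config = json.load(jsonfile)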
|
||||||
|
noon_category_products = noon_category_products(config)
|
||||||
|
noon_category_products.start_processing()
|
|
@ -0,0 +1,115 @@
|
||||||
|
import logging
|
||||||
|
import psycopg2
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
import smtplib
|
||||||
|
from email.message import EmailMessage
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from noon_products import noon_products
|
||||||
|
|
||||||
|
|
||||||
|
##### Logger ######
|
||||||
|
format = "%(asctime)s: %(message)s"
|
||||||
|
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
config = {}
|
||||||
|
|
||||||
|
def slack_notification(message):
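# Post the error message to the team Slack channel via the incoming-webhook URL;
# raises if Slack does not return HTTP 200.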
|
||||||
|
webhook_url = "https://hooks.slack.com/services/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"
|
||||||
|
slack_data = {"text": "Issue occurred on Noon Crawler. Error: " + str(message)}
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
webhook_url, data=json.dumps(slack_data),
|
||||||
|
headers={"Content-Type": "application/json"}
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
raise ValueError(
|
||||||
|
f"Request to Slack returned an error {response.status_code}, {response.text}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def send_mail():
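# Send a status email through the Amazon SES SMTP endpoint. Not called from main()
# at the moment (see the commented-out send_mail() call in the exception handler).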
|
||||||
|
|
||||||
|
try:
|
||||||
|
EMAIL_ADDRESS = "AKIAR2YL57QC6NITTJN5"
|
||||||
|
EMAIL_PASSWORD = "BAs9W772KNxLL1xnMzYhdIkpflQ8H+KP0Zbl8dphQZWh"
|
||||||
|
From = 'data_reporting@raenabeauty.com'
|
||||||
|
To = 'shariar@raenabeauty.com'
|
||||||
|
#To = 'shariar@raenabeauty.com'
|
||||||
|
|
||||||
|
html = f'''
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<div style="background-color:#eee;padding:10px 20px;">
|
||||||
|
<h2 style="font-family:Georgia, 'Times New Roman', Times, serif;color#454349;">Amazon Crawler Status</h2>
|
||||||
|
</div>
|
||||||
|
<div style="padding:20px 0px">
|
||||||
|
<div style="height: 800px;width:800px">
|
||||||
|
Error occurred. Please check the Noon pipeline.
|
||||||
|
<div style="text-align:Left;">
|
||||||
|
<p>This is system generated mail. Please do not reply</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
'''
|
||||||
|
|
||||||
|
msg = EmailMessage()
|
||||||
|
msg['Subject'] = 'Noon Crawler Status'
|
||||||
|
msg['From'] = From
|
||||||
|
msg['To'] = To
|
||||||
|
msg.set_content(html, subtype='html')
|
||||||
|
|
||||||
|
|
||||||
|
with smtplib.SMTP('email-smtp.ap-southeast-1.amazonaws.com', 587) as smtp:
|
||||||
|
smtp.ehlo()
|
||||||
|
smtp.starttls()
|
||||||
|
smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD)
|
||||||
|
smtp.send_message(msg)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info("Error while sending mail: {}".format(e))
|
||||||
|
def main():
|
||||||
|
# start = datetime.now()
|
||||||
|
# categories = amazon_categories(config)
|
||||||
|
# categories.start_processing()
|
||||||
|
# end = datetime.now()
|
||||||
|
# logging.info('Total time taken to fetch the categories: {}'.format(str(end-start)))
|
||||||
|
#
|
||||||
|
# start = datetime.now()
|
||||||
|
# products = amazon_category_products(config)
|
||||||
|
# products.start_processing()
|
||||||
|
# end = datetime.now()
|
||||||
|
# logging.info('Total time taken to fetch the category products: {}'.format(str(end-start)))
|
||||||
|
|
||||||
|
|
||||||
|
product_info = noon_products(config)
|
||||||
|
product_info.start_processing()
|
||||||
|
|
||||||
|
# ###### For test
|
||||||
|
# item = (100, 'raena_crawler_enginer_amazon', '3066', 'Up to 25 AED', 'https://www.amazon.ae/Ross-Massager-Shampoo-Silicone-Bristles/dp/B09JGH1WM3?ref_=Oct_d_oup_d_12149480031_0&pd_rd_w=lfMTW&content-id=amzn1.sym.d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_p=d6d96598-a48c-43a2-8244-52a2329bf791&pf_rd_r=C1QM2XCSJDBVMS27JV7E&pd_rd_wg=gkRZv&pd_rd_r=f5af13ee-c6c4-4d8a-8677-cba9cbacdace&pd_rd_i=B09JGH1WM3', '8f0540b5919e176303cf24a1d46b0e1c', 0)
|
||||||
|
# product_info.get_product_info(item)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
logging.info("Starting Shopee Crawler.......")
|
||||||
|
try:
|
||||||
|
logging.info("Loading config file.......")
|
||||||
|
with open("conf.json", "r") as jsonfile:
|
||||||
|
config = json.load(jsonfile)
|
||||||
|
logging.info("Config file loaded.......")
|
||||||
|
print(config)
|
||||||
|
|
||||||
|
main()
|
||||||
|
|
||||||
|
#raise Exception("Sorry, no numbers below zero")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.info("Error: ".format(e))
|
||||||
|
#logging.info("Cannot load config file. Please check. Exiting......")
|
||||||
|
#send_mail()
|
||||||
|
slack_notification(e)
|
||||||
|
exit(1)
|
|
@ -0,0 +1,590 @@
|
||||||
|
import logging
|
||||||
|
import psycopg2
|
||||||
|
|
||||||
|
###### Logger ######
|
||||||
|
format = "%(asctime)s: %(message)s"
|
||||||
|
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
class noon_db_writer:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
|
self.conn.autocommit = True
|
||||||
|
self.cur = self.conn.cursor()
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
logging.info("Closing connection.....")
|
||||||
|
self.conn.close()
|
||||||
|
|
||||||
|
def rce_category(self, data):
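# Upsert a category row keyed by category_name and mirror every write into the
# aud_<category_tab> audit table: insert when new, otherwise only bump updatedat
# when nothing changed, or rewrite the changed columns. The other rce_* writers
# below follow the same insert/update + audit pattern.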
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where category_name = '"+str(data['category_name'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
cat_name = data['category_name'].replace("'","''")
|
||||||
|
cat_url = data['category_page_url'].replace("'","''")
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" (parent_category_id,rce_source_id," \
|
||||||
|
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name) values (" \
|
||||||
|
+str(data['parent_category_id'])+","+str(data['rce_source_id'])+", "+str(data['rce_source_category_id'])+", "+str(data['rce_source_status'])+", " \
|
||||||
|
"'"+str(cat_url)+"', '"+str(data['category_page_url_hash'])+"', '"+str(cat_name)+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
|
||||||
|
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
|
||||||
|
"select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
|
||||||
|
"category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
|
||||||
|
"where rce_source_category_id = "+ str(data['rce_source_category_id'])
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \
|
||||||
|
str(data['category_page_url'])==str(res[5]):
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set updatedat=now() " \
|
||||||
|
"where category_name = '"+ str(res[7])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set parent_category_id = " \
|
||||||
|
""+str(data['parent_category_id'])+", rce_source_category_id = "+str(data['rce_source_category_id'])+", " \
|
||||||
|
"category_name='"+str(cat_name)+"', category_page_url='"+str(cat_url)+"', " \
|
||||||
|
"category_page_url_hash='"+str(data['category_page_url_hash'])+"', updatedat=now() where " \
|
||||||
|
"category_name = '"+ str(res[7])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
|
||||||
|
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
|
||||||
|
"select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
|
||||||
|
"category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
|
||||||
|
"where category_name = '"+ str(res[7])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_product(self, data):
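# Upsert a product row keyed by product_page_url, with the same audit-table
# mirroring as rce_category.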
|
||||||
|
|
||||||
|
data['product_page_url'] = data['product_page_url'].replace("'","")
|
||||||
|
data['rce_source_product_name'] = data['rce_source_product_name'].replace("'","")
|
||||||
|
data['product_description'] = data['product_description'].replace("'","")
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url = '"+str(data['product_page_url'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" (rce_source_product_id," \
|
||||||
|
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
|
||||||
|
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
|
||||||
|
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
|
||||||
|
"product_section,rce_source_id) values("+str(data['rce_source_product_id'])+","+str(data['rce_source_product_status'])+",'"+str(data['product_page_url'])+"'," \
|
||||||
|
"'"+str(data['product_page_url_hash'])+"',"+str(data['rce_category_id'])+","+str(data['rce_brand_id'])+","+str(data['rce_store_id'])+"," \
|
||||||
|
"'"+str(data['rce_source_product_name'])+"','"+str(data['product_images'])+"','"+str(data['product_description'])+"',"+str(data['product_sold_total'])+"," \
|
||||||
|
""+str(data['product_sold'])+",'"+str(data['product_price_min'])+"','"+str(data['product_price_min_before_discount'])+"','"+str(data['product_price_max'])+"'," \
|
||||||
|
"'"+str(data['product_price_max_before_discount'])+"','"+str(data['ratings'])+"','"+str(data['product_section'])+"',"+str(data['rce_source_id'])+")"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
|
||||||
|
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
|
||||||
|
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
|
||||||
|
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
|
||||||
|
"product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \
|
||||||
|
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
|
||||||
|
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
|
||||||
|
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
|
||||||
|
"product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
|
||||||
|
"product_page_url='"+str(data['product_page_url'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \
|
||||||
|
str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \
|
||||||
|
str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \
|
||||||
|
str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \
|
||||||
|
str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \
|
||||||
|
str(data['product_price_max'])==str(res[15]) \
|
||||||
|
and str(data['product_price_max_before_discount'])==str(res[16]) \
|
||||||
|
and str(data['ratings'])==str(res[17]) and str(data['rce_source_id'])==str(res[21]) and \
|
||||||
|
str(data['product_section'])==str(res[22]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set updatedat=now() " \
|
||||||
|
"where product_page_url = '"+ str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set rce_source_product_id="+str(data['rce_source_product_id'])+"," \
|
||||||
|
"rce_source_product_status="+str(data['rce_source_product_status'])+",product_page_url='"+str(data['product_page_url'])+"',product_page_url_hash= " \
|
||||||
|
"'"+str(data['product_page_url_hash'])+"',rce_category_id="+str(data['rce_category_id'])+",rce_brand_id="+str(data['rce_brand_id'])+"," \
|
||||||
|
"rce_store_id="+str(data['rce_store_id'])+",rce_source_product_name='"+str(data['rce_source_product_name'])+"',product_images='"+str(data['product_images'])+"'" \
|
||||||
|
",product_description='"+str(data['product_description'])+"',product_sold_total="+str(data['product_sold_total'])+",product_sold="+str(data['product_sold'])+"," \
|
||||||
|
"product_price_min='"+str(data['product_price_min'])+"',product_price_min_before_discount='"+str(data['product_price_min_before_discount'])+"'," \
|
||||||
|
"product_price_max='"+str(data['product_price_max'])+"',product_price_max_before_discount='"+str(data['product_price_max_before_discount'])+"',ratings='"+str(data['ratings'])+"'," \
|
||||||
|
"product_section='"+str(data['product_section'])+"', updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where product_page_url = '"+ str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
|
||||||
|
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
|
||||||
|
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
|
||||||
|
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
|
||||||
|
"product_section,createdat,updatedat,rce_source_id) select id,rce_source_product_id," \
|
||||||
|
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
|
||||||
|
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
|
||||||
|
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
|
||||||
|
"product_section,createdat,updatedat,rce_source_id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
|
||||||
|
"product_page_url='"+str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def rce_product_variant(self, data):
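# Upsert a variant row keyed by product_variant_name, mirrored into the audit table.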
|
||||||
|
data['product_variant_name'] = data['product_variant_name'].replace("'","''")
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name = '"+str(data['product_variant_name'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" (rce_source_variant_id,rce_product_id," \
|
||||||
|
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock) values("+str(data['rce_source_variant_id'])+"," \
|
||||||
|
""+str(data['rce_product_id'])+",'"+str(data['product_variant_name'])+"','"+str(data['product_variant_price'])+"'," \
|
||||||
|
"'"+str(data['product_variant_price_before_discount'])+"',"+str(data['product_variant_stock'])+")"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
|
||||||
|
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(data['product_variant_name'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \
|
||||||
|
str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set updatedat=now() " \
|
||||||
|
"where product_variant_name = '"+ str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set rce_source_variant_id="+str(data['rce_source_variant_id'])+", " \
|
||||||
|
"rce_product_id="+str(data['rce_product_id'])+", product_variant_name='"+str(data['product_variant_name'])+"', product_variant_price=" \
|
||||||
|
"'"+str(data['product_variant_price'])+"',product_variant_price_before_discount='"+str(data['product_variant_price_before_discount'])+"'," \
|
||||||
|
"product_variant_stock="+str(data['product_variant_stock'])+", updatedat=now() where product_variant_name = '"+ str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
|
||||||
|
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where product_variant_name='"+str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def rce_brand(self, data):
|
||||||
|
data['brand_page_url'] = data['brand_page_url'].replace("'","''")
|
||||||
|
data['brand_name'] = data['brand_name'].replace("'","''")
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url = '"+str(data['brand_page_url'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" (rce_source_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name) values("+str(data['rce_source_id'])+"," \
|
||||||
|
""+str(data['rce_source_brand_status'])+",'"+str(data['brand_page_url'])+"','"+str(data['brand_page_url_hash'])+"'," \
|
||||||
|
"'"+str(data['brand_name'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(data['brand_page_url'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \
|
||||||
|
str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set updatedat=now() " \
|
||||||
|
"where brand_page_url = '"+ str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
|
||||||
|
"rce_source_brand_status="+str(data['rce_source_brand_status'])+", brand_page_url='"+str(data['brand_page_url'])+"', brand_page_url_hash=" \
|
||||||
|
"'"+str(data['brand_page_url_hash'])+"',brand_name='"+str(data['brand_name'])+"', updatedat=now() where brand_page_url = '"+ str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_status, " \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_page_url='"+str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_reseller(self, data):
|
||||||
|
data['reseller_name'] = data['reseller_name'].replace("'","''")
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" (rce_source_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_description) values("+str(data['rce_source_id'])+"," \
|
||||||
|
""+str(data['rce_source_reseller_status'])+",'"+str(data['reseller_name'])+"','"+str(data['reseller_average_rating'])+"'," \
|
||||||
|
"'"+str(data['reseller_description'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(data['reseller_name'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if data['rce_source_id']==res[1] and str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]) and \
|
||||||
|
str(data['reseller_average_rating'])==str(res[5]) and str(data['reseller_description'])==str(res[6]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=now() " \
|
||||||
|
"where reseller_name = '"+ str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
|
||||||
|
"rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \
|
||||||
|
"'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=now() where reseller_name = '"+ str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select id,rce_source_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_reseller_store(self, data):
|
||||||
|
|
||||||
|
data['store_page_url'] = data['store_page_url'].replace("'","''")
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" (rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,rce_source_id) values(" \
|
||||||
|
""+str(data['rce_source_store_status'])+",'"+str(data['store_page_url'])+"','"+str(data['store_page_url_hash'])+"'," \
|
||||||
|
"'"+str(data['store_location'])+"', "+str(data['rce_reseller_id'])+", "+str(data['rce_source_id'])+")"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(data['store_page_url'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \
|
||||||
|
str(data['store_page_url_hash'])==str(res[4]) and str(data['store_location'])==str(res[5]) and \
|
||||||
|
str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=now() " \
|
||||||
|
"where store_page_url = '"+ str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \
|
||||||
|
"rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \
|
||||||
|
"'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \
|
||||||
|
"updatedat=now(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_ratings_reviews(self, data):
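# Upsert a review keyed by (rce_product_id, username), mirrored into the audit table.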
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id = "+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
data['username'] = data['username'].replace("'","''")
|
||||||
|
data['img_url'] = data['img_url'].replace("'","''")
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" (id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating) values("+str(data['id'])+","+str(data['rce_product_id'])+"," \
|
||||||
|
"'"+str(data['username'])+"','"+str(data['review'])+"','"+str(data['img_url'])+"',"+str(data['review_like_count'])+",'"+str(data['user_tier'])+"'," \
|
||||||
|
""+str(data['shop_id'])+", '"+str(data['video_url'])+"', '"+str(data['rating'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \
|
||||||
|
str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \
|
||||||
|
str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]):
|
||||||
|
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \
|
||||||
|
"username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \
|
||||||
|
"'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \
|
||||||
|
"shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=now() " \
|
||||||
|
"where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(res[1])+" and username ='"+str(data['username'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_ratings_reviews_productmodels(self,data):
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \
|
||||||
|
"values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
|
||||||
|
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_rating_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \
|
||||||
|
"updatedat=now() where rce_source_store_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
|
||||||
|
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def rce_tags(self,data):
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \
|
||||||
|
"values("+str(data['id'])+",'"+str(data['description'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
|
||||||
|
"createdat,updatedat) select id,description,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['description'])==str(res[1]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=now() " \
|
||||||
|
"where description = '"+ str(res[1])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \
|
||||||
|
"updatedat=now() where description = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
|
||||||
|
"createdat,updatedat) select id,description,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def rce_ratings_reviews_producttags(self,data):
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \
|
||||||
|
"values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
|
||||||
|
"createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_rating_id'])==str(res[1]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_rating_id = '"+ str(res[1])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \
|
||||||
|
"updatedat=now() where rce_rating_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
|
||||||
|
"createdat,updatedat) select id,description,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,426 @@
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
import string
|
||||||
|
import psycopg2
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from noon_db_writer import noon_db_writer
|
||||||
|
from datetime import datetime
|
||||||
|
from noon_raw_product import get_product_info_raw
|
||||||
|
|
||||||
|
class noon_products:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.crawler_name = self.config.get("crawler_name")
|
||||||
|
self.pattern = r'[' + string.punctuation + ']'
|
||||||
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
|
self.conn.autocommit = True
|
||||||
|
self.cur = self.conn.cursor()
|
||||||
|
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Noon'")
|
||||||
|
self.rce_source_id = self.cur.fetchone()[0]
|
||||||
|
self.cur.execute("select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='raena_crawler_enginer_noon' and flag=0")
|
||||||
|
self.items = self.cur.fetchall()
|
||||||
|
self.db_writer = noon_db_writer(config)
|
||||||
|
#self.display = Display(visible=0, size=(800, 600))
|
||||||
|
#self.display.start()
|
||||||
|
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
print("Closing connection.....")
|
||||||
|
self.conn.close()
|
||||||
|
#self.display.stop()
|
||||||
|
|
||||||
|
def slack_notification(self, message):
|
||||||
|
webhook_url = "https://hooks.slack.com/services/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"
|
||||||
|
slack_data = {"text": "Issue occurred on Noon Crawler. Error: " + str(message)}
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
webhook_url, data=json.dumps(slack_data),
|
||||||
|
headers={"Content-Type": "application/json"}
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
raise ValueError(
|
||||||
|
f"Request to Slack returned an error {response.status_code}, {response.text}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def start_processing(self):
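# Iterate over the tracker rows fetched in __init__ (flag=0) and fetch each product's
# details, pausing 5 seconds between items; failures are reported to Slack.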
|
||||||
|
count = 0
|
||||||
|
for item in self.items:
|
||||||
|
count += 1
|
||||||
|
try:
|
||||||
|
logging.info("============== Getting info for {}/{}: {} ================".format(str(count),str(len(self.items)),str(item)))
|
||||||
|
start = datetime.now()
|
||||||
|
self.get_product_info(item)
|
||||||
|
end = datetime.now()
|
||||||
|
logging.info('Total time taken to fetch the product: {}'.format(str(end-start)))
|
||||||
|
# sleeptime = random.randint(20,50)
|
||||||
|
# logging.info("Sleeping for {} sec".format(str(sleeptime)))
|
||||||
|
# time.sleep(sleeptime)
|
||||||
|
time.sleep(5)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
self.slack_notification(e)
|
||||||
|
|
||||||
|
def reseller_info(self, data):
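# Read the seller offers from the raw product JSON, write a reseller and a
# reseller-store row for each offer, and return the reseller id of the first offer.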
|
||||||
|
try:
|
||||||
|
stores = data["product"]["variants"][0]["offers"]
|
||||||
|
|
||||||
|
if stores:
|
||||||
|
|
||||||
|
return_item = ""
|
||||||
|
flag = 0
|
||||||
|
|
||||||
|
for store in stores:
|
||||||
|
|
||||||
|
##### reseller info
|
||||||
|
|
||||||
|
data_reseller = {}
|
||||||
|
data_reseller['rce_source_id'] = self.rce_source_id
|
||||||
|
data_reseller['rce_source_reseller_status'] = 1
|
||||||
|
data_reseller['reseller_name'] = ""
|
||||||
|
data_reseller['reseller_average_rating'] = 0.0
|
||||||
|
data_reseller['reseller_description'] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_reseller['reseller_name'] = store["store_name"]
|
||||||
|
data_reseller['reseller_name'] = data_reseller['reseller_name'].replace("'","")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_reseller['reseller_average_rating'] = float(store["partner_ratings_sellerlab"]["partner_rating"])
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_reseller(data_reseller)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
##### Store info
|
||||||
|
|
||||||
|
data_reseller_store = {}
|
||||||
|
data_reseller_store['rce_source_store_status'] = 1
|
||||||
|
data_reseller_store['store_page_url'] = ""
|
||||||
|
data_reseller_store['store_page_url_hash'] = ""
|
||||||
|
data_reseller_store['store_location'] = ""
|
||||||
|
data_reseller_store['rce_reseller_id'] = ""
|
||||||
|
data_reseller_store['rce_source_id'] = self.rce_source_id
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_reseller_store['store_page_url'] = "https://www.noon.com/uae-en/seller/" + store["store_code"]
|
||||||
|
data_reseller_store['store_page_url'] = data_reseller_store['store_page_url'].replace("'","")
|
||||||
|
|
||||||
|
data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data_reseller['reseller_name'])+"'")
|
||||||
|
rce_reseller_id = self.cur.fetchone()
|
||||||
|
data_reseller_store['rce_reseller_id'] = rce_reseller_id[0]
|
||||||
|
if flag == 0:
|
||||||
|
return_item = data_reseller_store['rce_reseller_id']
|
||||||
|
flag = 1
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_reseller_store(data_reseller_store)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
return return_item
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
||||||
|
def brand_info(self, data):
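# Build the brand page URL and hash from the payload's brand_code, write the brand row, and return the cleaned brand name.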
|
||||||
|
data_brand = {}
|
||||||
|
|
||||||
|
data_brand['rce_source_id'] = self.rce_source_id
|
||||||
|
data_brand['rce_source_brand_status'] = 1
|
||||||
|
data_brand['brand_page_url'] = ""
|
||||||
|
data_brand['brand_page_url_hash'] = ""
|
||||||
|
data_brand['brand_name'] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_brand['brand_page_url'] = "https://www.noon.com/uae-en/" + data["product"]["brand_code"]
|
||||||
|
data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_brand['brand_name'] = data["product"]["brand"]
|
||||||
|
data_brand['brand_name'] = data_brand['brand_name'].replace("'","")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_brand(data_brand)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
return data_brand['brand_name']
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def product_info(self, data, category, keyword, url, url_hash, brand_name, rce_reseller_id):
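# Assemble the product record (name, images, description, prices, rating) from the raw payload, write it, then write one variant row per option in the first option group.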
|
||||||
|
data_product = {}
|
||||||
|
|
||||||
|
data_product['rce_source_product_id'] = 0
|
||||||
|
data_product['rce_source_id'] = self.rce_source_id
|
||||||
|
data_product['rce_source_product_status'] = 1
|
||||||
|
data_product['product_page_url'] = url.replace("'","''")
|
||||||
|
data_product['product_page_url_hash'] = url_hash
|
||||||
|
data_product['rce_category_id'] = int(category)
|
||||||
|
data_product['rce_brand_id'] = ""
|
||||||
|
data_product['rce_store_id'] = ""
|
||||||
|
data_product['rce_source_product_name'] = ""
|
||||||
|
data_product['product_images'] = ""
|
||||||
|
data_product['product_description'] = ""
|
||||||
|
data_product['product_sold_total'] = 0
|
||||||
|
data_product['product_sold'] = 0
|
||||||
|
data_product['product_price_min'] = ""
|
||||||
|
data_product['product_price_min_before_discount'] = ""
|
||||||
|
data_product['product_price_max'] = ""
|
||||||
|
data_product['product_price_max_before_discount'] = ""
|
||||||
|
data_product['ratings'] = 0.0
|
||||||
|
data_product['product_section'] = keyword
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where brand_name = '"+str(brand_name)+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_product['rce_brand_id'] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+""
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_product['rce_store_id'] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
rce_source_product_name = data["product"]["product_title"]
|
||||||
|
data_product['rce_source_product_name'] = str(re.sub(self.pattern, '', rce_source_product_name)).replace("'","")
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
images = data["product"]["image_keys"]
|
||||||
|
data_product['product_images'] = ','.join(images)
|
||||||
|
#print(data_product['product_images'])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_product['product_description'] = data["product"]["long_description"] + " ".join(data["product"]["feature_bullets"])
|
||||||
|
data_product['product_description'] = str(re.sub(self.pattern, '', data_product['product_description'])).replace("'","")
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_product['product_price_min'] = str(data["product"]["variants"][0]["offers"][0]["sale_price"])
|
||||||
|
data_product['product_price_max'] = data_product['product_price_min']
|
||||||
|
except:
|
||||||
|
data_product['product_price_min'] = str(data["product"]["variants"][0]["offers"][0]["price"])
|
||||||
|
data_product['product_price_max'] = data_product['product_price_min']
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_product['product_price_min_before_discount'] = str(data["product"]["variants"][0]["offers"][0]["price"])
|
||||||
|
data_product['product_price_max_before_discount'] = str(data["product"]["variants"][0]["offers"][0]["price"])
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_product['ratings'] = float(data["product"]["product_rating"]["value"])
|
||||||
|
#print(data_product['ratings'])
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_product(data_product)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
### rce_product_variant
|
||||||
|
try:
|
||||||
|
variants = data["product"]["groups"][0]["options"]
|
||||||
|
if variants:
|
||||||
|
|
||||||
|
for variant in variants:
|
||||||
|
|
||||||
|
data_variant = {}
|
||||||
|
|
||||||
|
data_variant['rce_source_variant_id'] = 0
|
||||||
|
data_variant['rce_product_id'] = ""
|
||||||
|
data_variant['product_variant_name'] = ""
|
||||||
|
data_variant['product_variant_price'] = 0
|
||||||
|
data_variant['product_variant_price_before_discount'] = 0
|
||||||
|
data_variant['product_variant_stock'] = 0
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_name = '"+str(data_product['rce_source_product_name'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_variant['rce_product_id'] = self.cur.fetchone()[0]
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
product_variant_name = variant["name"]
|
||||||
|
data_variant['product_variant_name'] = str(re.sub(self.pattern, '', product_variant_name)).replace("'","''")
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_product_variant(data_variant)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
|
||||||
|
time.sleep(random.randint(2,5))
|
||||||
|
|
||||||
|
else:
|
||||||
|
logging.info('No variant found')
|
||||||
|
except:
|
||||||
|
logging.info('No variant found')
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def rating_info(self, data, rce_reseller_id, url_hash):
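# Merge the Arabic and English review lists and write each review (author, title+comment, helpful count, rating, store id) into the ratings/reviews table.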
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_reviews = []
|
||||||
|
data_reviews_ar = []
|
||||||
|
data_reviews_en = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
if data["product"]["reviews"]["comments"]["ar"]["reviews"]:
|
||||||
|
data_reviews_ar = data["product"]["reviews"]["comments"]["ar"]["reviews"]
|
||||||
|
data_reviews.extend(data_reviews_ar)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
if data["product"]["reviews"]["comments"]["en"]["reviews"]:
|
||||||
|
data_reviews_en = data["product"]["reviews"]["comments"]["en"]["reviews"]
|
||||||
|
data_reviews.extend(data_reviews_en)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
for review in data_reviews:
|
||||||
|
|
||||||
|
data_review = {}
|
||||||
|
|
||||||
|
data_review["id"] = ""
|
||||||
|
data_review["rce_product_id"] = ""
|
||||||
|
data_review["username"] = ""
|
||||||
|
data_review["review"] = ""
|
||||||
|
data_review["img_url"] = ""
|
||||||
|
data_review["review_like_count"] = 0
|
||||||
|
data_review["user_tier"] = ""
|
||||||
|
data_review["shop_id"] = 0
|
||||||
|
data_review["video_url"] = ""
|
||||||
|
data_review["rating"] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')
|
||||||
|
self.cur.execute(sql)
|
||||||
|
rating_id = self.cur.fetchone()
|
||||||
|
|
||||||
|
if rating_id[0] is None:
|
||||||
|
rating_id = 1
|
||||||
|
else:
|
||||||
|
rating_id = int(rating_id[0]) + 1
|
||||||
|
|
||||||
|
data_review["id"] = rating_id
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where product_page_url_hash = '"+str(url_hash)+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_review["rce_product_id"] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_review["username"] = review["displayName"]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
title = ""
comment = ""
try:
|
||||||
|
try:
|
||||||
|
title = review["title"]
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
comment = review["comment"]
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
data_review["review"] = title + comment
|
||||||
|
data_review["review"] = data_review["review"].replace("'","")
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_review["review_like_count"] = review["helpfulCount"]
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_review["rating"] = review["rating"]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_reseller_id = "+str(rce_reseller_id)+""
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_review["shop_id"] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.db_writer.rce_ratings_reviews(data_review)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info(e)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_product_info(self,item):
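# item is a tracker row: item[2]=category id, item[3]=keyword, item[4]=product URL, item[5]=URL hash. Fetch the raw payload, persist reseller, brand, product and rating data, then mark the row processed (flag=1).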
|
||||||
|
try:
|
||||||
|
|
||||||
|
data = get_product_info_raw(item[4])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
##### Reseller info #####
|
||||||
|
rce_reseller_id = self.reseller_info(data)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
##### Product Info #####
|
||||||
|
##### Brand Info
|
||||||
|
brand_name = self.brand_info(data)
|
||||||
|
##### Product info
|
||||||
|
self.product_info(data, item[2], item[3], item[4], item[5], brand_name, rce_reseller_id)
|
||||||
|
|
||||||
|
|
||||||
|
##### Rating Info #####
|
||||||
|
self.rating_info(data, rce_reseller_id, item[5])
|
||||||
|
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
update {self.config.get('crawler_schema')}.{self.config.get('tracker_tab')} set flag = 1 where product_page_url_hash='{item[5]}'
|
||||||
|
"""
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
|
|
@ -0,0 +1,62 @@
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import requests
|
||||||
|
# import random
|
||||||
|
# import string
|
||||||
|
# import uuid
|
||||||
|
# import time
|
||||||
|
# import jwt
|
||||||
|
from urllib.parse import urlparse, quote
|
||||||
|
|
||||||
|
##### Logger #####
|
||||||
|
format = "%(asctime)s: %(message)s"
|
||||||
|
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
def get_product_info_raw(url):
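# Rewrite the storefront URL to Noon's catalog API path (/_svc/catalog/api/v3/u/), URL-encode it, fetch the JSON through the local relay at localhost:3090, and return its 'data' element.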
|
||||||
|
|
||||||
|
#parsed_url = urlparse(url)
|
||||||
|
parsed_url = url.replace("noon.com/uae-en/", "noon.com/_svc/catalog/api/v3/u/")
|
||||||
|
print(parsed_url)
|
||||||
|
encoded_url = quote(parsed_url, safe='')
|
||||||
|
|
||||||
|
api_url= 'http://localhost:3090/rcs/v1/noon/'
|
||||||
|
|
||||||
|
print(url)
|
||||||
|
print(api_url+encoded_url)
|
||||||
|
response = requests.request("GET", api_url+encoded_url)
|
||||||
|
|
||||||
|
logging.info(response)
|
||||||
|
|
||||||
|
print(api_url+encoded_url)
|
||||||
|
data = json.loads(response.text)
|
||||||
|
|
||||||
|
return data['data']
|
||||||
|
|
||||||
|
|
||||||
|
# def generate_sentry_trace():
|
||||||
|
# trace_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=32))
|
||||||
|
# span_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=16))
|
||||||
|
# sampling_decision = random.randint(0, 1)
|
||||||
|
#
|
||||||
|
# sentry_trace = f'{trace_id}-{span_id}-{sampling_decision}'
|
||||||
|
# return sentry_trace
|
||||||
|
#
|
||||||
|
# def generate_x_visitor_id():
|
||||||
|
# x_visitor_id = str(uuid.uuid4())
|
||||||
|
# return x_visitor_id
|
||||||
|
#
|
||||||
|
# def generate_cookie():
|
||||||
|
# payload = {
|
||||||
|
# 'raId': 'd1e3f451135d40958672d78da1f8c612',
|
||||||
|
# 'iat': int(time.time()),
|
||||||
|
# 'exp': int(time.time()+60)
|
||||||
|
# }
|
||||||
|
# # Generate the cookie string without a secret key
|
||||||
|
# cookie = jwt.encode(payload, '', algorithm='HS256')
|
||||||
|
#
|
||||||
|
# return cookie
|
||||||
|
|
||||||
|
|
||||||
|
# url = 'https://www.noon.com/uae-en/niacinamide-10-and-zinc-1-clear-30ml/N23772548A/p/?o=cbd635fab2298abe'
|
||||||
|
# #
|
||||||
|
# print(get_product_info_raw(url))
|
|
@ -0,0 +1,30 @@
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
#import undetected_chromedriver as webdriver
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver import ActionChains, Keys
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import psycopg2
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
from noon_db_writer import noon_db_writer
|
||||||
|
from pyvirtualdisplay import Display
|
||||||
|
from scroller.scroller import smartScroll
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
import ssl
|
||||||
|
ssl._create_default_https_context = ssl._create_unverified_context
|
||||||
|
|
||||||
|
driver = webdriver.Firefox()
|
||||||
|
|
||||||
|
driver.get('https://www.noon.com/uae-en/beauty/')
|
||||||
|
driver.implicitly_wait(5)
|
||||||
|
|
||||||
|
elements = driver.find_element(By.XPATH, '//*[@id="__next"]/div/section/div/div/div[23]/div/div/div/div/div/div/div/div/div[2]/div[1]/div').find_elements(By.CSS_SELECTOR,'.swiper-slide')
|
||||||
|
|
||||||
|
for element in elements:
|
||||||
|
link = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
|
||||||
|
print(link)
|
||||||
|
|
||||||
|
driver.close()
|
|
@ -0,0 +1,20 @@
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
|
||||||
|
def slack_notification(message):
|
||||||
|
webhook_url = "https://hooks.slack.com/services/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"
|
||||||
|
slack_data = {"text": message}
|
||||||
|
|
||||||
|
response = requests.post(
|
||||||
|
webhook_url, data=json.dumps(slack_data),
|
||||||
|
headers={"Content-Type": "application/json"}
|
||||||
|
)
|
||||||
|
|
||||||
|
if response.status_code != 200:
|
||||||
|
raise ValueError(
|
||||||
|
f"Request to Slack returned an error {response.status_code}, {response.text}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
message = "Hello from Python!"
|
||||||
|
slack_notification(message)
|
|
@ -0,0 +1,12 @@
|
||||||
|
# Define here the models for your scraped items
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
|
||||||
|
|
||||||
|
class RaenaCrawlerItem(scrapy.Item):
|
||||||
|
# define the fields for your item here like:
|
||||||
|
# name = scrapy.Field()
|
||||||
|
pass
|
|
@ -0,0 +1,103 @@
|
||||||
|
# Define here the models for your spider middleware
|
||||||
|
#
|
||||||
|
# See documentation in:
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
|
||||||
|
from scrapy import signals
|
||||||
|
|
||||||
|
# useful for handling different item types with a single interface
|
||||||
|
from itemadapter import is_item, ItemAdapter
|
||||||
|
|
||||||
|
|
||||||
|
class RaenaCrawlerSpiderMiddleware:
|
||||||
|
# Not all methods need to be defined. If a method is not defined,
|
||||||
|
# scrapy acts as if the spider middleware does not modify the
|
||||||
|
# passed objects.
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler):
|
||||||
|
# This method is used by Scrapy to create your spiders.
|
||||||
|
s = cls()
|
||||||
|
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||||
|
return s
|
||||||
|
|
||||||
|
def process_spider_input(self, response, spider):
|
||||||
|
# Called for each response that goes through the spider
|
||||||
|
# middleware and into the spider.
|
||||||
|
|
||||||
|
# Should return None or raise an exception.
|
||||||
|
return None
|
||||||
|
|
||||||
|
def process_spider_output(self, response, result, spider):
|
||||||
|
# Called with the results returned from the Spider, after
|
||||||
|
# it has processed the response.
|
||||||
|
|
||||||
|
# Must return an iterable of Request, or item objects.
|
||||||
|
for i in result:
|
||||||
|
yield i
|
||||||
|
|
||||||
|
def process_spider_exception(self, response, exception, spider):
|
||||||
|
# Called when a spider or process_spider_input() method
|
||||||
|
# (from other spider middleware) raises an exception.
|
||||||
|
|
||||||
|
# Should return either None or an iterable of Request or item objects.
|
||||||
|
pass
|
||||||
|
|
||||||
|
def process_start_requests(self, start_requests, spider):
|
||||||
|
# Called with the start requests of the spider, and works
|
||||||
|
# similarly to the process_spider_output() method, except
|
||||||
|
# that it doesn’t have a response associated.
|
||||||
|
|
||||||
|
# Must return only requests (not items).
|
||||||
|
for r in start_requests:
|
||||||
|
yield r
|
||||||
|
|
||||||
|
def spider_opened(self, spider):
|
||||||
|
spider.logger.info("Spider opened: %s" % spider.name)
|
||||||
|
|
||||||
|
|
||||||
|
class RaenaCrawlerDownloaderMiddleware:
|
||||||
|
# Not all methods need to be defined. If a method is not defined,
|
||||||
|
# scrapy acts as if the downloader middleware does not modify the
|
||||||
|
# passed objects.
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_crawler(cls, crawler):
|
||||||
|
# This method is used by Scrapy to create your spiders.
|
||||||
|
s = cls()
|
||||||
|
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
||||||
|
return s
|
||||||
|
|
||||||
|
def process_request(self, request, spider):
|
||||||
|
# Called for each request that goes through the downloader
|
||||||
|
# middleware.
|
||||||
|
|
||||||
|
# Must either:
|
||||||
|
# - return None: continue processing this request
|
||||||
|
# - or return a Response object
|
||||||
|
# - or return a Request object
|
||||||
|
# - or raise IgnoreRequest: process_exception() methods of
|
||||||
|
# installed downloader middleware will be called
|
||||||
|
return None
|
||||||
|
|
||||||
|
def process_response(self, request, response, spider):
|
||||||
|
# Called with the response returned from the downloader.
|
||||||
|
|
||||||
|
# Must either;
|
||||||
|
# - return a Response object
|
||||||
|
# - return a Request object
|
||||||
|
# - or raise IgnoreRequest
|
||||||
|
return response
|
||||||
|
|
||||||
|
def process_exception(self, request, exception, spider):
|
||||||
|
# Called when a download handler or a process_request()
|
||||||
|
# (from other downloader middleware) raises an exception.
|
||||||
|
|
||||||
|
# Must either:
|
||||||
|
# - return None: continue processing this exception
|
||||||
|
# - return a Response object: stops process_exception() chain
|
||||||
|
# - return a Request object: stops process_exception() chain
|
||||||
|
pass
|
||||||
|
|
||||||
|
def spider_opened(self, spider):
|
||||||
|
spider.logger.info("Spider opened: %s" % spider.name)
|
|
@ -0,0 +1,18 @@
|
||||||
|
# pipelines.py
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
class OliveYoungPipeline:
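# Writes every scraped item to output.json, one JSON object per line.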
|
||||||
|
def __init__(self):
|
||||||
|
self.file = None
|
||||||
|
|
||||||
|
def open_spider(self, spider):
|
||||||
|
self.file = open('output.json', 'w')
|
||||||
|
|
||||||
|
def close_spider(self, spider):
|
||||||
|
self.file.close()
|
||||||
|
|
||||||
|
def process_item(self, item, spider):
|
||||||
|
line = json.dumps(dict(item)) + "\n"
|
||||||
|
self.file.write(line)
|
||||||
|
return item
|
|
@ -0,0 +1,111 @@
|
||||||
|
appdirs==1.4.4
|
||||||
|
appnope @ file:///opt/concourse/worker/volumes/live/4f734db2-9ca8-4d8b-5b29-6ca15b4b4772/volume/appnope_1606859466979/work
|
||||||
|
argon2-cffi @ file:///opt/conda/conda-bld/argon2-cffi_1645000214183/work
|
||||||
|
argon2-cffi-bindings @ file:///opt/concourse/worker/volumes/live/c6f9b05d-dc80-4dbc-7473-70bfcb66883c/volume/argon2-cffi-bindings_1644569703264/work
|
||||||
|
attrs @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_33k1uces4n/croot/attrs_1668696162258/work
|
||||||
|
Automat==22.10.0
|
||||||
|
backcall @ file:///home/ktietz/src/ci/backcall_1611930011877/work
|
||||||
|
bleach @ file:///opt/conda/conda-bld/bleach_1641577558959/work
|
||||||
|
brotlipy==0.7.0
|
||||||
|
certifi @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_477u68wvzm/croot/certifi_1671487773341/work/certifi
|
||||||
|
cffi @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_1b0qzba5nr/croot/cffi_1670423213150/work
|
||||||
|
charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
|
||||||
|
colorama @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_f5t80kwp9l/croot/colorama_1672386533201/work
|
||||||
|
ConfigUpdater @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_495uyr_0u4/croot/configupdater_1668698019809/work
|
||||||
|
constantly==15.1.0
|
||||||
|
cryptography @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_19cvzxmeb9/croot/cryptography_1677533085498/work
|
||||||
|
cssselect==1.2.0
|
||||||
|
debugpy @ file:///opt/concourse/worker/volumes/live/32b11d06-4d64-4ec8-497a-cf4fc97343d2/volume/debugpy_1637091821874/work
|
||||||
|
decorator @ file:///opt/conda/conda-bld/decorator_1643638310831/work
|
||||||
|
defusedxml @ file:///tmp/build/80754af9/defusedxml_1615228127516/work
|
||||||
|
entrypoints @ file:///opt/concourse/worker/volumes/live/194c0a28-55ce-4e83-6a87-0d9f2e06ab2c/volume/entrypoints_1649926487944/work
|
||||||
|
fake-useragent==1.2.1
|
||||||
|
Faker==18.13.0
|
||||||
|
fastjsonschema @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b5c1gee32t/croots/recipe/python-fastjsonschema_1661368622875/work
|
||||||
|
filelock==3.12.2
|
||||||
|
hyperlink @ file:///tmp/build/80754af9/hyperlink_1610130746837/work
|
||||||
|
idna @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_00jf0h4zbt/croot/idna_1666125573348/work
|
||||||
|
imagesize @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_4a6ed1be-fe30-4d6a-91d4-f867600caa0be5_dxzvt/croots/recipe/imagesize_1657179500955/work
|
||||||
|
importlib-metadata @ file:///opt/concourse/worker/volumes/live/4e1a3384-472f-4bcb-7776-cb0076aaea40/volume/importlib-metadata_1648562431336/work
|
||||||
|
importlib-resources @ file:///tmp/build/80754af9/importlib_resources_1625135880749/work
|
||||||
|
incremental==22.10.0
|
||||||
|
ipykernel @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_05yte6zd0k/croots/recipe/ipykernel_1662361808878/work
|
||||||
|
ipython @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b9echyik_d/croots/recipe/ipython_1659529861316/work
|
||||||
|
ipython-genutils @ file:///tmp/build/80754af9/ipython_genutils_1606773439826/work
|
||||||
|
itemadapter==0.8.0
|
||||||
|
itemloaders==1.1.0
|
||||||
|
jedi @ file:///opt/concourse/worker/volumes/live/c9d2fa99-8bc1-4572-41e7-6beba6391441/volume/jedi_1644315238822/work
|
||||||
|
Jinja2 @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_6adj7x0ejx/croot/jinja2_1666908137966/work
|
||||||
|
jmespath==1.0.1
|
||||||
|
jsonschema @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_21cqeq1xnk/croot/jsonschema_1676558686956/work
|
||||||
|
jupyter_client @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_41tzpfqkok/croots/recipe/jupyter_client_1661848920196/work
|
||||||
|
jupyter_core @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_fc_0us_ta7/croot/jupyter_core_1668084443574/work
|
||||||
|
jupyterlab-pygments @ file:///tmp/build/80754af9/jupyterlab_pygments_1601490720602/work
|
||||||
|
langcodes @ file:///opt/conda/conda-bld/langcodes_1643477751144/work
|
||||||
|
lxml==4.9.3
|
||||||
|
MarkupSafe @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_d4a9444f-bd4c-4043-b47d-cede33979b0fve7bm42r/croots/recipe/markupsafe_1654597878200/work
|
||||||
|
matplotlib-inline @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_9ddl71oqte/croots/recipe/matplotlib-inline_1662014471815/work
|
||||||
|
mccabe @ file:///opt/conda/conda-bld/mccabe_1644221741721/work
|
||||||
|
mistune==0.8.4
|
||||||
|
nbclient @ file:///opt/concourse/worker/volumes/live/2b77047f-e15a-4d19-54ac-7d87d20b74de/volume/nbclient_1650308375803/work
|
||||||
|
nbconvert @ file:///opt/concourse/worker/volumes/live/84c159ef-8fac-4372-7b64-25f831ab7aec/volume/nbconvert_1624479064764/work
|
||||||
|
nbformat @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_2daun1fill/croot/nbformat_1670352339504/work
|
||||||
|
nest-asyncio @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_64pfm74mxq/croot/nest-asyncio_1672387129786/work
|
||||||
|
nose @ file:///opt/conda/conda-bld/nose_1642704612149/work
|
||||||
|
notebook @ file:///opt/concourse/worker/volumes/live/f984e24b-6ef4-4a5b-55be-c5db1417e27a/volume/notebook_1621528337539/work
|
||||||
|
packaging @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_bet5qdixgt/croot/packaging_1671697440883/work
|
||||||
|
pandocfilters @ file:///opt/conda/conda-bld/pandocfilters_1643405455980/work
|
||||||
|
parsel==1.8.1
|
||||||
|
parso @ file:///tmp/build/80754af9/parso_1617223946239/work
|
||||||
|
pexpect @ file:///tmp/build/80754af9/pexpect_1605563209008/work
|
||||||
|
pickleshare @ file:///tmp/build/80754af9/pickleshare_1606932040724/work
|
||||||
|
pkgutil_resolve_name @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_c9l5hym8w0/croots/recipe/pkgutil-resolve-name_1661463329338/work
|
||||||
|
prometheus-client @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_19kjbndib7/croots/recipe/prometheus_client_1659455105394/work
|
||||||
|
prompt-toolkit @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_82emz7mook/croot/prompt-toolkit_1672387300396/work
|
||||||
|
Protego==0.2.1
|
||||||
|
psutil @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_c9b604bf-685f-47f6-8304-238e4e70557e1o7mmsot/croots/recipe/psutil_1656431274701/work
|
||||||
|
psycopg2-binary==2.9.7
|
||||||
|
ptyprocess @ file:///tmp/build/80754af9/ptyprocess_1609355006118/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
|
||||||
|
py @ file:///tmp/build/80754af9/py_1607971587848/work
|
||||||
|
pyasn1==0.5.0
|
||||||
|
pyasn1-modules==0.3.0
|
||||||
|
pycodestyle @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_a7riaf725h/croot/pycodestyle_1674267226642/work
|
||||||
|
pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
|
||||||
|
PyDispatcher==2.0.7
|
||||||
|
pyflakes @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_a87qrne4ps/croot/pyflakes_1674165135821/work
|
||||||
|
Pygments @ file:///opt/conda/conda-bld/pygments_1644249106324/work
|
||||||
|
pyOpenSSL @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_6dweji2whw/croot/pyopenssl_1677607689781/work
|
||||||
|
pyrsistent @ file:///opt/concourse/worker/volumes/live/24b7a9ab-37d8-463c-575f-69184f9cfbc8/volume/pyrsistent_1636111022304/work
|
||||||
|
PySocks @ file:///opt/concourse/worker/volumes/live/ef943889-94fc-4539-798d-461c60b77804/volume/pysocks_1605305801690/work
|
||||||
|
python-dateutil @ file:///tmp/build/80754af9/python-dateutil_1626374649649/work
|
||||||
|
pyzmq @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_15f7a459-ad98-422b-b8da-cbf1f626e2115nt0ocwy/croots/recipe/pyzmq_1657724193704/work
|
||||||
|
queuelib==1.6.2
|
||||||
|
requests @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_516b78ce-034d-4395-b9b5-1d78c2847384qtnol99l/croots/recipe/requests_1657734628886/work
|
||||||
|
requests-file @ file:///Users/ktietz/demo/mc3/conda-bld/requests-file_1629455781986/work
|
||||||
|
Scrapy==2.9.0
|
||||||
|
scrapy-fake-useragent==1.4.4
|
||||||
|
scrapy-rotating-proxies==0.6.2
|
||||||
|
scrapy-splash==0.9.0
|
||||||
|
Send2Trash @ file:///tmp/build/80754af9/send2trash_1632406701022/work
|
||||||
|
service-identity==21.1.0
|
||||||
|
six @ file:///tmp/build/80754af9/six_1644875935023/work
|
||||||
|
snowballstemmer @ file:///tmp/build/80754af9/snowballstemmer_1637937080595/work
|
||||||
|
sphinxcontrib-devhelp @ file:///home/ktietz/src/ci/sphinxcontrib-devhelp_1611920923094/work
|
||||||
|
sphinxcontrib-jsmath @ file:///home/ktietz/src/ci/sphinxcontrib-jsmath_1611920942228/work
|
||||||
|
sphinxcontrib-qthelp @ file:///home/ktietz/src/ci/sphinxcontrib-qthelp_1611921055322/work
|
||||||
|
sphinxcontrib-serializinghtml @ file:///tmp/build/80754af9/sphinxcontrib-serializinghtml_1624451540180/work
|
||||||
|
terminado @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_18_p3gbeio/croot/terminado_1671751835656/work
|
||||||
|
testpath @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_aaf4aec7-dbb6-43d6-9707-824338b4efc82yrt6xjp/croots/recipe/testpath_1655908558843/work
|
||||||
|
tldextract==3.4.4
|
||||||
|
toml @ file:///tmp/build/80754af9/toml_1616166611790/work
|
||||||
|
tornado @ file:///opt/concourse/worker/volumes/live/d531d395-893c-4ca1-6a5f-717b318eb08c/volume/tornado_1606942307627/work
|
||||||
|
traitlets @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_0dtilxc0bw/croot/traitlets_1671143889152/work
|
||||||
|
Twisted==22.10.0
|
||||||
|
typing==3.7.4.3
|
||||||
|
typing_extensions @ file:///opt/conda/conda-bld/typing_extensions_1647553014482/work
|
||||||
|
urllib3 @ file:///opt/conda/conda-bld/urllib3_1643638302206/work
|
||||||
|
w3lib==2.1.1
|
||||||
|
wcwidth @ file:///Users/ktietz/demo/mc3/conda-bld/wcwidth_1629357192024/work
|
||||||
|
webencodings==0.5.1
|
||||||
|
zipp @ file:///private/var/folders/sy/f16zz6x50xz3113nwtb9bvq00000gp/T/abs_b71z79bye2/croot/zipp_1672387125902/work
|
||||||
|
zope.interface==6.0
|
|
@ -0,0 +1,18 @@
|
||||||
|
#!/bin/bash
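# Start a local Splash container, run the OliveYoung and TikTok hashtag spiders against it, then stop and remove the container.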
|
||||||
|
|
||||||
|
docker rm splash-local
|
||||||
|
|
||||||
|
docker pull scrapinghub/splash
|
||||||
|
|
||||||
|
docker run --name splash-local -p 8050:8050 -d scrapinghub/splash
|
||||||
|
|
||||||
|
sleep 10
|
||||||
|
|
||||||
|
scrapy crawl oliveyoung_product
|
||||||
|
|
||||||
|
sleep 10
|
||||||
|
|
||||||
|
scrapy crawl tiktok_hashtag
|
||||||
|
|
||||||
|
docker stop splash-local
|
||||||
|
docker rm splash-local
|
|
@ -0,0 +1,117 @@
|
||||||
|
# Scrapy settings for raena_crawler project
|
||||||
|
#
|
||||||
|
# For simplicity, this file contains only settings considered important or
|
||||||
|
# commonly used. You can find more settings consulting the documentation:
|
||||||
|
#
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/settings.html
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||||
|
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
|
||||||
|
BOT_NAME = "raena_crawler"
|
||||||
|
|
||||||
|
SPIDER_MODULES = ["raena_crawler.spiders"]
|
||||||
|
NEWSPIDER_MODULE = "raena_crawler.spiders"
|
||||||
|
|
||||||
|
|
||||||
|
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||||
|
#USER_AGENT = "raena_crawler (+http://www.yourdomain.com)"
|
||||||
|
|
||||||
|
# Obey robots.txt rules
|
||||||
|
ROBOTSTXT_OBEY = True
|
||||||
|
|
||||||
|
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
||||||
|
CONCURRENT_REQUESTS = 10
|
||||||
|
|
||||||
|
# Configure a delay for requests for the same website (default: 0)
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
|
||||||
|
# See also autothrottle settings and docs
|
||||||
|
DOWNLOAD_DELAY = 3
|
||||||
|
# The download delay setting will honor only one of:
|
||||||
|
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
||||||
|
#CONCURRENT_REQUESTS_PER_IP = 16
|
||||||
|
|
||||||
|
# Disable cookies (enabled by default)
|
||||||
|
#COOKIES_ENABLED = False
|
||||||
|
|
||||||
|
# Disable Telnet Console (enabled by default)
|
||||||
|
#TELNETCONSOLE_ENABLED = False
|
||||||
|
|
||||||
|
# Override the default request headers:
|
||||||
|
#DEFAULT_REQUEST_HEADERS = {
|
||||||
|
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||||
|
# "Accept-Language": "en",
|
||||||
|
#}
|
||||||
|
|
||||||
|
# Enable or disable spider middlewares
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
|
||||||
|
#SPIDER_MIDDLEWARES = {
|
||||||
|
# "raena_crawler.middlewares.RaenaCrawlerSpiderMiddleware": 543,
|
||||||
|
#}
|
||||||
|
|
||||||
|
# Enable or disable downloader middlewares
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
|
||||||
|
#DOWNLOADER_MIDDLEWARES = {
|
||||||
|
# "raena_crawler.middlewares.RaenaCrawlerDownloaderMiddleware": 543,
|
||||||
|
#}
|
||||||
|
|
||||||
|
# Enable or disable extensions
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/extensions.html
|
||||||
|
#EXTENSIONS = {
|
||||||
|
# "scrapy.extensions.telnet.TelnetConsole": None,
|
||||||
|
#}
|
||||||
|
|
||||||
|
# Configure item pipelines
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
|
||||||
|
ITEM_PIPELINES = {
|
||||||
|
'raena_crawler.pipelines.OliveYoungPipeline': 300,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Enable and configure the AutoThrottle extension (disabled by default)
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
|
||||||
|
AUTOTHROTTLE_ENABLED = True
|
||||||
|
# The initial download delay
|
||||||
|
#AUTOTHROTTLE_START_DELAY = 5
|
||||||
|
# The maximum download delay to be set in case of high latencies
|
||||||
|
#AUTOTHROTTLE_MAX_DELAY = 60
|
||||||
|
# The average number of requests Scrapy should be sending in parallel to
|
||||||
|
# each remote server
|
||||||
|
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
||||||
|
# Enable showing throttling stats for every response received:
|
||||||
|
#AUTOTHROTTLE_DEBUG = False
|
||||||
|
|
||||||
|
# Enable and configure HTTP caching (disabled by default)
|
||||||
|
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||||
|
#HTTPCACHE_ENABLED = True
|
||||||
|
#HTTPCACHE_EXPIRATION_SECS = 0
|
||||||
|
#HTTPCACHE_DIR = "httpcache"
|
||||||
|
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||||
|
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
|
||||||
|
|
||||||
|
# Set settings whose default value is deprecated to a future-proof value
|
||||||
|
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
|
||||||
|
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
|
||||||
|
FEED_EXPORT_ENCODING = "utf-8"
|
||||||
|
|
||||||
|
SPLASH_URL = 'http://localhost:8050'
|
||||||
|
DOWNLOADER_MIDDLEWARES = {
|
||||||
|
'scrapy_splash.SplashCookiesMiddleware': 723,
|
||||||
|
'scrapy_splash.SplashMiddleware': 725,
|
||||||
|
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
|
||||||
|
'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
|
||||||
|
'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
|
||||||
|
'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
|
||||||
|
'rotating_proxies.middlewares.RotatingProxyMiddleware': 610,
|
||||||
|
'rotating_proxies.middlewares.BanDetectionMiddleware': 620,
|
||||||
|
}
|
||||||
|
SPIDER_MIDDLEWARES = {
|
||||||
|
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
|
||||||
|
}
|
||||||
|
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
|
||||||
|
|
||||||
|
FAKEUSERAGENT_PROVIDERS = [
|
||||||
|
'scrapy_fake_useragent.providers.FakeUserAgentProvider', # This is the first provider we'll try
|
||||||
|
'scrapy_fake_useragent.providers.FakerProvider', # If FakeUserAgentProvider fails, we'll use faker to generate a user-agent string for us
|
||||||
|
'scrapy_fake_useragent.providers.FixedUserAgentProvider', # Fall back to USER_AGENT value
|
||||||
|
]
|
||||||
|
|
||||||
|
USER_AGENT = 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148'
|
|
@ -0,0 +1,4 @@
|
||||||
|
# This package will contain the spiders of your Scrapy project
|
||||||
|
#
|
||||||
|
# Please refer to the documentation for information on how to create and manage
|
||||||
|
# your spiders.
|
|
@ -0,0 +1,94 @@
|
||||||
|
import scrapy
|
||||||
|
from scrapy_splash import SplashRequest
|
||||||
|
import psycopg2
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
|
||||||
|
"db_port": "5432",
|
||||||
|
"db": "analytics",
|
||||||
|
"db_user": "dbadmin",
|
||||||
|
"db_pass": "5qCif6eyY3Kmg4z"
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class OliveyoungSpider(scrapy.Spider):
|
||||||
|
name = 'oliveyoung_product'
|
||||||
|
allowed_domains = ['global.oliveyoung.com']
|
||||||
|
|
||||||
|
def start_requests(self):
|
||||||
|
url = 'https://global.oliveyoung.com/'
|
||||||
|
yield SplashRequest(url, self.parse, args={'wait': 5})
|
||||||
|
|
||||||
|
|
||||||
|
def parse(self, response):
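# For each configured homepage section, extract brand, product name and prices from the Splash-rendered HTML and insert or update the row in raena_spider_management.oliveyoung_products.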
|
||||||
|
|
||||||
|
conn = psycopg2.connect(database=config.get('db'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
|
||||||
|
logging.info(conn)
|
||||||
|
conn.autocommit = True
|
||||||
|
cur = conn.cursor()
|
||||||
|
|
||||||
|
product_sections = [
|
||||||
|
('Best Sellers','#\#tab12'),
|
||||||
|
('MDS PICK','#\#tab22'),
|
||||||
|
('K-POP','div.main-section:nth-child(6) > div:nth-child(2)'),
|
||||||
|
('Featured','.main-brand-banner'),
|
||||||
|
('RECOMMENDATION','div.main-section:nth-child(9) > div:nth-child(2)'),
|
||||||
|
('FEATURED BRANDS', '#featuredBrands > div:nth-child(2)')
|
||||||
|
]
|
||||||
|
|
||||||
|
for product_section in product_sections:
|
||||||
|
|
||||||
|
products = response.css(str(product_section[1]))
|
||||||
|
|
||||||
|
product_selector = '.wrap-prd-info'
|
||||||
|
brand_selector = '.list-thumb-tit::text'
|
||||||
|
|
||||||
|
if 'FEATURED BRANDS' in product_section[0]:
|
||||||
|
product_selector = '.fig-title.ellipsis'
|
||||||
|
brand_selector = '.fig-title.ellipsis::text'
|
||||||
|
|
||||||
|
for product in products:
|
||||||
|
items = product.css(product_selector)
|
||||||
|
for item in items:
|
||||||
|
|
||||||
|
product_brand = (item.css(brand_selector).extract_first("")).replace("'","").strip()
|
||||||
|
product_name = item.css('.list-thumb-info::text').extract_first("").replace("'","").strip()
|
||||||
|
original_price = item.css('.price-cost::text').extract_first("").strip()
|
||||||
|
discounted_price = item.css('.prd-list-amountDue::text').extract_first("").strip()
|
||||||
|
|
||||||
|
logging.info("Collecting data for: {}".format(product_name))
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
select product_section,product_brand,product_name from raena_spider_management.oliveyoung_products where product_section='{product_section[0]}' and product_brand='{product_brand}' and product_name='{product_name}'
|
||||||
|
"""
|
||||||
|
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
res = cur.fetchone()
|
||||||
|
|
||||||
|
if res:
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
update raena_spider_management.oliveyoung_products set original_price='{original_price}',
|
||||||
|
discounted_price='{discounted_price}', updatedat=now()
|
||||||
|
where product_section='{product_section[0]}' and product_brand='{product_brand}' and product_name='{product_name}'
|
||||||
|
"""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
insert into raena_spider_management.oliveyoung_products(product_section,product_brand,product_name,original_price,discounted_price,createdat,updatedat)
|
||||||
|
values('{product_section[0]}','{product_brand}','{product_name}','{original_price}','{discounted_price}',now(),now())
|
||||||
|
"""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
conn.close()
|
|
@ -0,0 +1,63 @@
|
||||||
|
# oliveyoung.py
|
||||||
|
import scrapy
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
class OliveYoungSpider(scrapy.Spider):
|
||||||
|
name = 'oliveyoung_bk'
|
||||||
|
start_urls = [
|
||||||
|
'https://global.oliveyoung.com/?gad=1&gclid=CjwKCAjwq4imBhBQEiwA9Nx1Bi5w7mSF9wgKTFqfX37hyG_c3ocYHldGoXbIX1XfYKQQFxLOPECJCxoCxpEQAvD_BwE']
|
||||||
|
|
||||||
|
def parse(self, response):
|
||||||
|
sections = {
|
||||||
|
"Best Sellers": "//div[@class='slick-slider-customized']/div[contains(@class,'slick-slide')]",
|
||||||
|
# "MD's Pick": "//section[@id='md_pick']/div[@class='item']/div[@class='product-item']",
|
||||||
|
# "Featured Brands": "//section[@id='brand_list']/div[@class='product-item']",
|
||||||
|
# "K-Pop": "//section[@id='kpop_list']/div[@class='product-item']",
|
||||||
|
# "INNISFREE": "//section[@id='brand_zone']/div[contains(@class,'brand-inn-store')]//div["
|
||||||
|
# "@class='product-item']",
|
||||||
|
# "Recommendation": "//section[@id='recommendation']/div[contains(@class,'product-item')]",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Extract data from each section
|
||||||
|
for section_name, section_xpath in sections.items():
|
||||||
|
products = response.xpath(section_xpath)
|
||||||
|
for product in products:
|
||||||
|
brand_name = product.xpath(".//span[@class='brand']/text()").get()
|
||||||
|
product_name = product.xpath(".//span[@class='name']/text()").get()
|
||||||
|
price = product.xpath(".//span[@class='num']/text()").get()
|
||||||
|
|
||||||
|
if brand_name:
|
||||||
|
yield {
|
||||||
|
"brand_name": brand_name.strip(),
|
||||||
|
"product_name": product_name.strip(),
|
||||||
|
"price": price.strip(),
|
||||||
|
"section": section_name,
|
||||||
|
}
|
||||||
|
|
||||||
|
# # Generate hashtags for each brand name
|
||||||
|
# hashtags = [word.lower() for word in brand_name.split()]
|
||||||
|
# hashtags = '#'.join(hashtags)
|
||||||
|
# yield {
|
||||||
|
# "brand_name": brand_name.strip(),
|
||||||
|
# "hashtags": f"#{hashtags}",
|
||||||
|
# }
|
||||||
|
#
|
||||||
|
# # Fetch views data from TikTok API using tiktok_api.py
|
||||||
|
# views_all, views = get_hashtag_views(hashtags)
|
||||||
|
# yield {
|
||||||
|
# "brand_name": brand_name.strip(),
|
||||||
|
# "hashtags": f"#{hashtags}",
|
||||||
|
# "views_all": views_all,
|
||||||
|
# "views": views,
|
||||||
|
# }
|
||||||
|
|
||||||
|
|
||||||
|
def get_hashtag_views(hashtag):
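# Query TikTok's creative-radar hashtag detail endpoint (7-day period) and return the video_views_all and video_views counts; the empty headers dict still has to be filled from a captured request.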
|
||||||
|
url = f'https://ads.tiktok.com/creative_radar_api/v1/popular_trend/hashtag/detail?period=7&hashtag_name={hashtag}&country_code=IS'
|
||||||
|
headers = {
|
||||||
|
# Add the headers from the CURL request here
|
||||||
|
}
|
||||||
|
response = requests.get(url, headers=headers)
|
||||||
|
data = response.json()
|
||||||
|
return data.get('hashtag', {}).get('video_views_all', 0), data.get('hashtag', {}).get('video_views', 0)
|
|
@ -0,0 +1,103 @@
|
||||||
|
import scrapy
|
||||||
|
import psycopg2
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"db_host": "analytics-db-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
|
||||||
|
"db_port": "5432",
|
||||||
|
"db": "analytics",
|
||||||
|
"db_user": "dbadmin",
|
||||||
|
"db_pass": "5qCif6eyY3Kmg4z"
|
||||||
|
}
|
||||||
|
|
||||||
|
class TiktokHashtag(scrapy.Spider):
|
||||||
|
name = 'tiktok_hashtag'
|
||||||
|
start_urls = ['https://ads.tiktok.com/business/creativecenter/hashtag/beautyofjoseon/pc/en?countryCode=ID&period=7']
|
||||||
|
|
||||||
|
def start_requests(self):
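# Pull distinct brand names from oliveyoung_products, derive hashtag candidates in SQL (see the sketch at the end of this file), and queue a request to each brand's TikTok Creative Center hashtag page.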
|
||||||
|
|
||||||
|
conn = psycopg2.connect(database=config.get('db'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
|
||||||
|
logging.info(conn)
|
||||||
|
conn.autocommit = True
|
||||||
|
cur = conn.cursor()
|
||||||
|
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
select distinct product_brand, hashtag from (
|
||||||
|
select distinct product_brand, replace(regexp_replace(lower(product_brand), '[^\w]+','','g'),' ','') hashtag
|
||||||
|
from raena_spider_management.oliveyoung_products
|
||||||
|
union
|
||||||
|
select distinct product_brand, replace(regexp_replace(lower(product_brand), '[^\w]+',' ','g'),' ','_') hashtag
|
||||||
|
from raena_spider_management.oliveyoung_products) a
|
||||||
|
order by product_brand
|
||||||
|
"""
|
||||||
|
|
||||||
|
logging.info(sql)
|
||||||
|
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
brands = cur.fetchall()
|
||||||
|
|
||||||
|
logging.info(brands)
|
||||||
|
|
||||||
|
for brand in brands:
|
||||||
|
url_hashtag = "https://ads.tiktok.com/business/creativecenter/hashtag/"+brand[1]+"/pc/en?countryCode=ID&period=7"
|
||||||
|
|
||||||
|
yield scrapy.Request(url_hashtag, self.get_hashtag_info, meta={'meta': brand})
|
||||||
|
time.sleep(random.randint(10,20))
|
||||||
|
|
||||||
|
def get_hashtag_info(self, response):
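# Scrape the post/view counters (last 7 days and overall) from the Creative Center page via XPath and upsert them into oliveyoung_brand_hashtag for the brand carried in response.meta.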
|
||||||
|
|
||||||
|
logging.info("Collecting hashTag info")
|
||||||
|
|
||||||
|
conn = psycopg2.connect(database=config.get('db'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
|
||||||
|
conn.autocommit = True
|
||||||
|
cur = conn.cursor()
|
||||||
|
|
||||||
|
brand = response.meta.get('meta')
|
||||||
|
|
||||||
|
post_last7days = "0"
|
||||||
|
post_overall = "0"
|
||||||
|
view_last7days = "0"
|
||||||
|
view_overall = "0"
|
||||||
|
|
||||||
|
try:
|
||||||
|
post_last7days = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[1]/div/div[1]/span[1]/text()').get()
|
||||||
|
post_overall = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[1]/div/div[3]/span[1]/text()').get()
|
||||||
|
|
||||||
|
view_last7days = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[2]/div/div[1]/span[1]/text()').get()
|
||||||
|
view_overall = response.xpath('/html/body/div[1]/div/main/div/div[1]/div/div[2]/div[1]/div[2]/div[2]/div/div[3]/span[1]/text()').get()
|
||||||
|
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
select product_brand,brand_hashtag from raena_spider_management.oliveyoung_brand_hashtag
|
||||||
|
where product_brand='{brand[0]}' and brand_hashtag='{brand[1]}'
|
||||||
|
"""
|
||||||
|
|
||||||
|
cur.execute(sql)
|
||||||
|
res = cur.fetchone()
|
||||||
|
|
||||||
|
if res:
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
update raena_spider_management.oliveyoung_brand_hashtag set posts='{post_last7days}', posts_total='{post_overall}',
|
||||||
|
views='{view_last7days}', views_overall='{view_overall}', updatedat=now()
|
||||||
|
where product_brand='{brand[0]}' and brand_hashtag='{brand[1]}'
|
||||||
|
"""
|
||||||
|
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = f"""
|
||||||
|
insert into raena_spider_management.oliveyoung_brand_hashtag(product_brand,brand_hashtag,posts,posts_total,views,views_overall,createdat,updatedat)
|
||||||
|
values('{brand[0]}','{brand[1]}','{post_last7days}','{post_overall}','{view_last7days}','{view_overall}',now(),now())
|
||||||
|
"""
|
||||||
|
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
conn.close()
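# Rough Python equivalent of the hashtag candidates derived by the SQL in start_requests above.
# This is a sketch for reference only; the crawler itself relies on the Postgres expressions.
def _hashtag_candidates(brand):
    import re
    lowered = brand.lower()
    stripped = re.sub(r'[^\w]+', '', lowered)                                 # "Beauty of Joseon" -> "beautyofjoseon"
    underscored = re.sub(r'[^\w]+', ' ', lowered).strip().replace(' ', '_')   # "Beauty of Joseon" -> "beauty_of_joseon"
    return {stripped, underscored}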
|
|
@ -0,0 +1,76 @@
|
||||||
|
***Run:***
|
||||||
|
1. Change config according to the crawler type.
|
||||||
|
2. run "python shopee_crawler.py"
|
||||||
|
|
||||||
|
***Config for Master:***
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"crawler_name": "raena_crawler_enginer_shopee",
|
||||||
|
"crawler_schema": "raena_spider_management",
|
||||||
|
"category_tab": "rce_category",
|
||||||
|
"tracker_tab": "crawler_tracker",
|
||||||
|
"product_tab": "rce_product",
|
||||||
|
"variant_tab": "rce_product_variant",
|
||||||
|
"brand_tab": "rce_brand",
|
||||||
|
"reseller_tab": "rce_reseller",
|
||||||
|
"reseller_store_tab": "rce_reseller_store",
|
||||||
|
"review_tab": "rce_ratings_reviews",
|
||||||
|
"product_per_category": "136",
|
||||||
|
"source_category": "11043145",
|
||||||
|
"db_user": "crawler",
|
||||||
|
"db_pass": "4Z063Zp9Aczv",
|
||||||
|
"database": "raena_db",
|
||||||
|
"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
|
||||||
|
"db_port": "5432",
|
||||||
|
"crawler_main": "1",
|
||||||
|
"crawler_slave_no": ""
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
***Config for Slave01:***
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"crawler_name": "raena_crawler_enginer_shopee",
|
||||||
|
"crawler_schema": "raena_spider_management",
|
||||||
|
"category_tab": "rce_category",
|
||||||
|
"tracker_tab": "crawler_tracker",
|
||||||
|
"product_tab": "rce_product",
|
||||||
|
"variant_tab": "rce_product_variant",
|
||||||
|
"brand_tab": "rce_brand",
|
||||||
|
"reseller_tab": "rce_reseller",
|
||||||
|
"reseller_store_tab": "rce_reseller_store",
|
||||||
|
"review_tab": "rce_ratings_reviews",
|
||||||
|
"product_per_category": "136",
|
||||||
|
"source_category": "11043145",
|
||||||
|
"db_user": "crawler",
|
||||||
|
"db_pass": "4Z063Zp9Aczv",
|
||||||
|
"database": "raena_db",
|
||||||
|
"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
|
||||||
|
"db_port": "5432",
|
||||||
|
"crawler_main": "0",
|
||||||
|
"crawler_slave_no": "1"
|
||||||
|
}
|
||||||
|
|
||||||
|
***Config for Slave02:***
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"crawler_name": "raena_crawler_enginer_shopee",
|
||||||
|
"crawler_schema": "raena_spider_management",
|
||||||
|
"category_tab": "rce_category",
|
||||||
|
"tracker_tab": "crawler_tracker",
|
||||||
|
"product_tab": "rce_product",
|
||||||
|
"variant_tab": "rce_product_variant",
|
||||||
|
"brand_tab": "rce_brand",
|
||||||
|
"reseller_tab": "rce_reseller",
|
||||||
|
"reseller_store_tab": "rce_reseller_store",
|
||||||
|
"review_tab": "rce_ratings_reviews",
|
||||||
|
"product_per_category": "136",
|
||||||
|
"source_category": "11043145",
|
||||||
|
"db_user": "crawler",
|
||||||
|
"db_pass": "4Z063Zp9Aczv",
|
||||||
|
"database": "raena_db",
|
||||||
|
"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
|
||||||
|
"db_port": "5432",
|
||||||
|
"crawler_main": "0",
|
||||||
|
"crawler_slave_no": "2"
|
||||||
|
}
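
***Example (sketch):***

A minimal illustration of how the master/slave switch in the configs above can be read before handing the dict to the crawler. The file name and the way shopee_crawler.py actually consumes the dict are assumptions for illustration, not part of this README:

    import json

    with open("shopee_config.json") as f:   # assumed file holding one of the configs above
        config = json.load(f)

    if config["crawler_main"] == "1":
        print("running as master")
    else:
        print("running as slave #" + config["crawler_slave_no"])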
|
|
@ -0,0 +1,25 @@
|
||||||
|
{
|
||||||
|
"crawler_name": "raena_crawler_enginer_shopee",
|
||||||
|
"crawler_schema": "raena_spider_management",
|
||||||
|
"category_tab": "rce_category",
|
||||||
|
"tracker_tab": "crawler_tracker",
|
||||||
|
"product_tab": "rce_product",
|
||||||
|
"variant_tab": "rce_product_variant",
|
||||||
|
"brand_tab": "rce_brand",
|
||||||
|
"reseller_tab": "rce_reseller",
|
||||||
|
"reseller_store_tab": "rce_reseller_store",
|
||||||
|
"review_tab": "rce_ratings_reviews",
|
||||||
|
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
|
||||||
|
"review_producttags_tab": "rce_ratings_reviews_producttags",
|
||||||
|
"review_tags": "rce_tags",
|
||||||
|
"source_tab": "rce_source",
|
||||||
|
"product_per_category": "136",
|
||||||
|
"source_category": "11043145",
|
||||||
|
"db_user": "crawler",
|
||||||
|
"db_pass": "4Z063Zp9Aczv",
|
||||||
|
"database": "raena_db",
|
||||||
|
"db_host": "raen-prd-sg-aurora-pg-rds-cluster-instance-1.cd7qipz3esdx.ap-southeast-1.rds.amazonaws.com",
|
||||||
|
"db_port": "5432",
|
||||||
|
"crawler_main": "1",
|
||||||
|
"crawler_slave_no": ""
|
||||||
|
}
|
|
@ -0,0 +1,147 @@
|
||||||
|
alembic==1.9.3
|
||||||
|
anyio==3.6.2
|
||||||
|
apache-airflow==2.5.1
|
||||||
|
apache-airflow-providers-amazon==7.2.0
|
||||||
|
apache-airflow-providers-common-sql==1.3.3
|
||||||
|
apache-airflow-providers-ftp==3.3.1
|
||||||
|
apache-airflow-providers-http==4.1.1
|
||||||
|
apache-airflow-providers-imap==3.1.1
|
||||||
|
apache-airflow-providers-sqlite==3.3.1
|
||||||
|
apispec==3.3.2
|
||||||
|
argcomplete==1.12.3
|
||||||
|
asn1crypto==1.5.1
|
||||||
|
attrs==22.2.0
|
||||||
|
Babel==2.11.0
|
||||||
|
beautifulsoup4==4.11.2
|
||||||
|
blinker==1.5
|
||||||
|
boto3==1.26.69
|
||||||
|
botocore==1.29.69
|
||||||
|
cached-property==1.5.2
|
||||||
|
cachelib==0.9.0
|
||||||
|
cattrs==22.2.0
|
||||||
|
certifi==2022.12.7
|
||||||
|
cffi==1.15.1
|
||||||
|
chardet==3.0.4
|
||||||
|
charset-normalizer==3.0.1
|
||||||
|
click==8.1.3
|
||||||
|
clickclick==20.10.2
|
||||||
|
colorama==0.4.6
|
||||||
|
colorlog==4.0.2
|
||||||
|
configparser==3.5.3
|
||||||
|
ConfigUpdater==3.1.1
|
||||||
|
connexion==2.14.2
|
||||||
|
cron-descriptor==1.2.35
|
||||||
|
croniter==0.3.37
|
||||||
|
cryptography==39.0.1
|
||||||
|
decorator==5.1.1
|
||||||
|
defusedxml==0.7.1
|
||||||
|
Deprecated==1.2.13
|
||||||
|
dill==0.3.6
|
||||||
|
dnspython==2.3.0
|
||||||
|
docutils==0.19
|
||||||
|
email-validator==1.3.1
|
||||||
|
exceptiongroup==1.1.0
|
||||||
|
Flask==2.2.2
|
||||||
|
Flask-Admin==1.5.4
|
||||||
|
Flask-AppBuilder==4.1.4
|
||||||
|
Flask-Babel==1.0.0
|
||||||
|
Flask-Caching==2.0.2
|
||||||
|
Flask-JWT-Extended==4.4.4
|
||||||
|
Flask-Login==0.6.2
|
||||||
|
Flask-OpenID==1.3.0
|
||||||
|
Flask-Session==0.4.0
|
||||||
|
Flask-SQLAlchemy==2.5.1
|
||||||
|
flask-swagger==0.2.14
|
||||||
|
Flask-WTF==1.1.1
|
||||||
|
funcsigs==1.0.2
|
||||||
|
future==0.18.3
|
||||||
|
graphviz==0.20.1
|
||||||
|
greenlet==2.0.2
|
||||||
|
gunicorn==20.1.0
|
||||||
|
h11==0.14.0
|
||||||
|
httpcore==0.16.3
|
||||||
|
httpx==0.23.3
|
||||||
|
idna==2.10
|
||||||
|
importlib-resources==1.5.0
|
||||||
|
inflection==0.5.1
|
||||||
|
iso8601==1.1.0
|
||||||
|
itsdangerous==2.1.2
|
||||||
|
Jinja2==3.1.2
|
||||||
|
jmespath==0.10.0
|
||||||
|
json-merge-patch==0.2
|
||||||
|
jsonpath-ng==1.5.3
|
||||||
|
jsonschema==3.2.0
|
||||||
|
lazy-object-proxy==1.4.3
|
||||||
|
linkify-it-py==2.0.0
|
||||||
|
lockfile==0.12.2
|
||||||
|
lxml==4.9.2
|
||||||
|
Mako==1.2.4
|
||||||
|
Markdown==3.4.1
|
||||||
|
markdown-it-py==2.1.0
|
||||||
|
MarkupSafe==2.1.2
|
||||||
|
marshmallow==3.19.0
|
||||||
|
marshmallow-enum==1.5.1
|
||||||
|
marshmallow-oneofschema==3.0.1
|
||||||
|
marshmallow-sqlalchemy==0.23.1
|
||||||
|
mdit-py-plugins==0.3.3
|
||||||
|
mdurl==0.1.2
|
||||||
|
mypy-boto3-appflow==1.26.53
|
||||||
|
mypy-boto3-rds==1.26.47
|
||||||
|
mypy-boto3-redshift-data==1.26.30
|
||||||
|
natsort==8.2.0
|
||||||
|
numpy==1.24.2
|
||||||
|
packaging==23.0
|
||||||
|
pandas==1.5.3
|
||||||
|
pathspec==0.9.0
|
||||||
|
pendulum==2.1.2
|
||||||
|
piapy==0.2.0
|
||||||
|
pluggy==1.0.0
|
||||||
|
ply==3.11
|
||||||
|
prison==0.2.1
|
||||||
|
protobuf==4.21.12
|
||||||
|
psutil==5.9.4
|
||||||
|
pycparser==2.21
|
||||||
|
Pygments==2.14.0
|
||||||
|
PyJWT==2.6.0
|
||||||
|
pyrsistent==0.19.3
|
||||||
|
python-daemon==2.3.2
|
||||||
|
python-dateutil==2.8.2
|
||||||
|
python-dotenv==0.21.1
|
||||||
|
python-nvd3==0.15.0
|
||||||
|
python-slugify==8.0.0
|
||||||
|
python3-openid==3.2.0
|
||||||
|
pytz==2022.7.1
|
||||||
|
pytzdata==2020.1
|
||||||
|
PyYAML==6.0
|
||||||
|
redshift-connector==2.0.910
|
||||||
|
requests==2.28.2
|
||||||
|
requests-toolbelt==0.10.1
|
||||||
|
rfc3986==1.5.0
|
||||||
|
rich==13.3.1
|
||||||
|
s3transfer==0.6.0
|
||||||
|
scramp==1.4.4
|
||||||
|
setproctitle==1.3.2
|
||||||
|
six==1.16.0
|
||||||
|
sniffio==1.3.0
|
||||||
|
soupsieve==2.3.2.post1
|
||||||
|
SQLAlchemy==1.4.9
|
||||||
|
SQLAlchemy-JSONField==1.0.1.post0
|
||||||
|
sqlalchemy-redshift==0.8.12
|
||||||
|
SQLAlchemy-Utils==0.40.0
|
||||||
|
sqlparse==0.4.3
|
||||||
|
swagger-ui-bundle==0.0.9
|
||||||
|
tabulate==0.8.10
|
||||||
|
tenacity==8.2.1
|
||||||
|
termcolor==2.2.0
|
||||||
|
text-unidecode==1.3
|
||||||
|
thrift==0.16.0
|
||||||
|
typing_extensions==4.4.0
|
||||||
|
tzlocal==1.5.1
|
||||||
|
uc-micro-py==1.0.1
|
||||||
|
unicodecsv==0.14.1
|
||||||
|
urllib3==1.25.11
|
||||||
|
watchtower==2.0.1
|
||||||
|
Werkzeug==2.2.2
|
||||||
|
wrapt==1.14.1
|
||||||
|
WTForms==2.3.3
|
||||||
|
zope.deprecation==4.4.0
|
|
@ -0,0 +1,177 @@
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver import ActionChains, Keys
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import psycopg2
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
import bs4
|
||||||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
import random
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
|
||||||
|
class shopee_category_products:
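# Walks Shopee category listing pages, sorts them by top sales, and records product page URLs in the tracker table.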
|
||||||
|
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.crawler_name = self.config.get("crawler_name")
|
||||||
|
self.url = "https://shopee.co.id/"
|
||||||
|
self.product_limit = int(self.config.get("product_per_category"))
|
||||||
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
|
self.conn.autocommit = True
|
||||||
|
self.cur = self.conn.cursor()
|
||||||
|
sql = "delete from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where crawler_name='"+str(self.crawler_name)+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
print("Closing connection.....")
|
||||||
|
self.conn.close()
|
||||||
|
|
||||||
|
def browse_category_page(self):
|
||||||
|
op = webdriver.ChromeOptions()
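# Pick a random window size for this session (width 1024-1920, height 640-1280) and pass it to Chrome below.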
|
||||||
|
height = str(random.randint(640,1280))
|
||||||
|
width = str(random.randint(1024,1920))
|
||||||
|
op.add_argument("window-size="+width+","+hight+"")
|
||||||
|
op.add_experimental_option("useAutomationExtension", False)
|
||||||
|
op.add_argument('--no-sandbox')
|
||||||
|
op.add_argument('--disable-notifications')
|
||||||
|
op.add_argument("--lang=en-GB")
|
||||||
|
op.headless = True
|
||||||
|
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
|
||||||
|
|
||||||
|
driver.get("https://shopee.co.id")
|
||||||
|
time.sleep(5)
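# Locate a category tile on the home page via an absolute XPath (brittle if Shopee changes its markup); it is opened with a double-click below.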
|
||||||
|
cat = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div[3]/div[2]/div[1]/div/div/div[2]/div/div[1]/ul/li[2]/div/a[2]/div')
|
||||||
|
ActionChains(driver).move_to_element(cat).double_click().perform()
|
||||||
|
time.sleep(10)
|
||||||
|
driver.execute_script("document.body.style.zoom='15%'")
|
||||||
|
time.sleep(10)
|
||||||
|
|
||||||
|
filters = driver.find_elements(By.CLASS_NAME, 'shopee-sort-by-options__option')
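# 'Terlaris' is Indonesian for 'best selling'; clicking that sort option orders the listing by top sales before products are collected.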
|
||||||
|
for filter in filters:
|
||||||
|
if filter.text == 'Terlaris':
|
||||||
|
logging.info("Sorting data by top sales.......")
|
||||||
|
driver.execute_script("arguments[0].click();", filter)
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
lim = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div[1]/div[4]/div[2]/div/div[1]/div[2]/div/span[2]').text
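# This span appears to hold the total page count; it is used below to bound the pagination loop.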
|
||||||
|
|
||||||
|
|
||||||
|
cat = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div[1]/div[4]/div[1]/div[1]/div/div/div[1]/a').text
|
||||||
|
print("Collecting products for category: {}".format(str(cat)))
|
||||||
|
pg_cnt = 1
|
||||||
|
print("Collecting data for page: {}".format(str(pg_cnt)))
|
||||||
|
|
||||||
|
cnt = 0
|
||||||
|
skip = 0
|
||||||
|
cnt, skip = self.get_product(driver.page_source, cat, cnt, skip)
|
||||||
|
for i in range(int(lim)-1):
|
||||||
|
pg_cnt += 1
|
||||||
|
next = driver.find_element(By.XPATH,'/html/body/div[1]/div/div[2]/div/div[1]/div[4]/div[2]/div/div[1]/div[2]/button[2]')
|
||||||
|
driver.execute_script("arguments[0].click();", next)
|
||||||
|
time.sleep(5)
|
||||||
|
print("Collecting data for page: {}".format(str(pg_cnt)))
|
||||||
|
cnt, skip = self.get_product(driver.page_source, cat, cnt, skip)
|
||||||
|
if cnt >=self.product_limit:
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
|
more_cat = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div[4]/div[1]/div[1]/div/div/div[2]/div/div[1]/div')
|
||||||
|
driver.execute_script("arguments[0].click();", more_cat)
|
||||||
|
|
||||||
|
time.sleep(10)
|
||||||
|
|
||||||
|
elements = driver.find_elements(By.CLASS_NAME, 'shopee-category-list__sub-category')
|
||||||
|
|
||||||
|
for element in elements:
|
||||||
|
driver.execute_script("arguments[0].click();", element)
|
||||||
|
time.sleep(5)
|
||||||
|
filters = driver.find_elements(By.CLASS_NAME, 'shopee-sort-by-options__option')
|
||||||
|
for filter in filters:
|
||||||
|
if filter.text == 'Terlaris':
|
||||||
|
logging.info("Sorting data by top sales.......")
|
||||||
|
driver.execute_script("arguments[0].click();", filter)
|
||||||
|
time.sleep(5)
|
||||||
|
lim = driver.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div/div[3]/div[2]/div/div[1]/div[2]/div/span[2]').text
|
||||||
|
|
||||||
|
|
||||||
|
print("Collecting products for subcategory: {}".format(str(element.text)))
|
||||||
|
pg_cnt = 1
|
||||||
|
print("Collecting data for page: {}".format(str(pg_cnt)))
|
||||||
|
|
||||||
|
cnt = 0
|
||||||
|
skip = 0
|
||||||
|
cnt, skip = self.get_product(driver.page_source, element.text, cnt, skip)
|
||||||
|
for i in range(int(lim)-1):
|
||||||
|
pg_cnt += 1
|
||||||
|
next = driver.find_element(By.XPATH,'/html/body/div[1]/div/div[2]/div/div/div[3]/div[2]/div/div[1]/div[2]/button[2]')
|
||||||
|
driver.execute_script("arguments[0].click();", next)
|
||||||
|
time.sleep(5)
|
||||||
|
print("Collecting data for page: {}".format(str(pg_cnt)))
|
||||||
|
cnt, skip = self.get_product(driver.page_source, element.text, cnt, skip)
|
||||||
|
if cnt >=self.product_limit:
|
||||||
|
break
|
||||||
|
|
||||||
|
time.sleep(random.randint(20,35))
|
||||||
|
|
||||||
|
def get_product(self, page_source, cat, cnt_main, skip_main):
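# Parse one listing page with BeautifulSoup and upsert each product URL into the tracker table; returns the running collected/skipped counts.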
|
||||||
|
try:
|
||||||
|
#Fetch page source
|
||||||
|
data = page_source
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
|
#Fetch data from page source
|
||||||
|
try:
|
||||||
|
soup = bs4.BeautifulSoup(data,features="lxml")
|
||||||
|
all_product = soup.find_all('div',{'class':"col-xs-2-4 shopee-search-item-result__item"})
|
||||||
|
|
||||||
|
cnt = cnt_main
|
||||||
|
skip = skip_main
|
||||||
|
|
||||||
|
for product in all_product:
|
||||||
|
|
||||||
|
try:
|
||||||
|
|
||||||
|
product_link_element = product.find('a')
|
||||||
|
product_page_url = product_link_element.get('href')
|
||||||
|
product_page_url = ("https://shopee.co.id"+product_page_url).replace("'","''")
|
||||||
|
product_page_url_hash = hashlib.md5(product_page_url.encode('utf-8')).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
ids = ((product_page_url.split('-i.')[1]).split('?')[0]).split('.')
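# Shopee product URLs end in '-i.<shopid>.<itemid>', so ids[0] is the shop id and ids[1] the item id.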
|
||||||
|
itemid = ids[1]
|
||||||
|
shopid = ids[0]
|
||||||
|
flag = 0
|
||||||
|
|
||||||
|
#print("itemid: {}; shopid: {}".format(str(itemid), str(shopid)))
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where itemid='"+itemid+"' and shopid='"+shopid+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchall()
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+"(crawler_name,keyword,shopid,itemid,product_page_url,product_page_url_hash,flag) values('"+str(self.crawler_name)+"','"+str(cat)+"',"+str(shopid)+","+str(itemid)+",'"+product_page_url+"','"+product_page_url_hash+"',"+str(flag)+")"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
cnt += 1
|
||||||
|
if cnt >=self.product_limit:
|
||||||
|
break
|
||||||
|
|
||||||
|
#conn.commit()
|
||||||
|
else:
|
||||||
|
#print("Already collected. Skipping")
|
||||||
|
skip += 1
|
||||||
|
except Exception as e:
|
||||||
|
print("ERROR: {}".format(str(e)))
|
||||||
|
|
||||||
|
print("Total Items: {}\nTotal Collected: {}\nTotal Skipped: {}".format(str(len(all_product)),str(cnt), str(skip)))
|
||||||
|
return cnt, skip
|
||||||
|
except Exception as e:
|
||||||
|
print("Error: {}".format(str(e)))
|
||||||
|
|
||||||
|
except:
|
||||||
|
print("ERROR: Data cannot be collected.")
|
||||||
|
|
|
@ -0,0 +1,213 @@
|
||||||
|
from shopee_sub_categories import shopee_sub_categories
|
||||||
|
from shopee_category_products import shopee_category_products
|
||||||
|
from shopee_products import shopee_products
|
||||||
|
import logging
|
||||||
|
import psycopg2
|
||||||
|
import json
|
||||||
|
|
||||||
|
###### Logger ######
|
||||||
|
format = "%(asctime)s: %(message)s"
|
||||||
|
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
config = {}
|
||||||
|
|
||||||
|
def get_sub_category():
|
||||||
|
sub_cat = shopee_sub_categories(config)
|
||||||
|
sub_cat.get_sub_categories()
|
||||||
|
|
||||||
|
|
||||||
|
def get_category_products(cur, slave01, slave02):
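# Crawl the category listings for product URLs, then raise the slave01/slave02 flag rows to 1 so the slave crawlers can start.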
|
||||||
|
products = shopee_category_products(config)
|
||||||
|
products.browse_category_page()
|
||||||
|
|
||||||
|
if not slave01:
|
||||||
|
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave01',1)"
|
||||||
|
cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=1 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
if not slave02:
|
||||||
|
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave02',1)"
|
||||||
|
cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=1 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
def get_products_info():
|
||||||
|
product_info = shopee_products(config)
|
||||||
|
product_info.get_shopee_products()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
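# Entry point: conf.json decides whether this process runs as the master crawler or as slave 1 / slave 2.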
|
||||||
|
|
||||||
|
crawler_main = int(config.get('crawler_main'))
|
||||||
|
crawler_slave_no = int(config.get('crawler_slave_no')) if config.get('crawler_slave_no') else None
|
||||||
|
|
||||||
|
if crawler_main:
|
||||||
|
crawler_master()
|
||||||
|
else:
|
||||||
|
if crawler_slave_no == 1:
|
||||||
|
crawler_slave1()
|
||||||
|
elif crawler_slave_no ==2:
|
||||||
|
crawler_slave2()
|
||||||
|
|
||||||
|
def crawler_master():
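# The master tracks its progress in a 'flag' row of the tracker table: 0 = starting, 1 = sub-categories done, 2 = category products done, 3 = product details done; once both slaves report 2 it resets to 0 and restarts.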
|
||||||
|
conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
|
||||||
|
conn.autocommit = True
|
||||||
|
cur = conn.cursor()
|
||||||
|
|
||||||
|
sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
res = cur.fetchone()
|
||||||
|
|
||||||
|
sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
|
||||||
|
cur.execute(sql)
|
||||||
|
slave01 = cur.fetchone()
|
||||||
|
|
||||||
|
sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
|
||||||
|
cur.execute(sql)
|
||||||
|
slave02 = cur.fetchone()
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_master',0)"
|
||||||
|
cur.execute(sql)
|
||||||
|
if not slave01:
|
||||||
|
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave01',0)"
|
||||||
|
cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
if not slave02:
|
||||||
|
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave02',0)"
|
||||||
|
cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
get_sub_category()
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=1 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
get_category_products(cur, slave01, slave02)
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
get_products_info()
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
else:
|
||||||
|
if res[2]==0:
|
||||||
|
if not slave01:
|
||||||
|
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave01',0)"
|
||||||
|
cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
if not slave02:
|
||||||
|
sql = "insert into "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" (crawler_name,keyword,flag) values('flag','"+config.get('crawler_name')+"_slave02',0)"
|
||||||
|
cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
get_sub_category()
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=1 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
get_category_products(cur, slave01, slave02)
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
get_products_info()
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
elif res[2]==1:
|
||||||
|
get_category_products(cur, slave01, slave02)
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
|
||||||
|
get_products_info()
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
elif res[2]==2:
|
||||||
|
get_products_info()
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=3 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
elif res[2]==3:
|
||||||
|
|
||||||
|
if slave01[2]==2 and slave02[2]==2:
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=0 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_master'"
|
||||||
|
cur.execute(sql)
|
||||||
|
main()
|
||||||
|
else:
|
||||||
|
logging.info("Slaves are working.....")
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
def crawler_slave1():
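# A slave only runs when its own flag row equals 1; it then scrapes product details and marks the row 2 to signal completion.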
|
||||||
|
conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
|
||||||
|
conn.autocommit = True
|
||||||
|
cur = conn.cursor()
|
||||||
|
|
||||||
|
sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
|
||||||
|
cur.execute(sql)
|
||||||
|
res = cur.fetchone()
|
||||||
|
|
||||||
|
if res:
|
||||||
|
if res[2]==1:
|
||||||
|
get_products_info()
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave01'"
|
||||||
|
cur.execute(sql)
|
||||||
|
else:
|
||||||
|
logging.info("Slave02 or Master are working.....")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
def crawler_slave2():
|
||||||
|
conn = psycopg2.connect(database=config.get('database'), user=config.get('db_user'), password=config.get('db_pass'), host=config.get('db_host'), port=config.get('db_port'))
|
||||||
|
conn.autocommit = True
|
||||||
|
cur = conn.cursor()
|
||||||
|
|
||||||
|
sql = "select crawler_name,keyword,flag from "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
|
||||||
|
cur.execute(sql)
|
||||||
|
res = cur.fetchone()
|
||||||
|
|
||||||
|
if res:
|
||||||
|
if res[2]==1:
|
||||||
|
get_products_info()
|
||||||
|
sql = "update "+config.get('crawler_schema')+"."+config.get('tracker_tab')+" set flag=2 where crawler_name='flag' and keyword='"+config.get('crawler_name')+"_slave02'"
|
||||||
|
cur.execute(sql)
|
||||||
|
else:
|
||||||
|
logging.info("Slave01 or Master are working.....")
|
||||||
|
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
logging.info("Starting Shopee Crawler.......")
|
||||||
|
try:
|
||||||
|
logging.info("Loading config file.......")
|
||||||
|
with open("conf.json", "r") as jsonfile:
|
||||||
|
config = json.load(jsonfile)
|
||||||
|
logging.info("Config file loaded.......")
|
||||||
|
|
||||||
|
main()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
#logging.info("Error: ".format(e))
|
||||||
|
logging.info("Cannot load cofig file. Please check. Exiting......")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,587 @@
|
||||||
|
import logging
|
||||||
|
import psycopg2
|
||||||
|
|
||||||
|
###### Logger ######
|
||||||
|
format = "%(asctime)s: %(message)s"
|
||||||
|
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
class shopee_db_writer:
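# Writes scraped Shopee entities to Postgres; every public method upserts one main table and copies the affected row into a matching aud_* audit table.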
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
|
self.conn.autocommit = True
|
||||||
|
self.cur = self.conn.cursor()
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
logging.info("Closing connection.....")
|
||||||
|
self.conn.close()
|
||||||
|
|
||||||
|
def rce_category(self, data):
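# Three cases: unseen category -> insert plus audit copy; unchanged category -> only bump updatedat; changed category -> update the row and append a fresh audit copy.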
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where rce_source_category_id = "+str(data['rce_source_category_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
cat_name = data['category_name'].replace("'","''")
|
||||||
|
cat_url = data['category_page_url'].replace("'","''")
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" (parent_category_id,rce_source_id," \
|
||||||
|
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name) values (" \
|
||||||
|
+str(data['parent_category_id'])+","+str(data['rce_source_id'])+", "+str(data['rce_source_category_id'])+", "+str(data['rce_source_status'])+", " \
|
||||||
|
"'"+str(cat_url)+"', '"+str(data['category_page_url_hash'])+"', '"+str(cat_name)+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
|
||||||
|
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
|
||||||
|
"select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
|
||||||
|
"category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
|
||||||
|
"where rce_source_category_id = "+ str(data['rce_source_category_id'])
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \
|
||||||
|
str(data['category_page_url'])==str(res[5]):
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_source_category_id = "+ str(res[3])
|
||||||
|
logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" set parent_category_id = " \
|
||||||
|
""+str(data['parent_category_id'])+", rce_source_category_id = "+str(data['rce_source_category_id'])+", " \
|
||||||
|
"category_name='"+str(cat_name)+"', category_page_url='"+str(cat_url)+"', " \
|
||||||
|
"category_page_url_hash='"+str(data['category_page_url_hash'])+"', updatedat=now() where " \
|
||||||
|
"rce_source_category_id = "+ str(res[3])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(id,parent_category_id,rce_source_id," \
|
||||||
|
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat) " \
|
||||||
|
"select id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
|
||||||
|
"category_name,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
|
||||||
|
"where rce_source_category_id = "+ str(res[3])
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_product(self, data):
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_id = "+str(data['rce_source_product_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
data['product_page_url'] = data['product_page_url'].replace("'","''")
|
||||||
|
data['rce_source_product_name'] = data['rce_source_product_name'].replace("'","''")
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" (rce_source_product_id," \
|
||||||
|
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
|
||||||
|
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
|
||||||
|
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
|
||||||
|
"ships_from) values("+str(data['rce_source_product_id'])+","+str(data['rce_source_product_status'])+",'"+str(data['product_page_url'])+"'," \
|
||||||
|
"'"+str(data['product_page_url_hash'])+"',"+str(data['rce_category_id'])+","+str(data['rce_brand_id'])+","+str(data['rce_store_id'])+"," \
|
||||||
|
"'"+str(data['rce_source_product_name'])+"','"+str(data['product_images'])+"','"+str(data['product_description'])+"',"+str(data['product_sold_total'])+"," \
|
||||||
|
""+str(data['product_sold'])+",'"+str(data['product_price_min'])+"','"+str(data['product_price_min_before_discount'])+"','"+str(data['product_price_max'])+"'," \
|
||||||
|
"'"+str(data['product_price_max_before_discount'])+"','"+str(data['ratings'])+"','"+str(data['ships_from'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
|
||||||
|
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
|
||||||
|
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
|
||||||
|
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
|
||||||
|
"ships_from,createdat,updatedat) select id,rce_source_product_id," \
|
||||||
|
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
|
||||||
|
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
|
||||||
|
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
|
||||||
|
"ships_from,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
|
||||||
|
"rce_source_product_id="+str(data['rce_source_product_id'])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \
|
||||||
|
str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \
|
||||||
|
str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \
|
||||||
|
str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \
|
||||||
|
str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \
|
||||||
|
str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) and \
|
||||||
|
str(data['ships_from'])==str(res[18]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_source_product_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" set rce_source_product_id="+str(data['rce_source_product_id'])+"," \
|
||||||
|
"rce_source_product_status="+str(data['rce_source_product_status'])+",product_page_url='"+str(data['product_page_url'])+"',product_page_url_hash= " \
|
||||||
|
"'"+str(data['product_page_url_hash'])+"',rce_category_id="+str(data['rce_category_id'])+",rce_brand_id="+str(data['rce_brand_id'])+"," \
|
||||||
|
"rce_store_id="+str(data['rce_store_id'])+",rce_source_product_name='"+str(data['rce_source_product_name'])+"',product_images='"+str(data['product_images'])+"'" \
|
||||||
|
",product_description='"+str(data['product_description'])+"',product_sold_total="+str(data['product_sold_total'])+",product_sold="+str(data['product_sold'])+"," \
|
||||||
|
"product_price_min='"+str(data['product_price_min'])+"',product_price_min_before_discount='"+str(data['product_price_min_before_discount'])+"'," \
|
||||||
|
"product_price_max='"+str(data['product_price_max'])+"',product_price_max_before_discount='"+str(data['product_price_max_before_discount'])+"',ratings='"+str(data['ratings'])+"'," \
|
||||||
|
"ships_from='"+str(data['ships_from'])+"', updatedat=now() where rce_source_product_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" (id,rce_source_product_id," \
|
||||||
|
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
|
||||||
|
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
|
||||||
|
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
|
||||||
|
"ships_from,createdat,updatedat) select id,rce_source_product_id," \
|
||||||
|
"rce_source_product_status,product_page_url,product_page_url_hash,rce_category_id,rce_brand_id," \
|
||||||
|
"rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold," \
|
||||||
|
"product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings," \
|
||||||
|
"ships_from,createdat,updatedat from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where " \
|
||||||
|
"rce_source_product_id="+str(res[1])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def rce_product_variant(self, data):
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where rce_source_variant_id = "+str(data['rce_source_variant_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
data['product_variant_name'] = data['product_variant_name'].replace("'","''")
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" (rce_source_variant_id,rce_product_id," \
|
||||||
|
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock) values("+str(data['rce_source_variant_id'])+"," \
|
||||||
|
""+str(data['rce_product_id'])+",'"+str(data['product_variant_name'])+"','"+str(data['product_variant_price'])+"'," \
|
||||||
|
"'"+str(data['product_variant_price_before_discount'])+"',"+str(data['product_variant_stock'])+")"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
|
||||||
|
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where rce_source_variant_id="+str(data['rce_source_variant_id'])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \
|
||||||
|
str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_source_variant_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" set rce_source_variant_id="+str(data['rce_source_variant_id'])+", " \
|
||||||
|
"rce_product_id="+str(data['rce_product_id'])+", product_variant_name='"+str(data['product_variant_name'])+"', product_variant_price=" \
|
||||||
|
"'"+str(data['product_variant_price'])+"',product_variant_price_before_discount='"+str(data['product_variant_price_before_discount'])+"'," \
|
||||||
|
"product_variant_stock="+str(data['product_variant_stock'])+", updatedat=now() where rce_source_variant_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('variant_tab')+" (id,rce_source_variant_id,rce_product_id," \
|
||||||
|
"product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,createdat,updatedat) select * from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('variant_tab')+" where rce_source_variant_id="+str(res[1])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def rce_brand(self, data):
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id = "+str(data['rce_source_brand_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
data['brand_page_url'] = data['brand_page_url'].replace("'","''")
|
||||||
|
data['brand_name'] = data['brand_name'].replace("'","''")
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" (rce_source_id,rce_source_brand_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name) values("+str(data['rce_source_id'])+","+str(data['rce_source_brand_id'])+"," \
|
||||||
|
""+str(data['rce_source_brand_status'])+",'"+str(data['brand_page_url'])+"','"+str(data['brand_page_url_hash'])+"'," \
|
||||||
|
"'"+str(data['brand_name'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select id,rce_source_id,rce_source_brand_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id="+str(data['rce_source_brand_id'])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_source_brand_id'])==str(res[2]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \
|
||||||
|
str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_source_brand_id = "+ str(res[2])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" set rce_source_id="+str(data['rce_source_id'])+", rce_source_brand_id="+str(data['rce_source_brand_id'])+", " \
|
||||||
|
"rce_source_brand_status="+str(data['rce_source_brand_status'])+", brand_page_url='"+str(data['brand_page_url'])+"', brand_page_url_hash=" \
|
||||||
|
"'"+str(data['brand_page_url_hash'])+"',brand_name='"+str(data['brand_name'])+"', updatedat=now() where rce_source_brand_id = "+ str(res[2])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('brand_tab')+" (id,rce_source_id,rce_source_brand_id,rce_source_brand_status," \
|
||||||
|
"brand_page_url,brand_page_url_hash,brand_name,createdat,updatedat) select * from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id="+str(res[2])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_reseller(self, data):
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id = "+str(data['rce_source_reseller_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
data['reseller_name'] = data['reseller_name'].replace("'","''")
|
||||||
|
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" (rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate) values("+str(data['rce_source_id'])+","+str(data['rce_source_reseller_id'])+"," \
|
||||||
|
""+str(data['rce_source_reseller_status'])+",'"+str(data['reseller_name'])+"','"+str(data['reseller_average_rating'])+"'," \
|
||||||
|
""+str(data['reseller_follower_count'])+",'"+str(data['reseller_response_rate'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat) select id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id="+str(data['rce_source_reseller_id'])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_source_reseller_id'])==str(res[2]) and str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]) and \
|
||||||
|
str(data['reseller_average_rating'])==str(res[5]) and str(data['reseller_follower_count'])==str(res[7]) and str(data['reseller_response_rate'])==str(res[8]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_source_reseller_id = "+ str(res[2])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+",rce_source_reseller_id="+str(data['rce_source_reseller_id'])+", " \
|
||||||
|
"rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \
|
||||||
|
"'"+str(data['reseller_average_rating'])+"',reseller_follower_count='"+str(data['reseller_follower_count'])+"', reseller_response_rate=" \
|
||||||
|
"'"+str(data['reseller_response_rate'])+"', updatedat=now() where rce_source_reseller_id = "+ str(res[2])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat) select id,rce_source_id,rce_source_reseller_id,rce_source_reseller_status," \
|
||||||
|
"reseller_name,reseller_average_rating,reseller_follower_count,reseller_response_rate,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id="+str(res[2])
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_reseller_store(self, data):
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id = "+str(data['rce_source_store_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
data['store_page_url'] = data['store_page_url'].replace("'","''")
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" (rce_source_store_id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id) values("+str(data['rce_source_store_id'])+"," \
|
||||||
|
""+str(data['rce_source_store_status'])+",'"+str(data['store_page_url'])+"','"+str(data['store_page_url_hash'])+"'," \
|
||||||
|
"'"+str(data['store_location'])+"', "+str(data['rce_reseller_id'])+")"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat) select id,rce_source_store_id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id="+str(data['rce_source_store_id'])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_source_store_id'])==str(res[1]) and str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \
|
||||||
|
str(data['store_page_url_hash'])==str(res[4]) and str(data['store_location'])==str(res[5]) and str(data['rce_reseller_id'])==str(res[6]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_source_store_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set rce_source_store_id="+str(data['rce_source_store_id'])+", " \
|
||||||
|
"rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \
|
||||||
|
"'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \
|
||||||
|
"updatedat=now() where rce_source_store_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (id,rce_source_store_id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat) select id,rce_source_store_id,rce_source_store_status," \
|
||||||
|
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id="+str(res[1])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_ratings_reviews(self, data):
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id = "+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
data['username'] = data['username'].replace("'","''")
|
||||||
|
data['img_url'] = data['img_url'].replace("'","''")
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" (id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating) values("+str(data['id'])+","+str(data['rce_product_id'])+"," \
|
||||||
|
"'"+str(data['username'])+"','"+str(data['review'])+"','"+str(data['img_url'])+"',"+str(data['review_like_count'])+",'"+str(data['user_tier'])+"'," \
|
||||||
|
""+str(data['shop_id'])+", '"+str(data['video_url'])+"', '"+str(data['rating'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(data['rce_product_id'])+" and username ='"+str(data['username'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \
|
||||||
|
str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \
|
||||||
|
str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]):
|
||||||
|
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \
|
||||||
|
"username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \
|
||||||
|
"'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \
|
||||||
|
"shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=now() " \
|
||||||
|
"where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" (id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select id,rce_product_id,username," \
|
||||||
|
"review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where rce_product_id="+str(res[1])+" and username ='"+str(data['username'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
def rce_ratings_reviews_productmodels(self,data):
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \
|
||||||
|
"values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
|
||||||
|
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_rating_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \
|
||||||
|
"updatedat=now() where rce_source_store_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
|
||||||
|
"createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+""
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def rce_tags(self,data):
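# Reads the tag table name from the config key 'review_tags_tab'; note the sample conf.json earlier in this commit defines the key as 'review_tags'.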
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \
|
||||||
|
"values("+str(data['id'])+",'"+str(data['description'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
|
||||||
|
"createdat,updatedat) select id,description,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['description'])==str(res[1]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=now() " \
|
||||||
|
"where description = '"+ str(res[1])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \
|
||||||
|
"updatedat=now() where description = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
|
||||||
|
"createdat,updatedat) select id,description,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
def rce_ratings_reviews_producttags(self,data):
|
||||||
|
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchone()
|
||||||
|
|
||||||
|
|
||||||
|
if not res:
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \
|
||||||
|
"values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
|
||||||
|
"createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
else:
|
||||||
|
|
||||||
|
if str(data['rce_rating_id'])==str(res[1]):
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=now() " \
|
||||||
|
"where rce_rating_id = '"+ str(res[1])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \
|
||||||
|
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
else:
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \
|
||||||
|
"updatedat=now() where rce_rating_id = "+ str(res[1])
|
||||||
|
#logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
|
||||||
|
"createdat,updatedat) select id,description,createdat,updatedat from " \
|
||||||
|
""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'"
|
||||||
|
#logging.info(sql)
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,540 @@
|
||||||
|
import hashlib
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from selenium.webdriver.remote.remote_connection import LOGGER
|
||||||
|
LOGGER.setLevel(logging.WARNING)
|
||||||
|
import string
|
||||||
|
from seleniumwire import webdriver
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import psycopg2
|
||||||
|
import bs4
|
||||||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
import random
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import gzip
|
||||||
|
import re
|
||||||
|
from shopee_db_writer import shopee_db_writer
|
||||||
|
import random
|
||||||
|
|
||||||
|
class shopee_products:
|
||||||
|
def __init__(self, config):
|
||||||
|
self.config = config
|
||||||
|
self.crawler_name = self.config.get("crawler_name")
|
||||||
|
self.pattern = r'[' + string.punctuation + ']'
|
||||||
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
|
self.conn.autocommit = True
|
||||||
|
self.cur = self.conn.cursor()
|
||||||
|
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Shopee'")
|
||||||
|
self.rce_source_id = self.cur.fetchone()[0]
|
||||||
|
self.db_writer = shopee_db_writer(config)
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
print("Closing connection.....")
|
||||||
|
self.conn.close()
|
||||||
|
|
||||||
|
def get_raw_product(self, url):
|
||||||
|
op = webdriver.ChromeOptions()
|
||||||
|
height = str(random.randint(640,1280))
width = str(random.randint(1024,1920))
op.add_argument("window-size="+width+","+height)
|
||||||
|
op.add_experimental_option("useAutomationExtension", False)
|
||||||
|
op.add_argument('--no-sandbox')
|
||||||
|
op.add_argument('--disable-notifications')
|
||||||
|
op.add_argument("--lang=en-GB")
|
||||||
|
op.add_argument("--log-level=3")
|
||||||
|
op.headless = True
|
||||||
|
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
|
||||||
|
|
||||||
|
driver.get(url)
|
||||||
|
time.sleep(10)
|
||||||
|
|
||||||
|
iteminfo = ""
|
||||||
|
shopinfo = ""
|
||||||
|
ratinginfo = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
|
||||||
|
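# selenium-wire records every network request the page makes in driver.requests;
# the loop below picks out the Shopee API responses (item, shop, ratings) and
# gunzips the body when the response is content-encoded.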
for request in driver.requests:
|
||||||
|
if request.response:
|
||||||
|
if '/api/v4/item/get?item' in request.url:
|
||||||
|
encoding = request.response.headers.get('content-encoding')
|
||||||
|
if encoding:
|
||||||
|
iteminfo = gzip.decompress(request.response.body).decode()
|
||||||
|
else:
|
||||||
|
iteminfo = request.response.body
|
||||||
|
if '/api/v4/product/get_shop_info?shopid' in request.url:
|
||||||
|
encoding = request.response.headers.get('content-encoding')
|
||||||
|
if encoding:
|
||||||
|
shopinfo = gzip.decompress(request.response.body).decode()
|
||||||
|
else:
|
||||||
|
shopinfo = request.response.body
|
||||||
|
if '/api/v2/item/get_ratings' in request.url:
|
||||||
|
encoding = request.response.headers.get('content-encoding')
if encoding:
|
||||||
|
ratinginfo = gzip.decompress(request.response.body).decode()
|
||||||
|
else:
|
||||||
|
ratinginfo = request.response.body
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
driver.close()
|
||||||
|
|
||||||
|
return iteminfo, shopinfo, ratinginfo
|
||||||
|
|
||||||
|
|
||||||
|
def product_info(self, data_item, item):
|
||||||
|
|
||||||
|
### rce_brand
|
||||||
|
|
||||||
|
data_brand = {}
|
||||||
|
|
||||||
|
data_brand['rce_source_id'] = self.rce_source_id
|
||||||
|
data_brand['rce_source_brand_id'] = ""
|
||||||
|
data_brand['rce_source_brand_status'] = 1
|
||||||
|
data_brand['brand_page_url'] = ""
|
||||||
|
data_brand['brand_page_url_hash'] = ""
|
||||||
|
data_brand['brand_name'] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_brand['rce_source_brand_id'] = data_item['data']['brand_id']
|
||||||
|
data_brand['brand_page_url'] = "https://shopee.co.id/search?brands=" + str(data_item['data']['brand_id'])
|
||||||
|
data_brand['brand_page_url_hash'] = hashlib.md5(data_brand['brand_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
|
||||||
|
try:
|
||||||
|
brand_name = data_item['data']['brand']
|
||||||
|
data_brand['brand_name'] = re.sub(self.pattern, '', brand_name)
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
self.db_writer.rce_brand(data_brand)
|
||||||
|
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
### rce_product
|
||||||
|
|
||||||
|
data_product = {}
|
||||||
|
|
||||||
|
data_product['rce_source_product_id'] = item[3] #itemid
|
||||||
|
data_product['rce_source_product_status'] = 1
|
||||||
|
data_product['product_page_url'] = item[4] #product page url
|
||||||
|
data_product['product_page_url_hash'] = item[5] #product page url hash
|
||||||
|
data_product['rce_category_id'] = ""
|
||||||
|
data_product['rce_brand_id'] = ""
|
||||||
|
data_product['rce_store_id'] = ""
|
||||||
|
data_product['rce_source_product_name'] = ""
|
||||||
|
data_product['product_images'] = ""
|
||||||
|
data_product['product_description'] = ""
|
||||||
|
data_product['product_sold_total'] = ""
|
||||||
|
data_product['product_sold'] = ""
|
||||||
|
data_product['product_price_min'] = ""
|
||||||
|
data_product['product_price_min_before_discount'] =""
|
||||||
|
data_product['product_price_max'] = ""
|
||||||
|
data_product['product_price_max_before_discount'] = ""
|
||||||
|
data_product['ratings'] = ""
|
||||||
|
data_product['ships_from'] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
keyword = item[1]
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" where lower(category_name) = lower('"+keyword+"')"
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_product['rce_category_id'] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('brand_tab')+" where rce_source_brand_id = "+str(data_brand['rce_source_brand_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_product['rce_brand_id'] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where rce_source_store_id = "+str(item[2])+""
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_product['rce_store_id'] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
rce_source_product_name = data_item['data']['name']
|
||||||
|
data_product['rce_source_product_name'] = str(re.sub(self.pattern, '', rce_source_product_name))
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
product_images = str(data_item["data"]["images"])
|
||||||
|
data_product['product_images'] = str(product_images.replace("'",""))
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
product_description = str(data_item["data"]["description"])
|
||||||
|
data_product['product_description'] = str(re.sub(self.pattern, '', product_description))
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_product['product_sold_total'] = str(data_item["data"]["historical_sold"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_product['product_sold'] = str(data_item["data"]["sold"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_product['product_price_min'] = str(data_item["data"]["price_min"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_product['product_price_min_before_discount'] = str(data_item["data"]["price_min_before_discount"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_product['product_price_max'] = str(data_item["data"]["price_max"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_product['product_price_max_before_discount'] = str(data_item["data"]["price_max_before_discount"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_product['ratings'] = str(data_item["data"]["item_rating"]["rating_star"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_product['ships_from'] = str(data_item["data"]["shop_location"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
self.db_writer.rce_product(data_product)
|
||||||
|
|
||||||
|
### rce_product_variant
|
||||||
|
data_variant = {}
|
||||||
|
|
||||||
|
data_variant['rce_source_variant_id'] = ""
|
||||||
|
data_variant['rce_product_id'] = ""
|
||||||
|
data_variant['product_variant_name'] = ""
|
||||||
|
data_variant['product_variant_price'] = ""
|
||||||
|
data_variant['product_variant_price_before_discount'] = ""
|
||||||
|
data_variant['product_variant_stock'] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_id = "+str(data_product['rce_source_product_id'])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_variant['rce_product_id'] = self.cur.fetchone()[0]
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
MODELS = data_item["data"]["models"]
|
||||||
|
|
||||||
|
for i in MODELS:
|
||||||
|
try:
|
||||||
|
data_variant['rce_source_variant_id'] = str(i["modelid"])
|
||||||
|
|
||||||
|
try:
|
||||||
|
product_variant_name = str(i["name"])
|
||||||
|
data_variant['product_variant_name'] = re.sub(self.pattern, '', product_variant_name)
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_variant['product_variant_price'] = str(i["price"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_variant['product_variant_price_before_discount'] = str(i["price_before_discount"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_variant['product_variant_stock'] = str(i["stock"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
self.db_writer.rce_product_variant(data_variant)
|
||||||
|
|
||||||
|
except: pass
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def reseller_info(self, data_shop, item):
|
||||||
|
|
||||||
|
data_reseller = {}
|
||||||
|
|
||||||
|
data_reseller['rce_source_id'] = self.rce_source_id
|
||||||
|
data_reseller['rce_source_reseller_id'] = ""
|
||||||
|
data_reseller['rce_source_reseller_status'] = 1
|
||||||
|
data_reseller['reseller_name'] = ""
|
||||||
|
data_reseller['reseller_average_rating'] = ""
|
||||||
|
data_reseller['reseller_follower_count'] = ""
|
||||||
|
data_reseller['reseller_response_rate'] = ""
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
data_reseller['rce_source_reseller_id'] = str(data_shop["data"]["userid"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
reseller_name = str(data_shop["data"]["name"])
|
||||||
|
data_reseller['reseller_name'] = re.sub(self.pattern, '', reseller_name)
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_reseller['reseller_average_rating'] = str(data_shop["data"]["rating_star"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_reseller['reseller_follower_count'] = str(data_shop["data"]["follower_count"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_reseller['reseller_response_rate'] = str(data_shop["data"]["response_rate"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
self.db_writer.rce_reseller(data_reseller)
|
||||||
|
|
||||||
|
|
||||||
|
data_reseller_store = {}
|
||||||
|
|
||||||
|
data_reseller_store['rce_source_store_id'] = item[2]
|
||||||
|
data_reseller_store['rce_source_store_status'] = 1
|
||||||
|
data_reseller_store['store_page_url'] = ""
|
||||||
|
data_reseller_store['store_page_url_hash'] = ""
|
||||||
|
data_reseller_store['store_location'] = ""
|
||||||
|
data_reseller_store['rce_reseller_id'] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
username = str(data_shop["data"]["account"]["username"])
|
||||||
|
data_reseller_store['store_page_url'] = "https://shopee.co.id/"+username
|
||||||
|
data_reseller_store['store_page_url_hash'] = hashlib.md5(data_reseller_store['store_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try: data_reseller_store['store_location'] = str(data_shop["data"]["shop_location"])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where rce_source_reseller_id = "+str(data_reseller['rce_source_reseller_id']))
|
||||||
|
rce_reseller_id = self.cur.fetchone()
|
||||||
|
data_reseller_store['rce_reseller_id'] = rce_reseller_id[0]
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
self.db_writer.rce_reseller_store(data_reseller_store)
|
||||||
|
|
||||||
|
|
||||||
|
def rating_info(self, data_rating, item):
|
||||||
|
|
||||||
|
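# Pre-compute the next review id (max(id)+1) so the dependent rows written below
# (product models and product tags) can reference the same rce_rating_id.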
sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')
|
||||||
|
self.cur.execute(sql)
|
||||||
|
rating_id = self.cur.fetchone()
|
||||||
|
|
||||||
|
if rating_id[0] is None:
|
||||||
|
rating_id = 1
|
||||||
|
else:
|
||||||
|
rating_id = int(rating_id[0]) + 1
|
||||||
|
|
||||||
|
for data in data_rating['data']['ratings']:
|
||||||
|
|
||||||
|
data_review = {}
|
||||||
|
|
||||||
|
data_review["id"] = rating_id
|
||||||
|
data_review["rce_product_id"] = ""
|
||||||
|
data_review["username"] = ""
|
||||||
|
data_review["review"] = ""
|
||||||
|
data_review["img_url"] = ""
|
||||||
|
data_review["review_like_count"] = ""
|
||||||
|
data_review["user_tier"] = ""
|
||||||
|
data_review["shop_id"] = item[2]
|
||||||
|
data_review["video_url"] = ""
|
||||||
|
data_review["rating"] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" where rce_source_product_id = "+str(item[3])
|
||||||
|
self.cur.execute(sql)
|
||||||
|
data_review["rce_product_id"] = self.cur.fetchone()[0]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_review["username"] = str(data['author_username'])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
review = str(data['comment'])
|
||||||
|
review = review.replace(",", " ")
|
||||||
|
review = review.replace("'", " ")
|
||||||
|
|
||||||
|
comments = list(review.split("\n"))
|
||||||
|
|
||||||
|
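# Review comments often contain auto-generated "attribute: value" lines; the regex
# check below drops those and keeps only the reviewer's own free-text sentences.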
for comment_items in range(len(comments)):
|
||||||
|
temp_comment = re.sub(r'[^a-zA-Z0-9\: ]([a-zA-Z\:]+)?\s{0,2}[^a-zA-Z0-9\: ]?', ' ', comments[comment_items])
|
||||||
|
if not re.match(r'[A-Za-z0-9\s*]*\s*(\:)\s*[A-Za-z0-9\s*]*', temp_comment):
|
||||||
|
data_review["review"] = data_review["review"] + (comments[comment_items])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_review["img_url"] = str(data['images']).replace("'","").replace("[","").replace("]","")
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
if data['like_count']:
|
||||||
|
data_review["review_like_count"] = str(data['like_count'])
|
||||||
|
else:
|
||||||
|
data_review["review_like_count"]=0
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_review["user_tier"] = str(data['loyalty_info']['tier_text'])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
rce_video_url = []
|
||||||
|
for urls in data["videos"]:
|
||||||
|
rce_video_url.append(urls["url"])
|
||||||
|
|
||||||
|
data_review["video_url"] = str(",".join(rce_video_url))
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
try: data_review["rating"] = str(data['rating_star'])
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
self.db_writer.rce_ratings_reviews(data_review)
|
||||||
|
|
||||||
|
sql = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" where id="+str(data_review['id'])
|
||||||
|
|
||||||
|
self.cur.execute(sql)
|
||||||
|
res = self.cur.fetchall()
|
||||||
|
if res:
|
||||||
|
|
||||||
|
data_review_product_model = {}
|
||||||
|
|
||||||
|
data_review_product_model["rce_rating_id"] = rating_id
|
||||||
|
data_review_product_model["model_id"] = ""
|
||||||
|
|
||||||
|
try:
|
||||||
|
product_models = []
|
||||||
|
for models in data["product_items"]:
|
||||||
|
product_models.append(models["modelid"])
|
||||||
|
|
||||||
|
data_review_product_model["model_id"] = str(product_models).replace("[","").replace("]","")
|
||||||
|
|
||||||
|
self.db_writer.rce_ratings_reviews_productmodels(data_review_product_model)
|
||||||
|
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
|
||||||
|
if data['tags']:
|
||||||
|
rce_tags_list = []
|
||||||
|
for tags in data["tags"]:
|
||||||
|
|
||||||
|
sql = "select max(id) from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')
|
||||||
|
self.cur.execute(sql)
|
||||||
|
tag_id = self.cur.fetchone()
|
||||||
|
|
||||||
|
if tag_id[0] is None:
|
||||||
|
tag_id = 1
|
||||||
|
else:
|
||||||
|
tag_id = int(tag_id[0]) + 1
|
||||||
|
|
||||||
|
data_tags = {}
|
||||||
|
|
||||||
|
data_tags['id'] = tag_id
|
||||||
|
data_tags['description'] = tags["tag_description"]
|
||||||
|
|
||||||
|
self.db_writer.rce_tags(data_tags)
|
||||||
|
|
||||||
|
rce_tags_list.append(tags["tag_description"])
|
||||||
|
|
||||||
|
rce_tags_list = str(rce_tags_list).replace('[','').replace(']','')
|
||||||
|
|
||||||
|
|
||||||
|
tags_id_query = "select id from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description in (" + str(rce_tags_list) + ")"
|
||||||
|
self.cur.execute(tags_id_query)
|
||||||
|
tags_id_query = self.cur.fetchall()
|
||||||
|
rce_tag_ids = str(tags_id_query)
|
||||||
|
rce_tag_ids = rce_tag_ids.replace("[", "")
|
||||||
|
rce_tag_ids = rce_tag_ids.replace("]", "")
|
||||||
|
rce_tag_ids = rce_tag_ids.replace("(", "")
|
||||||
|
rce_tag_ids = rce_tag_ids.replace(")", "")
|
||||||
|
rce_tag_ids = rce_tag_ids.replace(",,", ",")
|
||||||
|
rce_tag_ids = rce_tag_ids.rstrip(",")
|
||||||
|
|
||||||
|
data_review_product_tags = {}
|
||||||
|
|
||||||
|
data_review_product_tags['rce_rating_id'] = rating_id
|
||||||
|
data_review_product_tags['tag_ids'] = rce_tag_ids
|
||||||
|
|
||||||
|
self.db_writer.rce_ratings_reviews_producttags(data_review_product_tags)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_shopee_products(self):
|
||||||
|
|
||||||
|
crawler_main = int(self.config.get('crawler_main'))
|
||||||
|
crawler_slave_no = int(self.config.get('crawler_slave_no')) if self.config.get('crawler_slave_no') else None
|
||||||
|
|
||||||
|
sql = None
|
||||||
|
|
||||||
|
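# The category keywords are split between the master and the slave crawlers so that
# each process works on a disjoint subset of the tracker rows.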
if crawler_main:
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where flag=0 and crawler_name='"+self.config.get('crawler_name')+"' " \
|
||||||
|
"and keyword in ('Perawatan & Kecantikan','Perawatan Tubuh','Perawatan Tangan','Perawatan Kaki','Perawatan Kuku','Perawatan Rambut','Perawatan Pria'," \
|
||||||
|
"'Parfum & Wewangian') order by id"
|
||||||
|
else:
|
||||||
|
if crawler_slave_no == 1:
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where flag=0 and crawler_name='"+self.config.get('crawler_name')+"' " \
|
||||||
|
"and keyword in ('Kosmetik Wajah','Kosmetik Mata','Kosmetik Bibir','Pembersih Make Up','Aksesoris Make Up','Alat Perawatan Wajah','Alat Pelangsing Tubuh') order by id"
|
||||||
|
elif crawler_slave_no ==2:
|
||||||
|
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" where flag=0 and crawler_name='"+self.config.get('crawler_name')+"' " \
|
||||||
|
"and keyword in ('Alat Penghilang Bulu Rambut','Alat Rambut','Perawatan Wajah','Treatment Mata','Treatment Bibir','Paket & Set Kecantikan','Kecantikan Lainnya') order by id"
|
||||||
|
|
||||||
|
|
||||||
|
if sql:
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
items = self.cur.fetchall()
|
||||||
|
|
||||||
|
logging.info("Total Item found: {}".format(str(len(items))))
|
||||||
|
|
||||||
|
for item in items:
|
||||||
|
self.crawl_shopee_products(item)
|
||||||
|
time.sleep(random.randint(15,25))
|
||||||
|
else:
|
||||||
|
logging.info("SQL not generated. Please check if Master or Slaves are working correctly.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def crawl_shopee_products(self,item, flag=0):
|
||||||
|
|
||||||
|
logging.info("Collecting info for itemid="+str(item[3])+" and shopid="+str(item[2]))
|
||||||
|
|
||||||
|
iteminfo, shopinfo, ratinginfo = self.get_raw_product(item[4])
|
||||||
|
|
||||||
|
try:
|
||||||
|
|
||||||
|
data_item = json.loads(iteminfo)
|
||||||
|
data_shop = json.loads(shopinfo)
|
||||||
|
data_rating = json.loads(ratinginfo)
|
||||||
|
|
||||||
|
X = None
|
||||||
|
Y = None
|
||||||
|
Z = None
|
||||||
|
try : X = data_item["data"]
|
||||||
|
except: pass
|
||||||
|
try : Y = data_shop["data"]
|
||||||
|
except: pass
|
||||||
|
try : Z = data_rating["data"]
|
||||||
|
except: pass
|
||||||
|
|
||||||
|
if not X or not Y or not Z:
|
||||||
|
if flag == 0:
|
||||||
|
print("Data is NULL. Retrying..... Itemid: {}, Shopid: {}".format(str(item[3]),str(item[2])))
|
||||||
|
self.crawl_shopee_products(item, flag=1)
|
||||||
|
else:
|
||||||
|
print("Data is NULL. Skipping")
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
self.reseller_info(data_shop,item)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info("Reseller info: "+ str(e))
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.product_info(data_item,item)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info("Product info: "+ str(e))
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.rating_info(data_rating,item)
|
||||||
|
except Exception as e:
|
||||||
|
logging.info("Rating info: "+ str(e))
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.info("Data not parsable..... Skipping....")
|
||||||
|
#self.crawl_shopee_products(item, flag=1)
|
||||||
|
|
||||||
|
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('tracker_tab')+" set flag=1 where itemid="+str(item[3])+" and shopid="+str(item[2])+" and crawler_name='"+self.config.get('crawler_name')+"'"
|
||||||
|
logging.info(sql)
|
||||||
|
self.cur.execute(sql)
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,107 @@
|
||||||
|
import hashlib
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
import psycopg2
|
||||||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
import random
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from shopee_db_writer import shopee_db_writer
|
||||||
|
|
||||||
|
###### Logger ######
|
||||||
|
format = "%(asctime)s: %(message)s"
|
||||||
|
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
class shopee_sub_categories:
|
||||||
|
def __init__(self, config):
|
||||||
|
logging.info("Loading Sub Categories of Beauty & Care.........")
|
||||||
|
self.config = config
|
||||||
|
self.url = "https://shopee.co.id/api/v4/pages/get_category_tree"
|
||||||
|
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
|
||||||
|
self.conn.autocommit = True
|
||||||
|
self.cur = self.conn.cursor()
|
||||||
|
self.cur.execute("select id from "+self.config.get('crawler_schema')+"."+self.config.get('source_tab')+" where source_name='Shopee'")
|
||||||
|
try : self.rce_source_id = self.cur.fetchone()[0]
|
||||||
|
except:
|
||||||
|
logging.info("Source tab is empty. Please check. Exiting.....")
|
||||||
|
exit(1)
|
||||||
|
self.db_writer = shopee_db_writer(config)
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
logging.info("Closing connection.....")
|
||||||
|
self.conn.close()
|
||||||
|
|
||||||
|
def get_sub_categories(self):
|
||||||
|
op = webdriver.ChromeOptions()
|
||||||
|
height = str(random.randint(640,1280))
width = str(random.randint(1024,1920))
op.add_argument("window-size="+width+","+height)
|
||||||
|
op.add_experimental_option("useAutomationExtension", False)
|
||||||
|
op.add_argument('--no-sandbox')
|
||||||
|
op.add_argument('--disable-notifications')
|
||||||
|
op.headless = True
|
||||||
|
driver=webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=op)
|
||||||
|
|
||||||
|
driver.get(self.url)
|
||||||
|
|
||||||
|
self.page_source = driver.page_source
|
||||||
|
|
||||||
|
self.parse()
|
||||||
|
|
||||||
|
def parse(self):
|
||||||
|
soup = BeautifulSoup(self.page_source,features="html.parser")
|
||||||
|
|
||||||
|
all_cat = json.loads(soup.body.text)['data']['category_list']
|
||||||
|
|
||||||
|
for cat in all_cat:
|
||||||
|
if cat['catid'] == int(self.config.get('source_category')):
|
||||||
|
self.sub_cats = cat['children']
|
||||||
|
data = {}
|
||||||
|
data['parent_category_id'] = cat['parent_catid']
|
||||||
|
data['rce_source_id'] = self.rce_source_id
|
||||||
|
data['rce_source_category_id'] = cat['catid']
|
||||||
|
data['rce_source_status'] = 1
|
||||||
|
data['category_name'] = cat['display_name']
|
||||||
|
data['category_page_url'] = self.get_url(name=data['category_name'], pcatid=data['rce_source_category_id'])
|
||||||
|
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
self.db_writer.rce_category(data)
|
||||||
|
|
||||||
|
self.process_sub_categories()
|
||||||
|
|
||||||
|
|
||||||
|
def process_sub_categories(self):
|
||||||
|
|
||||||
|
for sub_cat in self.sub_cats:
|
||||||
|
data = {}
|
||||||
|
data['parent_category_id'] = sub_cat['parent_catid']
|
||||||
|
data['rce_source_id'] = self.rce_source_id
|
||||||
|
data['rce_source_category_id'] = sub_cat['catid']
|
||||||
|
data['rce_source_status'] = 1
|
||||||
|
data['category_name'] = sub_cat['display_name']
|
||||||
|
data['category_page_url'] = self.get_url(name=data['category_name'], pcatid=data['parent_category_id'], ccatid=data['rce_source_category_id'])
|
||||||
|
data['category_page_url_hash'] = hashlib.md5(data['category_page_url'].encode('utf-8')).hexdigest()
|
||||||
|
self.db_writer.rce_category(data)
|
||||||
|
|
||||||
|
|
||||||
|
def get_url(self, name, pcatid=None, ccatid=None):
|
||||||
|
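# Turn the display name into a URL slug: drop the "&" and join the remaining words
# with "-", e.g. "Perawatan & Kecantikan" -> "Perawatan-Kecantikan"; the trailing
# "-cat.<id>" part is appended below.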
uri = name.split('& ')
|
||||||
|
uri = ''.join(uri)
|
||||||
|
uri = uri.split(' ')
|
||||||
|
uri = '-'.join(uri)
|
||||||
|
|
||||||
|
url = 'https://shopee.co.id/' + uri
|
||||||
|
|
||||||
|
if not ccatid:
|
||||||
|
url = url + '-cat.' + str(pcatid)
|
||||||
|
else:
|
||||||
|
url = url + '-cat.' + str(pcatid) + '.' + str(ccatid)
|
||||||
|
|
||||||
|
return url
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
conf.json
|
|
@ -0,0 +1,13 @@
|
||||||
|
### Run: ###
* run "python tokopedia_crawler.py"

### Configuration: ###
* Ensure that tables are created already.
* cp conf.json.sample conf.json
* Install the Zyte certificate - https://docs.zyte.com/smart-proxy-manager/next-steps/fetching-https-pages-with-smart-proxy.html#fetching-https-pages-with-smart-proxy

### Notes: ###
* A cronjob can be set up for 'Master' to run every 1 minute (see the sample entry below).
* It is expected to capture all product URLs in ~107 minutes.
* It makes only 2 API calls per minute (3 in the first minute) to prevent IP blocking.
* Any number of slaves can be added.
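For illustration only, a minimal crontab sketch for the master process; the repository path, Python binary and log file are assumptions and should be adapted to the deployment:

```cron
# Hypothetical paths: adjust /opt/raena_crawler_engine_tokopedia and the log location.
# Master run (conf.json: "crawler_main": "1") - one category batch per minute.
* * * * * cd /opt/raena_crawler_engine_tokopedia && python tokopedia_crawler.py >> master.log 2>&1
```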
|
|
@ -0,0 +1,28 @@
|
||||||
|
{
|
||||||
|
"crawler_name": "raena_crawler_engine_tokopedia",
|
||||||
|
"crawler_target": "Tokopedia",
|
||||||
|
"crawler_target_url": "https://www.tokopedia.com/",
|
||||||
|
"crawler_schema": "raena_spider_management",
|
||||||
|
"category_tab": "rce_category",
|
||||||
|
"tracker_tab": "crawler_tracker",
|
||||||
|
"product_tab": "rce_product",
|
||||||
|
"variant_tab": "rce_product_variant",
|
||||||
|
"brand_tab": "rce_brand",
|
||||||
|
"reseller_tab": "rce_reseller",
|
||||||
|
"reseller_store_tab": "rce_reseller_store",
|
||||||
|
"review_tab": "rce_ratings_reviews",
|
||||||
|
"review_productmodels_tab": "rce_ratings_reviews_productmodels",
|
||||||
|
"review_producttags_tab": "rce_ratings_reviews_producttags",
|
||||||
|
"review_tags": "rce_tags",
|
||||||
|
"source_tab": "rce_source",
|
||||||
|
"product_per_category": "120",
|
||||||
|
"source_category": "61",
|
||||||
|
"proxy_url": "http://59e7e01ebdf54a6585c7db8824efa1e8:@proxy.crawlera.com:8011/",
|
||||||
|
"db_user": "",
|
||||||
|
"db_pass": "",
|
||||||
|
"database": "raena_db",
|
||||||
|
"db_host": "localhost",
|
||||||
|
"db_port": "5432",
|
||||||
|
"crawler_main": "1",
|
||||||
|
"crawler_slave_no": ""
|
||||||
|
}
|
|
@ -0,0 +1,31 @@
|
||||||
|
import requests
|
||||||
|
from pathlib import Path
|
||||||
|
from tokopedia_config import Config
|
||||||
|
|
||||||
|
class api():
|
||||||
|
config = Config().get()
|
||||||
|
|
||||||
|
def post(self, url, payload):
|
||||||
|
try:
|
||||||
|
response = requests.post(url, payload)
|
||||||
|
return response.json()
|
||||||
|
except:
|
||||||
|
return []
|
||||||
|
|
||||||
|
def postProxy(self, url, payload, headers):
|
||||||
|
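# Route the request through the Zyte (Crawlera) smart proxy configured in conf.json
# and verify TLS against the zyte-proxy-ca.crt certificate in the working directory.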
path = Path.cwd()
|
||||||
|
proxyUrl = self.config.get('proxy_url')
|
||||||
|
# print(data)
|
||||||
|
try:
|
||||||
|
response = requests.post(url,
|
||||||
|
data=payload,
|
||||||
|
headers=headers,
|
||||||
|
proxies={
|
||||||
|
"http": proxyUrl,
|
||||||
|
"https": proxyUrl,
|
||||||
|
},
|
||||||
|
verify=f'{path}/zyte-proxy-ca.crt'
|
||||||
|
)
|
||||||
|
return response.json()
|
||||||
|
except:
|
||||||
|
return []
|
|
@ -0,0 +1,25 @@
|
||||||
|
import json
|
||||||
|
from tokopedia_logger import logger
|
||||||
|
|
||||||
|
class Config():
|
||||||
|
config = None
|
||||||
|
|
||||||
|
def __new__(cls, *args, **kw):
|
||||||
|
if not hasattr(cls, '_instance'):
|
||||||
|
orig = super(Config, cls)
|
||||||
|
cls._instance = orig.__new__(cls, *args, **kw)
|
||||||
|
return cls._instance
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
if not self.config:
|
||||||
|
try:
|
||||||
|
logger.info("Loading config fine...")
|
||||||
|
with open("conf.json", "r") as jsonfile:
|
||||||
|
self.config = json.load(jsonfile)
|
||||||
|
logger.info("Config file loaded.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Cannot load config file. Please check. Exiting......")
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
def get(self):
|
||||||
|
return self.config
|
|
@ -0,0 +1,43 @@
|
||||||
|
from tokopedia_logger import logger
|
||||||
|
from tokopedia_db_writer import DB
|
||||||
|
from tokopedia_config import Config
|
||||||
|
from tokopedia_sub_categories import TokopediaSubCategories
|
||||||
|
from tokopedia_db_migrations import db_migrations
|
||||||
|
from tokopedia_product_list import ProductList
|
||||||
|
from tokopedia_products import Products
|
||||||
|
|
||||||
|
def checkSource():
|
||||||
|
config = Config().get()
|
||||||
|
table = config.get("crawler_schema") + "." + config.get("source_tab")
|
||||||
|
query = "select id from " + table + " where source_name='Tokopedia'"
|
||||||
|
data = DB().fetchone(query)
|
||||||
|
if not data:
|
||||||
|
logger.error("Please create source in " + table)
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
def runMainCrawler():
|
||||||
|
db_migrations()
|
||||||
|
checkSource()
|
||||||
|
TokopediaSubCategories()
|
||||||
|
ProductList()
|
||||||
|
|
||||||
|
def runSlaveCrawler():
|
||||||
|
config = Config().get()
|
||||||
|
try:
|
||||||
|
int(config.get('crawler_slave_no'))
|
||||||
|
except:
|
||||||
|
logger.error("Please set slave number")
|
||||||
|
exit(1)
|
||||||
|
Products()
|
||||||
|
|
||||||
|
def main():
|
||||||
|
config = Config().get()
|
||||||
|
isMainCrawler = bool(int(config.get('crawler_main')))
|
||||||
|
|
||||||
|
if isMainCrawler:
|
||||||
|
runMainCrawler()
|
||||||
|
else:
|
||||||
|
runSlaveCrawler()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -0,0 +1,109 @@
|
||||||
|
from tokopedia_logger import logger
|
||||||
|
from tokopedia_db_writer import DB
|
||||||
|
from tokopedia_config import Config
|
||||||
|
|
||||||
|
class db_migrations():
|
||||||
|
config = Config().get()
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
logger.info('Running database migrations')
|
||||||
|
self.updateSource()
|
||||||
|
self.updateCategoryColumn()
|
||||||
|
self.alterCrawlerTracker()
|
||||||
|
self.alterProductTab()
|
||||||
|
self.alterResellerStoreTab()
|
||||||
|
logger.info('Database migrations completed')
|
||||||
|
|
||||||
|
def updateSource(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("source_tab")}'
|
||||||
|
target = self.config.get("crawler_target")
|
||||||
|
target_url = self.config.get("crawler_target_url")
|
||||||
|
query = f'''INSERT INTO {table} (source_name, source_main_url)
|
||||||
|
SELECT '{target}', '{target_url}'
|
||||||
|
WHERE
|
||||||
|
NOT EXISTS (
|
||||||
|
SELECT id FROM {table} WHERE source_name = '{target}'
|
||||||
|
);'''
|
||||||
|
try:
|
||||||
|
DB().execute_query(query)
|
||||||
|
except:
|
||||||
|
logger.error(f'Problem while creating source in {table}')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
def updateCategoryColumn(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
|
||||||
|
aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("category_tab")}'
|
||||||
|
query = f'Alter table {table} ADD COLUMN IF NOT EXISTS category_slug character varying UNIQUE'
|
||||||
|
aud_query = f'Alter table {aud_table} ADD COLUMN IF NOT EXISTS category_slug character varying UNIQUE'
|
||||||
|
try:
|
||||||
|
DB().execute_query(query)
|
||||||
|
DB().execute_query(aud_query)
|
||||||
|
except:
|
||||||
|
logger.error(f'Problem while updating column in {table}')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
def alterCrawlerTracker(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}'
|
||||||
|
query = f'''
|
||||||
|
ALTER TABLE {table}
|
||||||
|
ADD CONSTRAINT unique_product_page_url UNIQUE (product_page_url);
|
||||||
|
'''
|
||||||
|
try:
|
||||||
|
DB().execute_query(query)
|
||||||
|
except:
|
||||||
|
# The constraint may already exist; errors are swallowed silently here
|
||||||
|
pass
|
||||||
|
|
||||||
|
def alterProductTab(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("product_tab")}'
|
||||||
|
aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("product_tab")}'
|
||||||
|
query = f'Alter table {table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;'
|
||||||
|
aud_query = f'Alter table {aud_table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;'
|
||||||
|
|
||||||
|
constraint_query = f'''
|
||||||
|
ALTER TABLE {table}
|
||||||
|
ADD CONSTRAINT product_source_id_ukey UNIQUE (rce_source_product_id, rce_source_id);
|
||||||
|
'''
|
||||||
|
|
||||||
|
try:
|
||||||
|
DB().execute_query(query + aud_query)
|
||||||
|
except:
|
||||||
|
logger.error(f'Problem while updating column in {table}')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
try:
|
||||||
|
DB().execute_query(constraint_query)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def alterResellerStoreTab(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("reseller_store_tab")}'
|
||||||
|
aud_table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("reseller_store_tab")}'
|
||||||
|
query = f'Alter table {table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;'
|
||||||
|
aud_query = f'Alter table {aud_table} ADD COLUMN IF NOT EXISTS rce_source_id bigint;'
|
||||||
|
|
||||||
|
constraint_query = f'''
|
||||||
|
ALTER TABLE {table}
|
||||||
|
ADD CONSTRAINT store_source_id_ukey UNIQUE (rce_source_store_id, rce_source_id);
|
||||||
|
'''
|
||||||
|
|
||||||
|
aud_constraint_query = f'''
|
||||||
|
ALTER TABLE {aud_table}
|
||||||
|
ADD CONSTRAINT aud_store_source_id_ukey UNIQUE (rce_source_store_id, rce_source_id);
|
||||||
|
'''
|
||||||
|
|
||||||
|
try:
|
||||||
|
DB().execute_query(query + aud_query)
|
||||||
|
except:
|
||||||
|
logger.error(f'Problem while updating column in {table}')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
try:
|
||||||
|
DB().execute_query(constraint_query)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
DB().execute_query(aud_constraint_query)
|
||||||
|
except:
|
||||||
|
pass
|
|
@ -0,0 +1,60 @@
|
||||||
|
from tokopedia_config import Config
|
||||||
|
from tokopedia_logger import logger
|
||||||
|
import psycopg2
|
||||||
|
|
||||||
|
class DBConnector:
|
||||||
|
def __init__(self):
|
||||||
|
config = Config().get()
|
||||||
|
self.host = config.get('db_host')
|
||||||
|
self.database = config.get('database')
|
||||||
|
self.user = config.get('db_user')
|
||||||
|
self.password = config.get('db_pass')
|
||||||
|
self.port = config.get('db_port')
|
||||||
|
self.dbconn = None
|
||||||
|
|
||||||
|
def create_connection(self):
|
||||||
|
return psycopg2.connect(
|
||||||
|
database=self.database,
|
||||||
|
user=self.user,
|
||||||
|
password=self.password,
|
||||||
|
host=self.host,
|
||||||
|
port=self.port
|
||||||
|
)
|
||||||
|
|
||||||
|
def __enter__(self):
|
||||||
|
self.dbconn = self.create_connection()
|
||||||
|
return self.dbconn
|
||||||
|
|
||||||
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||||
|
self.dbconn.close()
|
||||||
|
|
||||||
|
class DB(object):
|
||||||
|
connection = None
|
||||||
|
|
||||||
|
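# __new__ makes DB a singleton so every caller shares one connection instead of
# opening a new database connection per query.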
def __new__(cls, *args, **kw):
|
||||||
|
if not hasattr(cls, '_instance'):
|
||||||
|
orig = super(DB, cls)
|
||||||
|
cls._instance = orig.__new__(cls, *args, **kw)
|
||||||
|
return cls._instance
|
||||||
|
|
||||||
|
def get_connection(self):
|
||||||
|
if not self.connection:
|
||||||
|
self.connection = DBConnector().create_connection()
|
||||||
|
return self.connection
|
||||||
|
|
||||||
|
def execute_query(self, query):
|
||||||
|
connection = self.get_connection()
|
||||||
|
connection.autocommit = True
|
||||||
|
try:
|
||||||
|
cursor = connection.cursor()
|
||||||
|
except psycopg2.ProgrammingError:
|
||||||
|
connection = self.get_connection()
|
||||||
|
cursor = connection.cursor()
|
||||||
|
cursor.execute(query)
|
||||||
|
return cursor
|
||||||
|
|
||||||
|
def fetchone(self, query):
|
||||||
|
return self.execute_query(query).fetchone()
|
||||||
|
|
||||||
|
def fetchall(self, query):
|
||||||
|
return self.execute_query(query).fetchall()
|
|
@ -0,0 +1,7 @@
|
||||||
|
import logging
|
||||||
|
|
||||||
|
###### Logger ######
|
||||||
|
format = "%(asctime)s: %(message)s"
|
||||||
|
logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S")
|
||||||
|
|
||||||
|
logger = logging.getLogger("tokopedia")
|
|
@ -0,0 +1,108 @@
|
||||||
|
import json
|
||||||
|
from tokopedia_db_writer import DB
|
||||||
|
from tokopedia_logger import logger
|
||||||
|
from tokopedia_config import Config
|
||||||
|
from tokopedia_api import api
|
||||||
|
|
||||||
|
class ProductList():
|
||||||
|
config = Config().get()
|
||||||
|
sourceId = None
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.sourceId = self.getSourceId()
|
||||||
|
self.get()
|
||||||
|
|
||||||
|
def getSourceId(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("source_tab")}'
|
||||||
|
query = f'select id from {table} where source_name=\'Tokopedia\''
|
||||||
|
data = DB().fetchone(query)
|
||||||
|
return data[0]
|
||||||
|
|
||||||
|
# fetch 1 row of category which does not have rce_source_status set
|
||||||
|
# Make api call to fetch 120 products and store in crawler_tracker
|
||||||
|
def getCategoryIdentifier(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
|
||||||
|
query = f"""
|
||||||
|
SELECT category_slug FROM {table}
|
||||||
|
WHERE rce_source_id = {self.sourceId} and rce_source_status is null
|
||||||
|
ORDER BY id ASC
|
||||||
|
Limit 1
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
data = DB().fetchone(query)
|
||||||
|
return data[0] if data else None
|
||||||
|
except:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def getProductList(self, identifier, page):
|
||||||
|
url = 'https://gql.tokopedia.com/graphql/SearchProductQuery'
|
||||||
|
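# The GraphQL search endpoint returns 60 products per page; get() below requests
# pages 1 and 2, which lines up with the product_per_category=120 setting in conf.json.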
params = f"ob=&page={page}&start={1 + (page-1)*60}&identifier={identifier}&sc=2266&user_id=0&rows=60&source=directory&device=desktop&related=true&st=product&safe_search=false"
|
||||||
|
payload = json.dumps([{
|
||||||
|
"operationName": "SearchProductQuery",
|
||||||
|
"variables": {
|
||||||
|
"params": params
|
||||||
|
},
|
||||||
|
"query": "query SearchProductQuery($params: String) {\n CategoryProducts: searchProduct(params: $params) {\n data: products {\n id\n url\n }\n }\n }\n"
|
||||||
|
}])
|
||||||
|
data = api().post(url, payload)
|
||||||
|
return data
|
||||||
|
|
||||||
|
def processData(self, data1, data2):
|
||||||
|
crawler_name = self.config.get("crawler_name")
|
||||||
|
data = None
|
||||||
|
try:
|
||||||
|
rootData1 = data1[0]["data"]["CategoryProducts"]["data"]
|
||||||
|
rootData2 = data2[0]["data"]["CategoryProducts"]["data"]
|
||||||
|
data = rootData1 + rootData2
|
||||||
|
except:
|
||||||
|
data = []
|
||||||
|
|
||||||
|
processedData = list(map(lambda x: (f"'{crawler_name}'", f'\'{x["url"]}\''), data))
|
||||||
|
return processedData
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def convertToString(n, delimiter = ','):
|
||||||
|
return delimiter.join(n)
|
||||||
|
|
||||||
|
def updateTracker(self, rawData):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("tracker_tab")}'
|
||||||
|
data = f"({self.convertToString(map(self.convertToString, rawData), '),(')})"
|
||||||
|
query = f"""
|
||||||
|
INSERT INTO {table}(crawler_name, product_page_url)
|
||||||
|
VALUES {data}
|
||||||
|
ON CONFLICT (product_page_url) DO Nothing;
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
DB().execute_query(query)
|
||||||
|
return True
|
||||||
|
except:
|
||||||
|
logger.info(f'Error while inserting data in {table}')
|
||||||
|
return False
|
||||||
|
|
||||||
|
def updateCategoryTableRow(self, identifier):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
|
||||||
|
query = f"""
|
||||||
|
Update {table}
|
||||||
|
SET rce_source_status = 1
|
||||||
|
WHERE category_slug='{identifier}'
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
data = DB().execute_query(query)
|
||||||
|
except:
|
||||||
|
logger.error(f'Something went wrong while updating {table}')
|
||||||
|
|
||||||
|
def get(self):
|
||||||
|
identifier = self.getCategoryIdentifier()
|
||||||
|
if not identifier:
|
||||||
|
logger.info("All the categories are processed, no task left for master")
|
||||||
|
return
|
||||||
|
|
||||||
|
data1 = self.getProductList(identifier, 1)
|
||||||
|
data2 = self.getProductList(identifier, 2)
|
||||||
|
processedData = self.processData(data1, data2)
|
||||||
|
isDataInserted = self.updateTracker(processedData)
|
||||||
|
|
||||||
|
if isDataInserted:
|
||||||
|
self.updateCategoryTableRow(identifier)
|
||||||
|
|
||||||
|
logger.info(f'All the URLs are fetched for the following category identifier - {identifier}')
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,142 @@
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
from tokopedia_logger import logger
|
||||||
|
from tokopedia_config import Config
|
||||||
|
from tokopedia_api import api
|
||||||
|
from tokopedia_db_writer import DB
|
||||||
|
|
||||||
|
class TokopediaSubCategories:
|
||||||
|
config = Config().get()
|
||||||
|
sourceCategoryId = int(config.get("source_category"))
|
||||||
|
sourceId = None
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.sourceId = self.getSourceId()
|
||||||
|
self.populate()
|
||||||
|
|
||||||
|
def getSourceId(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("source_tab")}'
|
||||||
|
query = f'select id from {table} where source_name=\'Tokopedia\''
|
||||||
|
data = DB().fetchone(query)
|
||||||
|
return data[0]
|
||||||
|
|
||||||
|
def getSourceCategoryUpdatedTime(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
|
||||||
|
where = f'rce_source_category_id={self.sourceCategoryId} and rce_source_id={self.sourceId}'
|
||||||
|
query = f'select updatedat from {table} where {where}'
|
||||||
|
data = DB().fetchone(query)
|
||||||
|
return data[0] if data else None
|
||||||
|
|
||||||
|
def fetchCategories(self):
|
||||||
|
url = 'https://gql.tokopedia.com/graphql/categoryAllList'
|
||||||
|
payload = json.dumps([{
|
||||||
|
"operationName": "categoryAllList",
|
||||||
|
"variables": {
|
||||||
|
"categoryID": self.sourceCategoryId
|
||||||
|
},
|
||||||
|
"query": "query categoryAllList($categoryID: Int, $type: String) {\n CategoryAllList: categoryAllList(categoryID: $categoryID, type: $type) {\n categories {\n identifier\n url\n name\n id\n child {\n id\n identifier\n name\n url\n child {\n name\n identifier\n url\n id\n }\n }\n }\n }\n }\n"
|
||||||
|
}])
|
||||||
|
data = api().post(url, payload)
|
||||||
|
return data
|
||||||
|
|
||||||
|
def processData(self, rawData):
|
||||||
|
sourceId = self.sourceId
|
||||||
|
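# Flatten the three-level category tree (root -> child -> grandchild) into rows of
# (rce_source_id, parent_category_id, rce_source_category_id, url, name, identifier)
# ready to be joined into a single multi-row INSERT statement.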
data = rawData[0]['data']['CategoryAllList']['categories'][0]
|
||||||
|
values = [(str(sourceId), str(0), data['id'], f"'{data['url']}'", f"'{data['name']}'", f"'{data['identifier']}'")]
|
||||||
|
for fc in data['child']:
|
||||||
|
values.insert(len(values), (str(sourceId), data['id'], fc['id'], f"'{fc['url']}'", f"'{fc['name']}'", f"'{fc['identifier']}'"))
|
||||||
|
for sc in fc['child']:
|
||||||
|
values.insert(len(values), (str(sourceId), fc['id'], sc['id'], f"'{sc['url']}'", f"'{sc['name']}'", f"'{sc['identifier']}'"))
|
||||||
|
return values
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def convertToString(n, delimiter = ','):
|
||||||
|
return delimiter.join(n)
|
||||||
|
|
||||||
|
def upsertData(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
|
||||||
|
categories = self.fetchCategories()
|
||||||
|
rawData = self.processData(categories)
|
||||||
|
data = f"({self.convertToString(map(self.convertToString, rawData), '),(')})"
|
||||||
|
query = f'''
|
||||||
|
INSERT INTO {table} (rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug)
|
||||||
|
VALUES {data}
|
||||||
|
ON CONFLICT (category_slug) DO UPDATE SET updatedat = now();
|
||||||
|
'''
|
||||||
|
|
||||||
|
try:
|
||||||
|
DB().execute_query(query)
|
||||||
|
except:
|
||||||
|
logger.error('Issue while inserting categories')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
def deleteTokoCategories(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
|
||||||
|
query = f'Delete from {table} where rce_source_id={self.sourceId};'
|
||||||
|
try:
|
||||||
|
DB().execute_query(query)
|
||||||
|
except:
|
||||||
|
logger.error(f'Tokopedia categories were not deleted from {table}')
|
||||||
|
|
||||||
|
def fetchCategoriesFromDB(self):
|
||||||
|
table = f'{self.config.get("crawler_schema")}.{self.config.get("category_tab")}'
|
||||||
|
query = f'Select id, rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug, createdat, updatedat from {table} where rce_source_id={self.sourceId};'
|
||||||
|
# query = f'Select (id, rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug, updatedat) from {table} where rce_source_id={self.sourceId};'
|
||||||
|
try:
|
||||||
|
return DB().fetchall(query)
|
||||||
|
except:
|
||||||
|
logger.error(f'Issue while fetching data from {table}')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
def processAudData(self, data):
|
||||||
|
processedData = []
|
||||||
|
for x in data:
|
||||||
|
t = list(x)
|
||||||
|
t[0] = str(t[0])
|
||||||
|
t[1] = str(t[1])
|
||||||
|
t[2] = str(t[2])
|
||||||
|
t[3] = str(t[3])
|
||||||
|
t[4] = f"'{t[4]}'"
|
||||||
|
t[5] = f"'{t[5]}'"
|
||||||
|
t[6] = f"'{t[6]}'"
|
||||||
|
t[7] = f'\'{t[7].strftime("%Y-%m-%d %H:%M:%S.%f")}\''
|
||||||
|
t[8] = f'\'{t[8].strftime("%Y-%m-%d %H:%M:%S.%f")}\''
|
||||||
|
processedData.insert(len(processedData), tuple(t))
|
||||||
|
return processedData
|
||||||
|
|
||||||
|
def updateAudTable(self):
|
||||||
|
dbData = self.fetchCategoriesFromDB()
|
||||||
|
rawData = self.processAudData(dbData)
|
||||||
|
table = f'{self.config.get("crawler_schema")}.aud_{self.config.get("category_tab")}'
|
||||||
|
data = f"({self.convertToString(map(self.convertToString, rawData), '),(')})"
|
||||||
|
query = f'''
|
||||||
|
Insert into {table}
|
||||||
|
(id, rce_source_id, parent_category_id, rce_source_category_id, category_page_url, category_name, category_slug, createdat, updatedat)
|
||||||
|
values {data}
|
||||||
|
ON CONFLICT (category_slug) DO UPDATE SET updatedat = now(), id=EXCLUDED.id;
|
||||||
|
'''
|
||||||
|
|
||||||
|
try:
|
||||||
|
return DB().execute_query(query)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f'Issue while updating {table} {str(e)}')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
def populate(self):
|
||||||
|
sourceCategoryUpdatedTime = self.getSourceCategoryUpdatedTime()
|
||||||
|
if sourceCategoryUpdatedTime:
|
||||||
|
diffDays = (datetime.now() - sourceCategoryUpdatedTime).days
|
||||||
|
# Let's keep a frequency of 1 day to fetch/update categories
|
||||||
|
if diffDays < 1:
|
||||||
|
logger.info('Categories were populated recently, so skipping this step')
|
||||||
|
return
|
||||||
|
|
||||||
|
# delete data from main table
|
||||||
|
logger.info('Deleting categories from main table')
|
||||||
|
self.deleteTokoCategories()
|
||||||
|
# insert fresh data
|
||||||
|
logger.info('Inserting categories in main table')
|
||||||
|
self.upsertData()
|
||||||
|
# update audit table, if required
|
||||||
|
logger.info('Inserting/Updating categories in audit table')
|
||||||
|
self.updateAudTable()
|
|
@ -0,0 +1,25 @@
|
||||||
|
-----BEGIN CERTIFICATE-----
|
||||||
|
MIIERzCCAy+gAwIBAgIJAN/VCi6U4Y5SMA0GCSqGSIb3DQEBCwUAMIG5MQswCQYD
|
||||||
|
VQQGEwJJRTEQMA4GA1UECAwHTXVuc3RlcjENMAsGA1UEBwwEQ29yazEUMBIGA1UE
|
||||||
|
CgwLU2NyYXBpbmdIdWIxNTAzBgNVBAsMLExlYWRpbmcgVGVjaG5vbG9neSBhbmQg
|
||||||
|
UHJvZmVzc2lvbmFsIFNlcnZpY2VzMRQwEgYDVQQDDAtDcmF3bGVyYSBDQTEmMCQG
|
||||||
|
CSqGSIb3DQEJARYXc3VwcG9ydEBzY3JhcGluZ2h1Yi5jb20wHhcNMTUwNTE5MTQ1
|
||||||
|
NjA3WhcNMjUwNTE2MTQ1NjA3WjCBuTELMAkGA1UEBhMCSUUxEDAOBgNVBAgMB011
|
||||||
|
bnN0ZXIxDTALBgNVBAcMBENvcmsxFDASBgNVBAoMC1NjcmFwaW5nSHViMTUwMwYD
|
||||||
|
VQQLDCxMZWFkaW5nIFRlY2hub2xvZ3kgYW5kIFByb2Zlc3Npb25hbCBTZXJ2aWNl
|
||||||
|
czEUMBIGA1UEAwwLQ3Jhd2xlcmEgQ0ExJjAkBgkqhkiG9w0BCQEWF3N1cHBvcnRA
|
||||||
|
c2NyYXBpbmdodWIuY29tMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA
|
||||||
|
3I3nDH62M7FHT6HG5ZNS9cBeXmMZaKaxYdr+7ioSiVXzruDkH3uX6CQZLkvR2KpG
|
||||||
|
icHOnd0FM4S4rHYQoWc82b/UGgwjQdi47ED8fqCPusEcgo/7eY3y2Y/JivEWKk6f
|
||||||
|
z+gBlvEHjKj2EyzZ7FaExTEMQTTe28EroXTNySUctY9jprtKrs8jjGXd2sR6AHF1
|
||||||
|
M6O+5CT/5kXhuDO9/Q9Tfym7wxBsU/k+6hhNH+RkYlNEvkv0d8vdku/ZKTCBuL9D
|
||||||
|
NTqgXFvAmOj0MNEjf5kFrF95g+k5+PxPU04TPUtOwU30GYbCjE+ecYsoTODg6+ju
|
||||||
|
TQoNk3RFt0A0wZS3ly1rnQIDAQABo1AwTjAdBgNVHQ4EFgQUn6fXHOpDIsaswTMr
|
||||||
|
K2DwcOHLtZ0wHwYDVR0jBBgwFoAUn6fXHOpDIsaswTMrK2DwcOHLtZ0wDAYDVR0T
|
||||||
|
BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAQEAOLtBuyHixFblY2BieG3ZCs8D74Xc
|
||||||
|
Z1usYCUNuVxOzKhuLt/cv49r39SVienqvS2UTr3kmKdyaaRJnYQ06b5FmAP72vdI
|
||||||
|
4wUAU2F7bFErAVnH1rihB+YMRE/5/6VPLfwuK8yf3rkzdrKcV2DlRQwsnwroSIR8
|
||||||
|
iON6JK2HOI0/LsKxPXUk9cHrli7e99yazS5+jBhRFGx8AVfoJg/6uLe6IKuw5xEZ
|
||||||
|
xAzDdjEIB/tf1cE0SQ+5sdmepO1cIjQYVSL7U+br+y9A1J9N+FYkBKVevM/W25tb
|
||||||
|
iGWBe46djkdm/6eyQ7gtuxhby5lwtRl5sIm9/ID/vWWDMf8O4GPPnW/Xug==
|
||||||
|
-----END CERTIFICATE-----
|