427 lines
16 KiB
Python
Executable File
427 lines
16 KiB
Python
Executable File
import hashlib
|
|
import json
|
|
import logging
|
|
import random
|
|
import sys
|
|
import string
|
|
import psycopg2
|
|
import time
|
|
import re
|
|
|
|
import requests
|
|
|
|
from noon_db_writer import noon_db_writer
|
|
from datetime import datetime
|
|
from noon_raw_product import get_product_info_raw
|
|
|
|
class noon_products:
    """Crawl Noon product pages queued in the tracker table and persist them.

    For every pending tracker row the raw product payload is fetched with
    ``get_product_info_raw`` and reseller, store, brand, product, variant and
    review rows are handed to ``noon_db_writer``.

    Configuration keys read from ``config``: ``crawler_name``, ``database``,
    ``db_user``, ``db_pass``, ``db_host``, ``db_port``, ``crawler_schema``,
    ``source_tab``, ``tracker_tab``, ``reseller_tab``, ``reseller_store_tab``,
    ``brand_tab``, ``product_tab`` and ``review_tab``.
    """

    def __init__(self, config):
        """Open the database connection and load the pending tracker rows.

        Raises whatever ``psycopg2.connect`` raises, and ``TypeError`` when no
        source row named 'Noon' exists (``fetchone()`` returns ``None``).
        """
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        # Character class matching any single ASCII punctuation character.
        self.pattern = r'[' + string.punctuation + ']'
        self.conn = psycopg2.connect(
            database=self.config.get('database'),
            user=self.config.get('db_user'),
            password=self.config.get('db_pass'),
            host=self.config.get('db_host'),
            port=self.config.get('db_port'),
        )
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

        self.cur.execute(
            "select id from " + self._table('source_tab') + " where source_name='Noon'"
        )
        self.rce_source_id = self.cur.fetchone()[0]

        self.cur.execute(
            "select * from " + self._table('tracker_tab')
            + " where crawler_name='raena_crawler_enginer_noon' and flag=0"
        )
        self.items = self.cur.fetchall()
        self.db_writer = noon_db_writer(config)

    def __del__(self):
        """Close the database connection if one was opened."""
        # __init__ may have failed before self.conn was assigned.
        conn = getattr(self, "conn", None)
        if conn is None:
            return
        print("Closing connection.....")
        conn.close()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _table(self, config_key):
        """Return the schema-qualified table name stored under ``config_key``."""
        return self.config.get('crawler_schema') + "." + self.config.get(config_key)

    @staticmethod
    def _dig(container, *path, default=None):
        """Follow ``path`` (dict keys / list indexes) into nested JSON data.

        Returns ``default`` as soon as one step is missing or the current value
        cannot be subscripted.
        """
        current = container
        for step in path:
            try:
                current = current[step]
            except (KeyError, IndexError, TypeError):
                return default
        return current

    def _fetch_id(self, table_key, condition, params):
        """Run ``select id from <table> where <condition>`` and return the id.

        ``condition`` must be a literal using ``%s`` placeholders; the values
        travel separately in ``params`` so scraped text is never spliced into
        the SQL. Returns ``None`` when no row matches or the query fails.
        """
        sql = "select id from " + self._table(table_key) + " where " + condition
        try:
            self.cur.execute(sql, params)
            row = self.cur.fetchone()
        except Exception as exc:
            logging.warning("Lookup in %s failed: %s", table_key, exc)
            return None
        return row[0] if row else None

    def _store_id_for(self, rce_reseller_id):
        """Return the store id linked to ``rce_reseller_id`` or ``None``."""
        # reseller_info() yields None or "" when no reseller could be resolved.
        if rce_reseller_id is None or rce_reseller_id == "":
            return None
        return self._fetch_id('reseller_store_tab', "rce_reseller_id = %s", (rce_reseller_id,))

    def _next_review_id(self):
        """Return ``max(id) + 1`` of the review table (1 if empty, "" on error)."""
        try:
            self.cur.execute("select max(id) from " + self._table('review_tab'))
            row = self.cur.fetchone()
            if row is None or row[0] is None:
                return 1
            return int(row[0]) + 1
        except Exception as exc:
            logging.warning("Could not compute next review id: %s", exc)
            return ""

    def _safe_write(self, method_name, payload):
        """Call ``self.db_writer.<method_name>(payload)``; log instead of raising."""
        try:
            getattr(self.db_writer, method_name)(payload)
        except Exception as exc:
            logging.warning("db_writer.%s failed: %s", method_name, exc)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    @staticmethod
    def slack_notification(message):
        """Post ``message`` to the crawler's Slack channel.

        Declared as a static method so that both ``self.slack_notification(e)``
        and ``noon_products.slack_notification(e)`` work.

        Raises ``ValueError`` when Slack answers with a non-200 status; network
        errors raised by ``requests`` propagate unchanged.
        """
        # NOTE(security): the webhook URL is a secret committed to the source;
        # it should be moved to configuration and rotated.
        webhook_url = "https://hooks.slack.com/services/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"
        slack_data = {"text": "Issue occurred on Noon Crawler. Error: " + str(message)}

        response = requests.post(
            webhook_url,
            data=json.dumps(slack_data),
            headers={"Content-Type": "application/json"},
            timeout=10,  # never let a hanging webhook stall the crawler
        )

        if response.status_code != 200:
            raise ValueError(
                f"Request to Slack returned an error {response.status_code}, {response.text}"
            )

    def start_processing(self):
        """Process every pending tracker row, pausing 5 seconds between rows."""
        total = len(self.items)
        for count, item in enumerate(self.items, 1):
            try:
                logging.info("============== Getting info for {}/{}: {} ================".format(str(count), str(total), str(item)))
                start = datetime.now()
                self.get_product_info(item)
                end = datetime.now()
                logging.info('Total time taken to fetch the product: {}'.format(str(end - start)))
                time.sleep(5)
            except Exception as e:
                logging.exception("Processing of tracker row %s failed", count)
                try:
                    self.slack_notification(e)
                except Exception:
                    # A broken notification channel must not stop the crawl.
                    logging.exception("Could not deliver Slack notification")

    def reseller_info(self, data):
        """Persist every reseller/store offering the product.

        Returns the id of the first reseller that could be resolved in the
        reseller table, ``""`` when offers exist but none could be resolved,
        and ``None`` when the payload contains no offers.
        """
        stores = self._dig(data, "product", "variants", 0, "offers")
        if not stores or not isinstance(stores, list):
            return None

        first_reseller_id = None
        for store in stores:
            ##### reseller info
            name = self._dig(store, "store_name")
            name = name.replace("'", "") if isinstance(name, str) else ""
            try:
                rating = float(self._dig(store, "partner_ratings_sellerlab", "partner_rating"))
            except (TypeError, ValueError):
                rating = 0.0

            data_reseller = {
                'rce_source_id': self.rce_source_id,
                'rce_source_reseller_status': 1,
                'reseller_name': name,
                'reseller_average_rating': rating,
                'reseller_description': "",
            }
            self._safe_write("rce_reseller", data_reseller)

            ##### Store info
            store_url = ""
            store_url_hash = ""
            store_code = self._dig(store, "store_code")
            if isinstance(store_code, str):
                store_url = ("https://www.noon.com/uae-en/seller/" + store_code).replace("'", "")
                store_url_hash = hashlib.md5(store_url.encode('utf-8')).hexdigest()

            reseller_id = self._fetch_id('reseller_tab', "reseller_name = %s", (name,))
            if reseller_id is not None and first_reseller_id is None:
                first_reseller_id = reseller_id

            data_reseller_store = {
                'rce_source_store_status': 1,
                'store_page_url': store_url,
                'store_page_url_hash': store_url_hash,
                'store_location': "",
                'rce_reseller_id': "" if reseller_id is None else reseller_id,
                'rce_source_id': self.rce_source_id,
            }
            self._safe_write("rce_reseller_store", data_reseller_store)

        return "" if first_reseller_id is None else first_reseller_id

    def brand_info(self, data):
        """Persist the product's brand and return its name.

        Returns ``None`` (and writes nothing) when the payload has no usable
        ``brand_code``; returns ``""`` when only the brand name is missing.
        """
        brand_code = self._dig(data, "product", "brand_code")
        if not isinstance(brand_code, str):
            return None

        brand_name = self._dig(data, "product", "brand")
        brand_name = brand_name.replace("'", "") if isinstance(brand_name, str) else ""
        brand_page_url = "https://www.noon.com/uae-en/" + brand_code

        data_brand = {
            'rce_source_id': self.rce_source_id,
            'rce_source_brand_status': 1,
            'brand_page_url': brand_page_url,
            'brand_page_url_hash': hashlib.md5(brand_page_url.encode('utf-8')).hexdigest(),
            'brand_name': brand_name,
        }
        self._safe_write("rce_brand", data_brand)
        return brand_name

    def product_info(self, data, category, keyword, url, url_hash, brand_name, rce_reseller_id):
        """Persist the product row and one row per variant option.

        ``category`` must be convertible with ``int``. A payload whose first
        offer has neither ``sale_price`` nor ``price`` raises ``KeyError`` /
        ``IndexError`` / ``TypeError`` to the caller, as before.
        """
        product = self._dig(data, "product")

        data_product = {
            'rce_source_product_id': 0,
            'rce_source_id': self.rce_source_id,
            'rce_source_product_status': 1,
            # Quotes are doubled here because the value is handed to db_writer.
            'product_page_url': url.replace("'", "''"),
            'product_page_url_hash': url_hash,
            'rce_category_id': int(category),
            'rce_brand_id': "",
            'rce_store_id': "",
            'rce_source_product_name': "",
            'product_images': "",
            'product_description': "",
            'product_sold_total': 0,
            'product_sold': 0,
            'product_price_min': "",
            'product_price_min_before_discount': "",
            'product_price_max': "",
            'product_price_max_before_discount': "",
            'ratings': 0.0,
            'product_section': keyword,
        }

        if brand_name is not None:
            brand_id = self._fetch_id('brand_tab', "brand_name = %s", (str(brand_name),))
            if brand_id is not None:
                data_product['rce_brand_id'] = brand_id

        store_id = self._store_id_for(rce_reseller_id)
        if store_id is not None:
            data_product['rce_store_id'] = store_id

        title = self._dig(product, "product_title")
        if isinstance(title, str):
            data_product['rce_source_product_name'] = re.sub(self.pattern, '', title).replace("'", "")

        try:
            data_product['product_images'] = ','.join(product["image_keys"])
        except (KeyError, TypeError):
            pass

        try:
            description = product["long_description"] + " ".join(product["feature_bullets"])
            data_product['product_description'] = re.sub(self.pattern, '', description).replace("'", "")
        except (KeyError, TypeError):
            pass

        first_offer = ("variants", 0, "offers", 0)
        list_price = self._dig(product, *first_offer, "price")
        current_price = self._dig(product, *first_offer, "sale_price")
        if current_price is None:
            # No (or null) sale price: fall back to the list price. The lookup
            # is left unguarded so a product without any price still raises to
            # the caller exactly as it did before.
            current_price = data["product"]["variants"][0]["offers"][0]["price"]
        current_price = "" if current_price is None else str(current_price)
        data_product['product_price_min'] = current_price
        data_product['product_price_max'] = current_price

        if list_price is not None:
            data_product['product_price_min_before_discount'] = str(list_price)
            data_product['product_price_max_before_discount'] = str(list_price)

        try:
            data_product['ratings'] = float(self._dig(product, "product_rating", "value"))
        except (TypeError, ValueError):
            pass

        self._safe_write("rce_product", data_product)

        ### rce_product_variant
        variants = self._dig(product, "groups", 0, "options")
        if not variants or not isinstance(variants, list):
            logging.info('No variant found')
            return

        # The owning product id is the same for every variant: look it up once.
        product_id = self._fetch_id(
            'product_tab',
            "rce_source_product_name = %s",
            (str(data_product['rce_source_product_name']),),
        )

        for variant in variants:
            variant_name = self._dig(variant, "name")
            if isinstance(variant_name, str):
                variant_name = re.sub(self.pattern, '', variant_name).replace("'", "''")
            else:
                variant_name = ""

            data_variant = {
                'rce_source_variant_id': 0,
                'rce_product_id': "" if product_id is None else product_id,
                'product_variant_name': variant_name,
                'product_variant_price': 0,
                'product_variant_price_before_discount': 0,
                'product_variant_stock': 0,
            }
            self._safe_write("rce_product_variant", data_variant)
            time.sleep(random.randint(2, 5))

    def rating_info(self, data, rce_reseller_id, url_hash):
        """Persist every Arabic and English review contained in ``data``."""
        comments = self._dig(data, "product", "reviews", "comments")
        reviews = []
        for language in ("ar", "en"):
            batch = self._dig(comments, language, "reviews")
            if isinstance(batch, list):
                reviews.extend(batch)
        if not reviews:
            return

        # Identical for every review of this product: resolve once.
        product_id = self._fetch_id('product_tab', "product_page_url_hash = %s", (str(url_hash),))
        shop_id = self._store_id_for(rce_reseller_id)

        for review in reviews:
            # Build the text from this review only; a missing part counts as
            # empty instead of leaking the previous review's title/comment.
            title = self._dig(review, "title")
            comment = self._dig(review, "comment")
            text = (title if isinstance(title, str) else "") + (comment if isinstance(comment, str) else "")

            data_review = {
                "id": self._next_review_id(),
                "rce_product_id": "" if product_id is None else product_id,
                "username": self._dig(review, "displayName", default=""),
                "review": text.replace("'", ""),
                "img_url": "",
                "review_like_count": self._dig(review, "helpfulCount", default=0),
                "user_tier": "",
                "shop_id": 0 if shop_id is None else shop_id,
                "video_url": "",
                "rating": self._dig(review, "rating", default=""),
            }
            self._safe_write("rce_ratings_reviews", data_review)

    def get_product_info(self, item):
        """Fetch, persist and flag one tracker row.

        ``item`` is a tracker row whose columns 2..5 are used as category,
        keyword, product URL and product URL hash. Any failure is logged and
        swallowed; the row then keeps ``flag = 0``.
        """
        try:
            data = get_product_info_raw(item[4])

            ##### Reseller info #####
            rce_reseller_id = self.reseller_info(data)

            ##### Product Info #####
            brand_name = self.brand_info(data)
            self.product_info(data, item[2], item[3], item[4], item[5], brand_name, rce_reseller_id)

            ##### Rating Info #####
            self.rating_info(data, rce_reseller_id, item[5])

            self.cur.execute(
                "update " + self._table('tracker_tab')
                + " set flag = 1 where product_page_url_hash = %s",
                (item[5],),
            )
        except Exception:
            logging.exception("Failed to process product %s", item)
|
|
|