# Source: raena-crawler-engine/noon_crawler_engine/noon_products.py
# Listing metadata: 427 lines, 16 KiB, Python, 2024-01-24 13:05:07 +00:00
import hashlib
import json
import logging
import random
import sys
import string
import psycopg2
import time
import re
import requests
from noon_db_writer import noon_db_writer
from datetime import datetime
from noon_raw_product import get_product_info_raw
class noon_products:
    """Crawl pending Noon product pages and persist what they contain.

    On construction the object opens a PostgreSQL connection from ``config``,
    resolves the id of the ``Noon`` source row and loads every tracker row
    that is still flagged ``0``.  ``start_processing`` then walks those rows,
    writing reseller, store, brand, product, variant and review records
    through ``noon_db_writer``.

    Table and schema names come from ``config`` and are concatenated into the
    SQL text; every *value* is passed as a query parameter.
    """

    # Character class matching one ASCII punctuation character.  re.escape is
    # required: unescaped, the ``\]`` inside string.punctuation is parsed as
    # an escaped bracket, so backslashes were silently left in the text.
    _PUNCTUATION_PATTERN = '[' + re.escape(string.punctuation) + ']'

    # Errors expected while digging optional fields out of the scraped JSON.
    _EXTRACT_ERRORS = (KeyError, IndexError, TypeError, AttributeError, ValueError)

    def __init__(self, config):
        """Connect to the database and load the pending tracker rows.

        Args:
            config: mapping read with ``.get``; supplies the connection
                settings (``database``, ``db_user``, ``db_pass``, ``db_host``,
                ``db_port``), ``crawler_schema`` and the ``*_tab`` table names.

        Raises:
            TypeError: if no source row named ``Noon`` exists (``fetchone()``
                returns ``None``) or a schema/table name is missing.
            Exception: whatever ``psycopg2.connect`` / ``execute`` raise.
        """
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        self.pattern = self._PUNCTUATION_PATTERN
        self.conn = psycopg2.connect(
            database=self.config.get('database'),
            user=self.config.get('db_user'),
            password=self.config.get('db_pass'),
            host=self.config.get('db_host'),
            port=self.config.get('db_port'),
        )
        # Autocommit keeps one failed statement from poisoning later ones.
        self.conn.autocommit = True
        self.cur = self.conn.cursor()
        self.cur.execute(
            "select id from " + self._table('source_tab') + " where source_name='Noon'"
        )
        self.rce_source_id = self.cur.fetchone()[0]
        # NOTE(review): the crawler name is hard-coded here although
        # ``self.crawler_name`` is read from config - confirm which is intended.
        self.cur.execute(
            "select * from " + self._table('tracker_tab')
            + " where crawler_name='raena_crawler_enginer_noon' and flag=0"
        )
        self.items = self.cur.fetchall()
        self.db_writer = noon_db_writer(config)

    def __del__(self):
        """Close the database connection if one was ever opened."""
        # __init__ may have failed before ``conn`` was assigned.
        conn = getattr(self, "conn", None)
        if conn is None:
            return
        print("Closing connection.....")
        try:
            conn.close()
        except Exception:
            # Never let cleanup raise out of a finalizer.
            logging.exception("Failed to close the database connection")

    def _table(self, table_key):
        """Return ``<crawler_schema>.<table>`` for the config key *table_key*."""
        return self.config.get('crawler_schema') + "." + self.config.get(table_key)

    def _fetch_id(self, table_key, column, value):
        """Return ``id`` of the first row of a table where *column* = *value*.

        Raises:
            TypeError: when no row matches (``fetchone()`` returns ``None``).
            Exception: database errors from ``execute`` propagate.
        """
        sql = "select id from " + self._table(table_key) + " where " + column + " = %s"
        self.cur.execute(sql, (value,))
        return self.cur.fetchone()[0]

    @staticmethod
    def _review_text(review):
        """Return title + comment of one review, with apostrophes removed.

        A missing or empty part counts as an empty string, so one absent key
        neither discards the other part nor reuses text from another review.
        """
        title = review.get("title") or ""
        comment = review.get("comment") or ""
        return (title + comment).replace("'", "")

    @staticmethod
    def slack_notification(message):
        """Post *message* to the crawler's Slack channel.

        Declared static so that both ``self.slack_notification(e)`` and
        ``noon_products.slack_notification(e)`` work.

        Raises:
            ValueError: if Slack answers with a status other than 200.
        """
        # NOTE(review): this webhook URL is a credential committed to source;
        # it should be moved to config/environment and rotated.
        webhook_url = "https://hooks.slack.com/services/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"
        slack_data = {"text": "Issue occurred on Noon Crawler. Error: " + str(message)}
        response = requests.post(
            webhook_url, data=json.dumps(slack_data),
            headers={"Content-Type": "application/json"},
            timeout=10,  # never let a hung webhook stall the crawl
        )
        if response.status_code != 200:
            raise ValueError(
                f"Request to Slack returned an error {response.status_code}, {response.text}"
            )

    def start_processing(self):
        """Process every pending tracker row, pausing 5 seconds between rows.

        A failure on one row is logged and reported to Slack; a failure of the
        Slack call itself is only logged, so the remaining rows still run.
        """
        total = len(self.items)
        for count, item in enumerate(self.items, start=1):
            try:
                logging.info("============== Getting info for {}/{}: {} ================".format(str(count), str(total), str(item)))
                start = datetime.now()
                self.get_product_info(item)
                end = datetime.now()
                logging.info('Total time taken to fetch the product: {}'.format(str(end - start)))
                time.sleep(5)
            except Exception as e:
                logging.exception("Failed to process tracker row %s", item)
                try:
                    self.slack_notification(e)
                except Exception:
                    logging.exception("Slack notification failed")

    def reseller_info(self, data):
        """Store every offer's reseller and store; return the first reseller id.

        Args:
            data: parsed product payload; offers are read from
                ``data["product"]["variants"][0]["offers"]``.

        Returns:
            The id of the first reseller that could be looked up, ``""`` when
            none could, or ``None`` when the payload has no offers.
        """
        try:
            stores = data["product"]["variants"][0]["offers"]
            if not stores:
                return None
            first_reseller_id = None
            for store in stores:
                ##### reseller info
                data_reseller = {
                    'rce_source_id': self.rce_source_id,
                    'rce_source_reseller_status': 1,
                    'reseller_name': "",
                    'reseller_average_rating': 0.0,
                    'reseller_description': "",
                }
                try:
                    data_reseller['reseller_name'] = store["store_name"].replace("'", "")
                except self._EXTRACT_ERRORS:
                    pass
                try:
                    data_reseller['reseller_average_rating'] = float(
                        store["partner_ratings_sellerlab"]["partner_rating"]
                    )
                except self._EXTRACT_ERRORS:
                    pass
                try:
                    self.db_writer.rce_reseller(data_reseller)
                except Exception as e:
                    logging.info(e)

                ##### Store info
                data_reseller_store = {
                    'rce_source_store_status': 1,
                    'store_page_url': "",
                    'store_page_url_hash': "",
                    'store_location': "",
                    'rce_reseller_id': "",
                    'rce_source_id': self.rce_source_id,
                }
                try:
                    store_url = ("https://www.noon.com/uae-en/seller/" + store["store_code"]).replace("'", "")
                    data_reseller_store['store_page_url'] = store_url
                    data_reseller_store['store_page_url_hash'] = hashlib.md5(store_url.encode('utf-8')).hexdigest()
                except self._EXTRACT_ERRORS:
                    pass
                try:
                    reseller_id = self._fetch_id(
                        'reseller_tab', 'reseller_name', str(data_reseller['reseller_name'])
                    )
                    data_reseller_store['rce_reseller_id'] = reseller_id
                    if first_reseller_id is None:
                        first_reseller_id = reseller_id
                except Exception:
                    # Best effort: the store is still written without a reseller id.
                    pass
                try:
                    self.db_writer.rce_reseller_store(data_reseller_store)
                except Exception as e:
                    logging.info(e)
            return "" if first_reseller_id is None else first_reseller_id
        except Exception:
            logging.exception("Could not read reseller info")
            return None

    def brand_info(self, data):
        """Store the product's brand and return its name.

        Returns:
            The brand name with apostrophes removed (``""`` when the payload
            has no usable ``brand``), or ``None`` when ``brand_code`` is
            missing, in which case nothing is written.
        """
        data_brand = {
            'rce_source_id': self.rce_source_id,
            'rce_source_brand_status': 1,
            'brand_page_url': "",
            'brand_page_url_hash': "",
            'brand_name': "",
        }
        try:
            data_brand['brand_page_url'] = "https://www.noon.com/uae-en/" + data["product"]["brand_code"]
            data_brand['brand_page_url_hash'] = hashlib.md5(
                data_brand['brand_page_url'].encode('utf-8')
            ).hexdigest()
        except self._EXTRACT_ERRORS:
            return None
        try:
            data_brand['brand_name'] = data["product"]["brand"].replace("'", "")
        except self._EXTRACT_ERRORS:
            pass
        try:
            self.db_writer.rce_brand(data_brand)
        except Exception as e:
            logging.info(e)
        return data_brand['brand_name']

    def product_info(self, data, category, keyword, url, url_hash, brand_name, rce_reseller_id):
        """Store the product row and one row per variant option.

        Args:
            data: parsed product payload.
            category: category id; converted with ``int``.
            keyword: stored as ``product_section``.
            url: product page URL (apostrophes are doubled before storing).
            url_hash: stored as ``product_page_url_hash``.
            brand_name: used to look up ``rce_brand_id``.
            rce_reseller_id: used to look up ``rce_store_id``.

        Raises:
            ValueError, TypeError: if *category* is not convertible to int.
            KeyError, IndexError, TypeError: if the first offer has neither a
                usable ``sale_price`` nor a ``price``.  This is left to
                propagate, as before, so the caller skips the product and its
                tracker row stays pending.
        """
        data_product = {
            'rce_source_product_id': 0,
            'rce_source_id': self.rce_source_id,
            'rce_source_product_status': 1,
            'product_page_url': url.replace("'", "''"),
            'product_page_url_hash': url_hash,
            'rce_category_id': int(category),
            'rce_brand_id': "",
            'rce_store_id': "",
            'rce_source_product_name': "",
            'product_images': "",
            'product_description': "",
            'product_sold_total': 0,
            'product_sold': 0,
            'product_price_min': "",
            'product_price_min_before_discount': "",
            'product_price_max': "",
            'product_price_max_before_discount': "",
            'ratings': 0.0,
            'product_section': keyword,
        }
        try:
            data_product['rce_brand_id'] = self._fetch_id('brand_tab', 'brand_name', str(brand_name))
        except Exception:
            pass
        try:
            data_product['rce_store_id'] = self._fetch_id(
                'reseller_store_tab', 'rce_reseller_id', rce_reseller_id
            )
        except Exception:
            pass
        try:
            product_title = data["product"]["product_title"]
            data_product['rce_source_product_name'] = str(re.sub(self.pattern, '', product_title)).replace("'", "")
        except self._EXTRACT_ERRORS:
            pass
        try:
            data_product['product_images'] = ','.join(data["product"]["image_keys"])
        except self._EXTRACT_ERRORS:
            pass
        try:
            description = data["product"]["long_description"] + " ".join(data["product"]["feature_bullets"])
            data_product['product_description'] = str(re.sub(self.pattern, '', description)).replace("'", "")
        except self._EXTRACT_ERRORS:
            pass

        try:
            current_price = data["product"]["variants"][0]["offers"][0]["sale_price"]
        except self._EXTRACT_ERRORS:
            current_price = None
        if current_price is None:
            # No sale price (absent or null): use the list price.  Deliberately
            # unguarded - see "Raises" in the docstring.
            current_price = data["product"]["variants"][0]["offers"][0]["price"]
        data_product['product_price_min'] = str(current_price)
        data_product['product_price_max'] = data_product['product_price_min']
        try:
            list_price = str(data["product"]["variants"][0]["offers"][0]["price"])
            data_product['product_price_min_before_discount'] = list_price
            data_product['product_price_max_before_discount'] = list_price
        except self._EXTRACT_ERRORS:
            pass
        try:
            data_product['ratings'] = float(data["product"]["product_rating"]["value"])
        except self._EXTRACT_ERRORS:
            pass
        try:
            self.db_writer.rce_product(data_product)
        except Exception as e:
            logging.info(e)

        ### rce_product_variant
        try:
            variants = data["product"]["groups"][0]["options"]
            if not variants:
                logging.info('No variant found')
                return
            # The product id does not depend on the variant: look it up once.
            try:
                rce_product_id = self._fetch_id(
                    'product_tab', 'rce_source_product_name',
                    str(data_product['rce_source_product_name'])
                )
            except Exception:
                rce_product_id = ""
            for variant in variants:
                data_variant = {
                    'rce_source_variant_id': 0,
                    'rce_product_id': rce_product_id,
                    'product_variant_name': "",
                    'product_variant_price': 0,
                    'product_variant_price_before_discount': 0,
                    'product_variant_stock': 0,
                }
                try:
                    data_variant['product_variant_name'] = str(
                        re.sub(self.pattern, '', variant["name"])
                    ).replace("'", "''")
                except self._EXTRACT_ERRORS:
                    pass
                try:
                    self.db_writer.rce_product_variant(data_variant)
                except Exception as e:
                    logging.info(e)
                time.sleep(random.randint(2, 5))
        except Exception:
            logging.info('No variant found')

    def rating_info(self, data, rce_reseller_id, url_hash):
        """Store every Arabic and English review found in the payload.

        Args:
            data: parsed product payload; reviews are read from
                ``data["product"]["reviews"]["comments"][<lang>]["reviews"]``.
            rce_reseller_id: used to look up the store id saved as ``shop_id``.
            url_hash: used to look up the product id the reviews belong to.

        Never raises; unexpected failures are logged.
        """
        try:
            data_reviews = []
            for lang in ("ar", "en"):
                try:
                    lang_reviews = data["product"]["reviews"]["comments"][lang]["reviews"]
                    if lang_reviews:
                        data_reviews.extend(lang_reviews)
                except self._EXTRACT_ERRORS:
                    pass
            if not data_reviews:
                return

            # Both ids are the same for every review: look them up once.
            try:
                rce_product_id = self._fetch_id('product_tab', 'product_page_url_hash', str(url_hash))
            except Exception:
                rce_product_id = ""
            try:
                shop_id = self._fetch_id('reseller_store_tab', 'rce_reseller_id', rce_reseller_id)
            except Exception:
                shop_id = 0

            for review in data_reviews:
                data_review = {
                    "id": "",
                    "rce_product_id": rce_product_id,
                    "username": "",
                    "review": "",
                    "img_url": "",
                    "review_like_count": 0,
                    "user_tier": "",
                    "shop_id": shop_id,
                    "video_url": "",
                    "rating": "",
                }
                try:
                    # Next id = current maximum + 1; re-read per review because
                    # each write below moves the maximum.
                    self.cur.execute("select max(id) from " + self._table('review_tab'))
                    max_id = self.cur.fetchone()[0]
                    data_review["id"] = 1 if max_id is None else int(max_id) + 1
                except Exception:
                    pass
                for field, key in (
                    ("username", "displayName"),
                    ("review_like_count", "helpfulCount"),
                    ("rating", "rating"),
                ):
                    try:
                        data_review[field] = review[key]
                    except self._EXTRACT_ERRORS:
                        pass
                try:
                    data_review["review"] = self._review_text(review)
                except self._EXTRACT_ERRORS:
                    pass
                try:
                    self.db_writer.rce_ratings_reviews(data_review)
                except Exception as e:
                    logging.info(e)
        except Exception:
            logging.exception("Could not store reviews")

    def get_product_info(self, item):
        """Fetch one tracked product, store everything, then mark the row done.

        Args:
            item: tracker row; ``item[2]`` is passed on as the category,
                ``item[3]`` as the keyword, ``item[4]`` as the page URL and
                ``item[5]`` as the URL hash.

        Any failure is logged and swallowed; the tracker flag is then left
        unchanged so the row is picked up again on a later run.
        """
        try:
            data = get_product_info_raw(item[4])
            ##### Reseller info #####
            rce_reseller_id = self.reseller_info(data)
            ##### Brand info #####
            brand_name = self.brand_info(data)
            ##### Product info #####
            self.product_info(data, item[2], item[3], item[4], item[5], brand_name, rce_reseller_id)
            ##### Rating info #####
            self.rating_info(data, rce_reseller_id, item[5])
            sql = "update " + self._table('tracker_tab') + " set flag = 1 where product_page_url_hash=%s"
            self.cur.execute(sql, (item[5],))
        except Exception:
            logging.exception("Failed to fetch product info for %s", item)