import hashlib
import json
import logging
import random
import re
import string
import sys
import time
from datetime import datetime

import psycopg2
import requests

from noon_db_writer import noon_db_writer
from noon_raw_product import get_product_info_raw


class noon_products:
    """Fetch raw Noon product data for pending tracker rows and persist it.

    On construction the class opens a PostgreSQL connection (autocommit),
    resolves the source id of the 'Noon' source row and loads every tracker
    row with ``flag=0`` for the crawler. ``start_processing`` then walks those
    rows and writes reseller, store, brand, product, variant and review
    records through ``noon_db_writer``.
    """

    def __init__(self, config):
        """Open the database connection and load the pending tracker rows.

        :param config: mapping-like object exposing ``get``; read keys are
            ``crawler_name``, ``database``, ``db_user``, ``db_pass``,
            ``db_host``, ``db_port``, ``crawler_schema`` and the ``*_tab``
            table-name keys.
        """
        self.config = config
        self.crawler_name = self.config.get("crawler_name")
        # Escape the punctuation set: unescaped, the sequence ``\]`` inside the
        # character class is read as an escaped bracket, so backslashes were
        # never stripped.
        self.pattern = r'[' + re.escape(string.punctuation) + r']'

        self.conn = psycopg2.connect(
            database=self.config.get('database'),
            user=self.config.get('db_user'),
            password=self.config.get('db_pass'),
            host=self.config.get('db_host'),
            port=self.config.get('db_port'),
        )
        self.conn.autocommit = True
        self.cur = self.conn.cursor()

        self.cur.execute(
            "select id from " + self._table('source_tab') + " where source_name='Noon'"
        )
        self.rce_source_id = self.cur.fetchone()[0]

        self.cur.execute(
            "select * from " + self._table('tracker_tab')
            + " where crawler_name='raena_crawler_enginer_noon' and flag=0"
        )
        self.items = self.cur.fetchall()

        self.db_writer = noon_db_writer(config)

    def __del__(self):
        """Close the database connection if one was opened."""
        print("Closing connection.....")
        # __init__ may have failed before ``self.conn`` was assigned.
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()

    def _table(self, tab_key):
        """Return ``<crawler_schema>.<table>`` for the config key *tab_key*."""
        return self.config.get('crawler_schema') + "." + self.config.get(tab_key)

    def _query_first_column(self, sql, params=None):
        """Execute *sql* and return the first column of the first row.

        Values in *params* are bound by the driver, never interpolated into
        the SQL text. Raises ``LookupError`` when the query yields no row;
        database errors propagate unchanged.
        """
        self.cur.execute(sql, params)
        row = self.cur.fetchone()
        if row is None:
            raise LookupError("query returned no rows")
        return row[0]

    def _lookup(self, sql, params, default=""):
        """Best-effort variant of ``_query_first_column`` returning *default* on failure."""
        try:
            return self._query_first_column(sql, params)
        except Exception as exc:
            logging.debug("Lookup failed (%s): %s", sql, exc)
            return default

    @staticmethod
    def slack_notification(message):
        """Post *message* to the crawler's Slack webhook.

        Declared static so that both ``self.slack_notification(e)`` and
        ``noon_products.slack_notification(e)`` work; the previous plain
        function raised ``TypeError`` when called on an instance.

        :raises ValueError: if Slack answers with a non-200 status.
        """
        # NOTE(review): this webhook URL is a credential committed to source;
        # it should be rotated and read from configuration instead.
        webhook_url = "https://hooks.slack.com/services/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"
        slack_data = {"text": "Issue occurred on Noon Crawler. Error: " + str(message)}

        response = requests.post(
            webhook_url,
            data=json.dumps(slack_data),
            headers={"Content-Type": "application/json"},
            timeout=10,  # never let a stalled Slack call block the crawler
        )

        if response.status_code != 200:
            raise ValueError(
                f"Request to Slack returned an error {response.status_code}, {response.text}"
            )

    def start_processing(self):
        """Process every pending tracker row, pausing 5 seconds after each one."""
        total = len(self.items)
        for count, item in enumerate(self.items, start=1):
            try:
                logging.info("============== Getting info for {}/{}: {} ================".format(str(count), str(total), str(item)))
                start = datetime.now()
                self.get_product_info(item)
                end = datetime.now()
                logging.info('Total time taken to fetch the product: {}'.format(str(end - start)))
                time.sleep(5)
            except Exception as e:
                print(e)
                # A failed notification must not abort the remaining items.
                try:
                    self.slack_notification(e)
                except Exception as notify_error:
                    logging.error("Slack notification failed: %s", notify_error)

    def reseller_info(self, data):
        """Persist a reseller and a store record for every offer of the first variant.

        :param data: raw product payload; offers are read from
            ``data["product"]["variants"][0]["offers"]``.
        :return: the id of the first reseller that could be looked up, ``""``
            when no lookup succeeded, or ``None`` when there are no offers or
            the payload could not be read.
        """
        try:
            stores = data["product"]["variants"][0]["offers"]
            if not stores:
                return None

            return_item = ""
            first_id_found = False

            for store in stores:
                ##### reseller info
                data_reseller = {
                    'rce_source_id': self.rce_source_id,
                    'rce_source_reseller_status': 1,
                    'reseller_name': "",
                    'reseller_average_rating': 0.0,
                    'reseller_description': "",
                }

                try:
                    data_reseller['reseller_name'] = store["store_name"]
                    data_reseller['reseller_name'] = data_reseller['reseller_name'].replace("'", "")
                except Exception as exc:
                    logging.debug("Reseller name unavailable: %s", exc)

                try:
                    data_reseller['reseller_average_rating'] = float(
                        store["partner_ratings_sellerlab"]["partner_rating"]
                    )
                except Exception as exc:
                    logging.debug("Reseller rating unavailable: %s", exc)

                try:
                    self.db_writer.rce_reseller(data_reseller)
                except Exception as e:
                    logging.info(e)

                ##### Store info
                data_reseller_store = {
                    'rce_source_store_status': 1,
                    'store_page_url': "",
                    'store_page_url_hash': "",
                    'store_location': "",
                    'rce_reseller_id': "",
                    'rce_source_id': self.rce_source_id,
                }

                try:
                    data_reseller_store['store_page_url'] = "https://www.noon.com/uae-en/seller/" + store["store_code"]
                    data_reseller_store['store_page_url'] = data_reseller_store['store_page_url'].replace("'", "")
                    data_reseller_store['store_page_url_hash'] = hashlib.md5(
                        data_reseller_store['store_page_url'].encode('utf-8')
                    ).hexdigest()
                except Exception as exc:
                    logging.debug("Store page URL unavailable: %s", exc)

                try:
                    reseller_id = self._query_first_column(
                        "select id from " + self._table('reseller_tab') + " where reseller_name = %s",
                        (str(data_reseller['reseller_name']),),
                    )
                    data_reseller_store['rce_reseller_id'] = reseller_id
                    # Only the first successfully resolved reseller is returned.
                    if not first_id_found:
                        return_item = reseller_id
                        first_id_found = True
                except Exception as exc:
                    logging.debug("Reseller id lookup failed: %s", exc)

                try:
                    self.db_writer.rce_reseller_store(data_reseller_store)
                except Exception as e:
                    logging.info(e)

            return return_item
        except Exception as e:
            print(e)

    def brand_info(self, data):
        """Persist the product's brand record.

        :param data: raw product payload; reads ``data["product"]["brand_code"]``
            and ``data["product"]["brand"]``.
        :return: the brand name with single quotes removed, or ``None`` when
            the brand code is missing (nothing is written in that case).
        """
        data_brand = {
            'rce_source_id': self.rce_source_id,
            'rce_source_brand_status': 1,
            'brand_page_url': "",
            'brand_page_url_hash': "",
            'brand_name': "",
        }

        try:
            data_brand['brand_page_url'] = "https://www.noon.com/uae-en/" + data["product"]["brand_code"]
            data_brand['brand_page_url_hash'] = hashlib.md5(
                data_brand['brand_page_url'].encode('utf-8')
            ).hexdigest()

            try:
                data_brand['brand_name'] = data["product"]["brand"]
                data_brand['brand_name'] = data_brand['brand_name'].replace("'", "")
            except Exception as exc:
                logging.debug("Brand name unavailable: %s", exc)

            try:
                self.db_writer.rce_brand(data_brand)
            except Exception as e:
                logging.info(e)

            return data_brand['brand_name']
        except Exception as exc:
            logging.debug("Brand info unavailable: %s", exc)

    def product_info(self, data, category, keyword, url, url_hash, brand_name, rce_reseller_id):
        """Persist the product record and one record per variant option.

        :param data: raw product payload.
        :param category: category id; converted with ``int``.
        :param keyword: stored as ``product_section``.
        :param url: product page URL (single quotes are doubled before storing).
        :param url_hash: hash of the product page URL.
        :param brand_name: brand name used to resolve ``rce_brand_id``.
        :param rce_reseller_id: reseller id used to resolve ``rce_store_id``.
        :raises Exception: if the first offer has neither a readable
            ``sale_price`` nor ``price``; nothing is written in that case.
        """
        data_product = {
            'rce_source_product_id': 0,
            'rce_source_id': self.rce_source_id,
            'rce_source_product_status': 1,
            'product_page_url': url.replace("'", "''"),
            'product_page_url_hash': url_hash,
            'rce_category_id': int(category),
            'rce_brand_id': "",
            'rce_store_id': "",
            'rce_source_product_name': "",
            'product_images': "",
            'product_description': "",
            'product_sold_total': 0,
            'product_sold': 0,
            'product_price_min': "",
            'product_price_min_before_discount': "",
            'product_price_max': "",
            'product_price_max_before_discount': "",
            'ratings': 0.0,
            'product_section': keyword,
        }

        data_product['rce_brand_id'] = self._lookup(
            "select id from " + self._table('brand_tab') + " where brand_name = %s",
            (str(brand_name),),
        )

        data_product['rce_store_id'] = self._lookup(
            "select id from " + self._table('reseller_store_tab') + " where rce_reseller_id = %s",
            (rce_reseller_id,),
        )

        try:
            rce_source_product_name = data["product"]["product_title"]
            data_product['rce_source_product_name'] = str(
                re.sub(self.pattern, '', rce_source_product_name)
            ).replace("'", "")
        except Exception as exc:
            logging.debug("Product title unavailable: %s", exc)

        try:
            images = data["product"]["image_keys"]
            data_product['product_images'] = ','.join(images)
        except Exception as exc:
            logging.debug("Product images unavailable: %s", exc)

        try:
            data_product['product_description'] = (
                data["product"]["long_description"] + " ".join(data["product"]["feature_bullets"])
            )
            data_product['product_description'] = str(
                re.sub(self.pattern, '', data_product['product_description'])
            ).replace("'", "")
        except Exception as exc:
            logging.debug("Product description unavailable: %s", exc)

        try:
            data_product['product_price_min'] = str(data["product"]["variants"][0]["offers"][0]["sale_price"])
        except Exception:
            # Fall back to the list price. This lookup is intentionally left
            # unguarded: if it fails too, the error propagates to the caller.
            data_product['product_price_min'] = str(data["product"]["variants"][0]["offers"][0]["price"])
        data_product['product_price_max'] = data_product['product_price_min']

        try:
            list_price = str(data["product"]["variants"][0]["offers"][0]["price"])
            data_product['product_price_min_before_discount'] = list_price
            data_product['product_price_max_before_discount'] = list_price
        except Exception as exc:
            logging.debug("List price unavailable: %s", exc)

        try:
            data_product['ratings'] = float(data["product"]["product_rating"]["value"])
        except Exception as exc:
            logging.debug("Product rating unavailable: %s", exc)

        try:
            self.db_writer.rce_product(data_product)
        except Exception as e:
            logging.info(e)

        ### rce_product_variant
        try:
            variants = data["product"]["groups"][0]["options"]
            if variants:
                for variant in variants:
                    data_variant = {
                        'rce_source_variant_id': 0,
                        'rce_product_id': "",
                        'product_variant_name': "",
                        'product_variant_price': 0,
                        'product_variant_price_before_discount': 0,
                        'product_variant_stock': 0,
                    }

                    data_variant['rce_product_id'] = self._lookup(
                        "select id from " + self._table('product_tab')
                        + " where rce_source_product_name = %s",
                        (str(data_product['rce_source_product_name']),),
                    )

                    try:
                        product_variant_name = variant["name"]
                        data_variant['product_variant_name'] = str(
                            re.sub(self.pattern, '', product_variant_name)
                        ).replace("'", "''")
                    except Exception as exc:
                        logging.debug("Variant name unavailable: %s", exc)

                    try:
                        self.db_writer.rce_product_variant(data_variant)
                    except Exception as e:
                        logging.info(e)

                    time.sleep(random.randint(2, 5))
            else:
                logging.info('No variant found')
        except Exception:
            logging.info('No variant found')

    def rating_info(self, data, rce_reseller_id, url_hash):
        """Persist every Arabic and English review found in the payload.

        :param data: raw product payload; reviews are read from
            ``data["product"]["reviews"]["comments"][<lang>]["reviews"]``.
        :param rce_reseller_id: reseller id used to resolve ``shop_id``.
        :param url_hash: product page URL hash used to resolve ``rce_product_id``.
        """
        try:
            data_reviews = []
            for lang in ("ar", "en"):
                try:
                    lang_reviews = data["product"]["reviews"]["comments"][lang]["reviews"]
                    if lang_reviews:
                        data_reviews.extend(lang_reviews)
                except Exception as exc:
                    logging.debug("No '%s' reviews: %s", lang, exc)

            for review in data_reviews:
                data_review = {
                    "id": "",
                    "rce_product_id": "",
                    "username": "",
                    "review": "",
                    "img_url": "",
                    "review_like_count": 0,
                    "user_tier": "",
                    "shop_id": 0,
                    "video_url": "",
                    "rating": "",
                }

                try:
                    max_id = self._query_first_column(
                        "select max(id) from " + self._table('review_tab')
                    )
                    # max(id) is NULL on an empty table: start numbering at 1.
                    data_review["id"] = 1 if max_id is None else int(max_id) + 1
                except Exception as exc:
                    logging.debug("Review id lookup failed: %s", exc)

                data_review["rce_product_id"] = self._lookup(
                    "select id from " + self._table('product_tab')
                    + " where product_page_url_hash = %s",
                    (str(url_hash),),
                )

                try:
                    data_review["username"] = review["displayName"]
                except Exception as exc:
                    logging.debug("Review username unavailable: %s", exc)

                try:
                    # Reset both parts for every review so a missing field can
                    # never pick up the text of the previous review.
                    title = review.get("title") or ""
                    comment = review.get("comment") or ""
                    data_review["review"] = (title + comment).replace("'", "")
                except Exception as exc:
                    logging.debug("Review text unavailable: %s", exc)

                try:
                    data_review["review_like_count"] = review["helpfulCount"]
                except Exception as exc:
                    logging.debug("Review like count unavailable: %s", exc)

                try:
                    data_review["rating"] = review["rating"]
                except Exception as exc:
                    logging.debug("Review rating unavailable: %s", exc)

                data_review["shop_id"] = self._lookup(
                    "select id from " + self._table('reseller_store_tab')
                    + " where rce_reseller_id = %s",
                    (rce_reseller_id,),
                    default=0,
                )

                try:
                    self.db_writer.rce_ratings_reviews(data_review)
                except Exception as e:
                    logging.info(e)
        except Exception as exc:
            logging.debug("Rating info failed: %s", exc)

    def get_product_info(self, item):
        """Crawl one tracker row and mark it processed (``flag = 1``).

        :param item: tracker row; ``item[2]`` is passed on as the category,
            ``item[3]`` as the keyword, ``item[4]`` as the product URL and
            ``item[5]`` as the product URL hash.

        Any error is printed and the row is left unflagged.
        """
        try:
            data = get_product_info_raw(item[4])

            ##### Reseller info #####
            rce_reseller_id = self.reseller_info(data)

            ##### Brand Info
            brand_name = self.brand_info(data)

            ##### Product info
            self.product_info(data, item[2], item[3], item[4], item[5], brand_name, rce_reseller_id)

            ##### Rating Info #####
            self.rating_info(data, rce_reseller_id, item[5])

            sql = (
                "update " + self._table('tracker_tab')
                + " set flag = 1 where product_page_url_hash=%s"
            )
            self.cur.execute(sql, (item[5],))
        except Exception as e:
            print(e)