import logging import psycopg2 ###### Looger ###### logging.basicConfig(filename="/home/ubuntu/logs/hasaki_crawler.log", filemode='a', format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s', datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) class hasaki_db_writer: def __init__(self, config): self.config = config self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port')) self.conn.autocommit = True self.cur = self.conn.cursor() def __del__(self): logging.info("Closing connection.....") self.conn.close() def get_id(self, schema, table): sql = f""" select max(id) from {schema}.{table} """ self.cur.execute(sql) res = self.cur.fetchone() if res[0] != None: id = res[0] + 1 else: id = 1 return id def get_aud_id(self, schema, table): sql = f""" select max(auditid) from {schema}.{table} """ self.cur.execute(sql) res = self.cur.fetchone() if res[0] != None: id = res[0] + 1 else: id = 1 return id def rce_category(self, data): sql = f""" select * from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']} """ self.cur.execute(sql) res = self.cur.fetchone() id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('category_tab')) id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('category_tab')) if not res: sql = f""" insert into {self.config.get('crawler_schema')}.{self.config.get('category_tab')}(id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,category_parent_name) values({id_main},{data['parent_category_id']},{data['rce_source_id']},{data['rce_source_category_id']},{data['rce_source_status']},'{data['category_page_url']}','{data['category_page_url_hash']}','{data['category_name']}','{data['category_parent_name']}') """ logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('category_tab')}(auditid,id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name) select {id_aud},id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) else: if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \ str(data['category_page_url'])==str(res[5]) and str(data['category_parent_name'])==str(res[12]): sql = f""" update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set updatedat=GETDATE() where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \ "from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0]) logging.info(sql) self.cur.execute(sql) else: sql = f""" update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set parent_category_id={data['parent_category_id']}, rce_source_category_id = {data['rce_source_category_id']}, category_name = '{data['category_name']}', category_page_url = '{data['category_page_url']}', category_page_url_hash = '{data['category_page_url_hash']}', category_parent_name = '{data['category_parent_name']}', updatedat=GETDATE() where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(auditid,id,parent_category_id,rce_source_id," \ "rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name) " \ "select "+str(id_aud)+", id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \ "category_name,createdat,updatedat,category_parent_name from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \ "where category_name = '"+ str(res[7])+"'" logging.info(sql) self.cur.execute(sql) def rce_product(self, data): sql = f""" select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']} """ self.cur.execute(sql) res = self.cur.fetchone() id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('product_tab')) id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('product_tab')) if not res: sql = f""" insert into {self.config.get('crawler_schema')}.{self.config.get('product_tab')}(id,rce_source_product_id,rce_source_product_status,product_page_url, product_page_url_hash,rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold, product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings,product_section, rce_source_id,countryoforigin,rank,ships_from) values({id_main},{data['rce_source_product_id']},{data['rce_source_product_status']},'{data['product_page_url']}', '{data['product_page_url_hash']}',{data['rce_category_id']},{data['rce_brand_id']},{data['rce_store_id']},'{data['rce_source_product_name']}','{data['product_images']}','{data['product_description']}',{data['product_sold_total']},{data['product_sold']}, {data['product_price_min']},{data['product_price_min_before_discount']},{data['product_price_max']},{data['product_price_max_before_discount']},{data['ratings']},'{data['product_section']}', {data['rce_source_id']},'{data['countryoforigin']}',{data['rank']},'{data['ships_from']}') """ logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash, rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount, product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank) select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash, rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount, product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) else: if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \ str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \ str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \ str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \ str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \ str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) \ and str(data['ships_from'])==str(res[18]) and str(data['rce_source_id'])==str(res[21]) \ and str(data['product_section'])==str(res[22]) and str(data['countryoforigin'])==str(res[23])\ and str(data['rank'])==str(res[24]): sql = f""" update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set updatedat=GETDATE() where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \ "from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0]) logging.info(sql) self.cur.execute(sql) else: sql = f""" update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set rce_source_product_id = {data['rce_source_product_id']}, rce_source_product_status={data['rce_source_product_status']}, product_page_url='{data['product_page_url']}', product_page_url_hash='{data['product_page_url_hash']}', rce_category_id={data['rce_category_id']}, rce_brand_id={data['rce_brand_id']}, rce_store_id={data['rce_store_id']}, rce_source_product_name='{data['rce_source_product_name']}', product_images='{data['product_images']}', product_description='{data['product_description']}', product_sold_total={data['product_sold_total']}, product_sold={data['product_sold']}, product_price_min='{data['product_price_min']}',product_price_min_before_discount='{data['product_price_min_before_discount']}', product_price_max='{data['product_price_max']}', product_price_max_before_discount='{data['product_price_max_before_discount']}', ratings={data['ratings']}, ships_from='{data['ships_from']}',product_section='{data['product_section']}',countryoforigin='{data['countryoforigin']}',rank={data['rank']}, updatedat=GETDATE() where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash, rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount, product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank) select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash, rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount, product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')} where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) def rce_product_variant(self, data): sql = f""" select * from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']} """ self.cur.execute(sql) res = self.cur.fetchone() id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('variant_tab')) id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('variant_tab')) if not res: sql = f""" insert into {self.config.get('crawler_schema')}.{self.config.get('variant_tab')}(id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku) values({id_main},{data['rce_source_variant_id']},{data['rce_product_id']},'{data['product_variant_name']}',{data['product_variant_price']},{data['product_variant_price_before_discount']},{data['product_variant_stock']},'{data['product_variant_sku']}') """ logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat) select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']} """ logging.info(sql) self.cur.execute(sql) else: if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \ str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6])\ and str(data['product_variant_sku'])==str(res[9]): sql = f""" update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set updatedat=GETDATE() where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']} """ logging.info(sql) self.cur.execute(sql) sql = f""" update {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')} a set updatedat=b.updatedat from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} b where a.id=b.id and b.id = {res[0]} """ logging.info(sql) self.cur.execute(sql) else: sql = f""" update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set rce_source_variant_id={data['rce_source_variant_id']}, rce_product_id={data['rce_product_id']},product_variant_name='{data['product_variant_name']}',product_variant_price={data['product_variant_price']}, product_variant_price_before_discount={data['product_variant_price_before_discount']},product_variant_stock={data['product_variant_stock']}, product_variant_sku={data['product_variant_sku']}, updatedat=GETDATE() where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']} """ logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat) select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']} """ logging.info(sql) self.cur.execute(sql) def rce_brand(self, data): sql = f""" select * from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where rce_source_brand_id = {data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']} """ self.cur.execute(sql) res = self.cur.fetchone() id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('brand_tab')) id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('brand_tab')) if not res: sql = f""" insert into {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}(id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating) values({id_main},{data['rce_source_id']},{data['rce_source_brand_id']},{data['rce_source_brand_status']},'{data['brand_page_url']}','{data['brand_page_url_hash']}','{data['brand_name']}',{data['brand_following']},{data['brand_rating']}) """ logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat) select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) else: if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \ str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]) and str(data['rce_source_brand_id'])==str(res[2]): sql = f""" update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) sql = f""" update {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')} a set updatedat=b.updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} b where a.id=b.id and b.id = {res[0]} and b.rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) else: sql = f""" update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set rce_source_id={data['rce_source_id']}, rce_source_brand_id={data['rce_source_brand_id']}, rce_source_brand_status={data['rce_source_brand_status']}, brand_page_url='{data['brand_page_url']}', brand_page_url_hash='{data['brand_page_url_hash']}', brand_name='{data['brand_name']}', brand_following={data['brand_following']}, brand_rating={data['brand_rating']}, updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat) select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) def rce_reseller(self, data): data['reseller_name'] = data['reseller_name'] sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'" self.cur.execute(sql) res = self.cur.fetchone() id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_tab')) id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_tab')) if not res: sql = f""" insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')}(id,rce_source_id,rce_source_reseller_status,reseller_name) values({id_main},'{data['rce_source_id']}','{data['rce_source_reseller_status']}','{data['reseller_name']}') """ #logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_tab')}(auditid,id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat) select {id_aud}, id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')} where reseller_name='{data['reseller_name']}' """ #logging.info(sql) self.cur.execute(sql) else: if str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]): sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=GETDATE() " \ "where reseller_name = '"+ str(res[4])+"'" #logging.info(sql) self.cur.execute(sql) sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \ "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0]) #logging.info(sql) self.cur.execute(sql) else: sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \ "rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \ "'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=GETDATE() where reseller_name = '"+ str(res[4])+"'" #logging.info(sql) self.cur.execute(sql) sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (auditid,id,rce_source_id,rce_source_reseller_status," \ "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select "+str(id_aud)+", id,rce_source_id,rce_source_reseller_status," \ "reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \ ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'" #logging.info(sql) self.cur.execute(sql) def rce_reseller_store(self, data): data['store_page_url'] = data['store_page_url'].replace("'","''") sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'" self.cur.execute(sql) res = self.cur.fetchone() id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_store_tab')) id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_store_tab')) if not res: sql = f""" insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')}(id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,rce_source_id) values({id_main},'{data['rce_source_store_status']}','{data['store_page_url']}','{data['store_page_url_hash']}',{data['rce_reseller_id']},{data['rce_source_id']}) """ #logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_store_tab')}(auditid,id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id) select {id_aud}, id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id from {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')} where store_page_url= '{data['store_page_url']}' """ #logging.info(sql) self.cur.execute(sql) else: if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \ str(data['store_page_url_hash'])==str(res[4]) and \ str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]): sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=GETDATE() " \ "where store_page_url = '"+ str(res[3])+"'" #logging.info(sql) self.cur.execute(sql) sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \ "from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0]) #logging.info(sql) self.cur.execute(sql) else: sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \ "rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \ "'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \ "updatedat=GETDATE(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'" #logging.info(sql) self.cur.execute(sql) sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (auditid,id,rce_source_store_status," \ "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select "+id_aud+", id,rce_source_store_status," \ "store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \ ""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'" #logging.info(sql) self.cur.execute(sql) def rce_ratings_reviews(self, data): sql = f""" select * from {self.config.get('crawler_schema')}.{self.config.get('review_tab')} where rce_product_id = {data['rce_product_id']} and username = '{data['username']}' """ self.cur.execute(sql) res = self.cur.fetchone() data['username'] = data['username'].replace("'","") id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('review_tab')) id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('review_tab')) if not res: sql = f""" insert into {self.config.get('crawler_schema')}.{self.config.get('review_tab')}(id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating) values({id_main},{data['rce_product_id']},'{data['username']}','{data['review']}','{data['img_url']}',{data['review_like_count']},'{data['user_tier']}',{data['shop_id']},'{data['video_url']}',{data['rating']}) """ logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')} where rce_product_id = {data['rce_product_id']} and username = '{data['username']}' """ logging.info(sql) self.cur.execute(sql) else: if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \ str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \ str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]): sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=GETDATE() " \ "where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'" logging.info(sql) self.cur.execute(sql) sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \ "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0]) logging.info(sql) self.cur.execute(sql) else: sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \ "username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \ "'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \ "shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=GETDATE() " \ "where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'" logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat) select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')} where rce_product_id = {data['rce_product_id']} and username = '{data['username']}' """ logging.info(sql) self.cur.execute(sql) def rce_seo(self, data): sql = f""" select * from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']} """ self.cur.execute(sql) res = self.cur.fetchone() id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('seo_tab')) id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('seo_tab')) if not res: sql = f""" insert into {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}(id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots) values({id_main},{data['rce_product_id']},{data['rce_source_id']},'{data['seo_title']}','{data['seo_description']}','{data['seo_url']}','{data['seo_url_hash']}','{data['seo_image']}',{data['seo_price_amount']},'{data['seo_price_currency']}','{data['seo_product_band']}','{data['seo_product_availability']}','{data['seo_product_category']}', '{data['seo_product_condition']}',{data['seo_product_retailer_item_id']},'{data['seo_product_robots']}') """ logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat) select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) else: if (str(data['rce_product_id']) == str(res[1]) and str(data['rce_source_id']) == str(res[2]) and str(data['seo_title']) == str(res[3]) and \ str(data['seo_description']) == str(res[4]) and str(data['seo_url']) == str(res[5]) and str(data['seo_url_hash']) == str(res[6]) and \ str(data['seo_image']) == str(res[7]) and str(data['seo_price_amount']) == str(res[8]) and str(data['seo_price_currency']) == str(res[9]) and \ str(data['seo_product_band']) == str(res[10])) and str(data['seo_product_availability']) == str(res[11]) and str(data['seo_product_category']) == str(res[12]) and \ str(data['seo_product_condition']) == str(res[13]) and str(data['seo_product_retailer_item_id']) == str(res[14]) and str(data['seo_product_robots']) == str(res[15]): sql = "update " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " set updatedat=GETDATE() " \ "where rce_product_id = " + str(res[1]) + " and rce_source_id =" + str(data['rce_source_id']) logging.info(sql) self.cur.execute(sql) sql = "update " + self.config.get('crawler_schema') + ".aud_" + self.config.get('seo_tab') + " a set updatedat=b.updatedat " \ "from " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " b where a.id=b.id and b.id = " + str(res[0]) logging.info(sql) self.cur.execute(sql) else: sql = f""" update {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} set rce_product_id={data['rce_product_id']}, rce_source_id={data['rce_source_id']}, seo_title='{data['seo_title']}', seo_description='{data['seo_description']}', seo_url='{data['seo_url']}', seo_url_hash='{data['seo_url_hash']}', seo_image='{data['seo_image']}', seo_price_amount='{data['seo_price_amount']}', seo_price_currency='{data['seo_price_currency']}', seo_product_band='{data['seo_product_band']}', seo_product_availability='{data['seo_product_availability']}', seo_product_category='{data['seo_product_category']}', seo_product_condition='{data['seo_product_condition']}', seo_product_retailer_item_id={data['seo_product_retailer_item_id']}, seo_product_robots='{data['seo_product_robots']}' where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) sql = f""" insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat) select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']} """ logging.info(sql) self.cur.execute(sql) # def rce_ratings_reviews_productmodels(self,data): # # sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id']) # self.cur.execute(sql) # res = self.cur.fetchone() # # # if not res: # # sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \ # "values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')" # #logging.info(sql) # # self.cur.execute(sql) # # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \ # "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+"" # #logging.info(sql) # # self.cur.execute(sql) # # else: # # if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]): # # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=GETDATE() " \ # "where rce_rating_id = "+ str(res[1]) # #logging.info(sql) # self.cur.execute(sql) # # sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \ # "from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0]) # #logging.info(sql) # self.cur.execute(sql) # else: # # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \ # "updatedat=GETDATE() where rce_source_store_id = "+ str(res[1]) # #logging.info(sql) # self.cur.execute(sql) # # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \ # "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+"" # #logging.info(sql) # # self.cur.execute(sql) # # # def rce_tags(self,data): # # sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'" # self.cur.execute(sql) # res = self.cur.fetchone() # # # if not res: # # sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \ # "values("+str(data['id'])+",'"+str(data['description'])+"')" # #logging.info(sql) # # self.cur.execute(sql) # # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \ # "createdat,updatedat) select id,description,createdat,updatedat from " \ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'" # #logging.info(sql) # # self.cur.execute(sql) # # else: # # if str(data['description'])==str(res[1]): # # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=GETDATE() " \ # "where description = '"+ str(res[1])+"'" # #logging.info(sql) # self.cur.execute(sql) # # sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \ # "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0]) # #logging.info(sql) # self.cur.execute(sql) # else: # # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \ # "updatedat=GETDATE() where description = "+ str(res[1]) # #logging.info(sql) # self.cur.execute(sql) # # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \ # "createdat,updatedat) select id,description,createdat,updatedat from " \ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'" # #logging.info(sql) # # self.cur.execute(sql) # # # def rce_ratings_reviews_producttags(self,data): # # sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'" # self.cur.execute(sql) # res = self.cur.fetchone() # # # if not res: # # sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \ # "values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')" # #logging.info(sql) # # self.cur.execute(sql) # # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \ # "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'" # #logging.info(sql) # # self.cur.execute(sql) # # else: # # if str(data['rce_rating_id'])==str(res[1]): # # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=GETDATE() " \ # "where rce_rating_id = '"+ str(res[1])+"'" # #logging.info(sql) # self.cur.execute(sql) # # sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \ # "from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0]) # #logging.info(sql) # self.cur.execute(sql) # else: # # sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \ # "updatedat=GETDATE() where rce_rating_id = "+ str(res[1]) # #logging.info(sql) # self.cur.execute(sql) # # sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \ # "createdat,updatedat) select id,description,createdat,updatedat from " \ # ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'" # #logging.info(sql) # # self.cur.execute(sql) # #