raena-crawler-engine/hasaki_crawler_engine/hasaki_db_writer.py

758 lines
48 KiB
Python
Raw Permalink Normal View History

2024-03-14 05:16:59 +00:00
import logging
import psycopg2
2024-04-01 07:59:12 +00:00
###### Looger ######
logging.basicConfig(filename="/home/ubuntu/logs/hasaki_crawler.log",
filemode='a',
format='%(asctime)s,%(msecs)d %(name)s %(levelname)s: %(message)s',
datefmt="%Y-%m-%d %H:%M:%S",
level=logging.INFO)
2024-03-14 05:16:59 +00:00
class hasaki_db_writer:
def __init__(self, config):
self.config = config
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'), password=self.config.get('db_pass'), host=self.config.get('db_host'), port=self.config.get('db_port'))
self.conn.autocommit = True
self.cur = self.conn.cursor()
def __del__(self):
logging.info("Closing connection.....")
self.conn.close()
def get_id(self, schema, table):
sql = f"""
select max(id) from {schema}.{table}
"""
self.cur.execute(sql)
res = self.cur.fetchone()
if res[0] != None:
id = res[0] + 1
else:
id = 1
return id
def get_aud_id(self, schema, table):
sql = f"""
select max(auditid) from {schema}.{table}
"""
self.cur.execute(sql)
res = self.cur.fetchone()
if res[0] != None:
id = res[0] + 1
else:
id = 1
return id
def rce_category(self, data):
sql = f"""
select * from {self.config.get('crawler_schema')}.{self.config.get('category_tab')} where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
"""
self.cur.execute(sql)
res = self.cur.fetchone()
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('category_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('category_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('category_tab')}(id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,category_parent_name)
values({id_main},{data['parent_category_id']},{data['rce_source_id']},{data['rce_source_category_id']},{data['rce_source_status']},'{data['category_page_url']}','{data['category_page_url_hash']}','{data['category_name']}','{data['category_parent_name']}')
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('category_tab')}(auditid,id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name)
select {id_aud},id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name from {self.config.get('crawler_schema')}.{self.config.get('category_tab')}
where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
else:
if str(data['parent_category_id'])==str(res[1]) and str(data['rce_source_category_id'])==str(res[3]) and str(data['category_name']) == str(res[7]) and \
str(data['category_page_url'])==str(res[5]) and str(data['category_parent_name'])==str(res[12]):
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set updatedat=GETDATE()
where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" b where a.id=b.id and b.id = "+str(res[0])
logging.info(sql)
self.cur.execute(sql)
else:
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('category_tab')} set parent_category_id={data['parent_category_id']}, rce_source_category_id = {data['rce_source_category_id']},
category_name = '{data['category_name']}', category_page_url = '{data['category_page_url']}', category_page_url_hash = '{data['category_page_url_hash']}', category_parent_name = '{data['category_parent_name']}',
updatedat=GETDATE() where category_name = '{data['category_name']}' and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('category_tab')+"(auditid,id,parent_category_id,rce_source_id," \
"rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash,category_name,createdat,updatedat,category_parent_name) " \
"select "+str(id_aud)+", id,parent_category_id,rce_source_id,rce_source_category_id,rce_source_status,category_page_url,category_page_url_hash," \
"category_name,createdat,updatedat,category_parent_name from "+self.config.get('crawler_schema')+"."+self.config.get('category_tab')+" " \
"where category_name = '"+ str(res[7])+"'"
logging.info(sql)
self.cur.execute(sql)
def rce_product(self, data):
sql = f"""
select * from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
"""
self.cur.execute(sql)
res = self.cur.fetchone()
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('product_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('product_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('product_tab')}(id,rce_source_product_id,rce_source_product_status,product_page_url,
product_page_url_hash,rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,
product_price_min,product_price_min_before_discount,product_price_max,product_price_max_before_discount,ratings,product_section,
rce_source_id,countryoforigin,rank,ships_from) values({id_main},{data['rce_source_product_id']},{data['rce_source_product_status']},'{data['product_page_url']}',
'{data['product_page_url_hash']}',{data['rce_category_id']},{data['rce_brand_id']},{data['rce_store_id']},'{data['rce_source_product_name']}','{data['product_images']}','{data['product_description']}',{data['product_sold_total']},{data['product_sold']},
{data['product_price_min']},{data['product_price_min_before_discount']},{data['product_price_max']},{data['product_price_max_before_discount']},{data['ratings']},'{data['product_section']}',
{data['rce_source_id']},'{data['countryoforigin']}',{data['rank']},'{data['ships_from']}')
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank)
select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_product_id'])==str(res[1]) and str(data['rce_source_product_status'])==str(res[2]) and \
str(data['product_page_url'])==str(res[3]) and str(data['product_page_url_hash'])==str(res[4]) and str(data['rce_category_id'])==str(res[5]) and \
str(data['rce_brand_id'])==str(res[6]) and str(data['rce_store_id'])==str(res[7]) and str(data['rce_source_product_name'])==str(res[8]) and \
str(data['product_images'])==str(res[9]) and str(data['product_sold_total'])==str(res[11]) and \
str(data['product_sold'])==str(res[12]) and str(data['product_price_min'])==str(res[13]) and str(data['product_price_min_before_discount'])==str(res[14]) and \
str(data['product_price_max'])==str(res[15]) and str(data['product_price_max_before_discount'])==str(res[16]) and str(data['ratings'])==str(res[17]) \
and str(data['ships_from'])==str(res[18]) and str(data['rce_source_id'])==str(res[21]) \
and str(data['product_section'])==str(res[22]) and str(data['countryoforigin'])==str(res[23])\
and str(data['rank'])==str(res[24]):
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set updatedat=GETDATE()
where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('product_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('product_tab')+" b where a.id=b.id and b.id = "+str(res[0])
logging.info(sql)
self.cur.execute(sql)
else:
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('product_tab')} set rce_source_product_id = {data['rce_source_product_id']}, rce_source_product_status={data['rce_source_product_status']}, product_page_url='{data['product_page_url']}',
product_page_url_hash='{data['product_page_url_hash']}', rce_category_id={data['rce_category_id']}, rce_brand_id={data['rce_brand_id']}, rce_store_id={data['rce_store_id']},
rce_source_product_name='{data['rce_source_product_name']}', product_images='{data['product_images']}', product_description='{data['product_description']}', product_sold_total={data['product_sold_total']},
product_sold={data['product_sold']}, product_price_min='{data['product_price_min']}',product_price_min_before_discount='{data['product_price_min_before_discount']}',
product_price_max='{data['product_price_max']}', product_price_max_before_discount='{data['product_price_max_before_discount']}', ratings={data['ratings']},
ships_from='{data['ships_from']}',product_section='{data['product_section']}',countryoforigin='{data['countryoforigin']}',rank={data['rank']}, updatedat=GETDATE()
where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('product_tab')}(auditid,id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank)
select {id_aud},id,rce_source_product_id,rce_source_product_status,product_page_url,product_page_url_hash,
rce_category_id,rce_brand_id,rce_store_id,rce_source_product_name,product_images,product_description,product_sold_total,product_sold,product_price_min,product_price_min_before_discount,
product_price_max,product_price_max_before_discount,ratings,ships_from,product_section,createdat,updatedat,rce_source_id,countryoforigin,rank from {self.config.get('crawler_schema')}.{self.config.get('product_tab')}
where rce_source_product_id = {data['rce_source_product_id']} and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
def rce_product_variant(self, data):
sql = f"""
select * from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where
rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
"""
self.cur.execute(sql)
res = self.cur.fetchone()
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('variant_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('variant_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('variant_tab')}(id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku)
values({id_main},{data['rce_source_variant_id']},{data['rce_product_id']},'{data['product_variant_name']}',{data['product_variant_price']},{data['product_variant_price_before_discount']},{data['product_variant_stock']},'{data['product_variant_sku']}')
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat)
select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat
from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
"""
logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_variant_id'])==str(res[1]) and str(data['rce_product_id'])==str(res[2]) and str(data['product_variant_name'])==str(res[3]) and \
str(data['product_variant_price'])==str(res[4]) and str(data['product_variant_price_before_discount'])==str(res[5]) and str(data['product_variant_stock'])==str(res[6])\
and str(data['product_variant_sku'])==str(res[9]):
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set updatedat=GETDATE()
where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
update {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')} a set updatedat=b.updatedat
from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} b where a.id=b.id and b.id = {res[0]}
"""
logging.info(sql)
self.cur.execute(sql)
else:
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} set rce_source_variant_id={data['rce_source_variant_id']},
rce_product_id={data['rce_product_id']},product_variant_name='{data['product_variant_name']}',product_variant_price={data['product_variant_price']},
product_variant_price_before_discount={data['product_variant_price_before_discount']},product_variant_stock={data['product_variant_stock']},
product_variant_sku={data['product_variant_sku']}, updatedat=GETDATE()
where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('variant_tab')}(auditid,id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat)
select {id_aud},id,rce_source_variant_id,rce_product_id,product_variant_name,product_variant_price,product_variant_price_before_discount,product_variant_stock,product_variant_sku,createdat,updatedat
from {self.config.get('crawler_schema')}.{self.config.get('variant_tab')} where rce_source_variant_id = {data['rce_source_variant_id']} and rce_product_id = {data['rce_product_id']}
"""
logging.info(sql)
self.cur.execute(sql)
def rce_brand(self, data):
sql = f"""
select * from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} where rce_source_brand_id = {data['rce_source_brand_id']}
and rce_source_id = {data['rce_source_id']}
"""
self.cur.execute(sql)
res = self.cur.fetchone()
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('brand_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('brand_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}(id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating)
values({id_main},{data['rce_source_id']},{data['rce_source_brand_id']},{data['rce_source_brand_status']},'{data['brand_page_url']}','{data['brand_page_url_hash']}','{data['brand_name']}',{data['brand_following']},{data['brand_rating']})
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat)
select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}
where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_id'])==str(res[1]) and str(data['rce_source_brand_status'])==str(res[3]) and str(data['brand_page_url'])==str(res[4]) and \
str(data['brand_page_url_hash'])==str(res[5]) and str(data['brand_name'])==str(res[6]) and str(data['rce_source_brand_id'])==str(res[2]):
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
update {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')} a set updatedat=b.updatedat
from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} b where a.id=b.id and b.id = {res[0]} and
b.rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
else:
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('brand_tab')} set rce_source_id={data['rce_source_id']}, rce_source_brand_id={data['rce_source_brand_id']},
rce_source_brand_status={data['rce_source_brand_status']}, brand_page_url='{data['brand_page_url']}', brand_page_url_hash='{data['brand_page_url_hash']}',
brand_name='{data['brand_name']}', brand_following={data['brand_following']}, brand_rating={data['brand_rating']}, updatedat=GETDATE() where rce_source_brand_id={data['rce_source_brand_id']}
and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('brand_tab')}(auditid,id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat)
select {id_aud}, id,rce_source_id,rce_source_brand_id,rce_source_brand_status,brand_page_url,brand_page_url_hash,brand_name,brand_following,brand_rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('brand_tab')}
where rce_source_brand_id={data['rce_source_brand_id']} and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
def rce_reseller(self, data):
data['reseller_name'] = data['reseller_name']
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name = '"+str(data['reseller_name'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')}(id,rce_source_id,rce_source_reseller_status,reseller_name)
values({id_main},'{data['rce_source_id']}','{data['rce_source_reseller_status']}','{data['reseller_name']}')
"""
#logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_tab')}(auditid,id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat)
select {id_aud}, id,rce_source_id,rce_source_reseller_status,reseller_name,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('reseller_tab')}
where reseller_name='{data['reseller_name']}'
"""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_reseller_status'])==str(res[3]) and str(data['reseller_name'])==str(res[4]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set updatedat=GETDATE() " \
"where reseller_name = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" set rce_source_id="+str(data['rce_source_id'])+", " \
"rce_source_reseller_status="+str(data['rce_source_reseller_status'])+", reseller_name='"+str(data['reseller_name'])+"', reseller_average_rating=" \
"'"+str(data['reseller_average_rating'])+"',reseller_description='"+str(data['reseller_description'])+"', updatedat=GETDATE() where reseller_name = '"+ str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_tab')+" (auditid,id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat) select "+str(id_aud)+", id,rce_source_id,rce_source_reseller_status," \
"reseller_name,reseller_average_rating,reseller_description,createdat,updatedat from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_tab')+" where reseller_name='"+str(res[4])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_reseller_store(self, data):
data['store_page_url'] = data['store_page_url'].replace("'","''")
sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url = '"+str(data['store_page_url'])+"'"
self.cur.execute(sql)
res = self.cur.fetchone()
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('reseller_store_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('reseller_store_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')}(id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,rce_source_id)
values({id_main},'{data['rce_source_store_status']}','{data['store_page_url']}','{data['store_page_url_hash']}',{data['rce_reseller_id']},{data['rce_source_id']})
"""
#logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('reseller_store_tab')}(auditid,id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id)
select {id_aud}, id,rce_source_store_status,store_page_url,store_page_url_hash,rce_reseller_id,createdat,updatedat,rce_source_id from {self.config.get('crawler_schema')}.{self.config.get('reseller_store_tab')}
where store_page_url= '{data['store_page_url']}'
"""
#logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_source_store_status'])==str(res[2]) and str(data['store_page_url'])==str(res[3]) and \
str(data['store_page_url_hash'])==str(res[4]) and \
str(data['rce_reseller_id'])==str(res[6]) and str(data['rce_source_id'])==str(res[9]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set updatedat=GETDATE() " \
"where store_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" b where a.id=b.id and b.id = "+str(res[0])
#logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" set " \
"rce_source_store_status="+str(data['rce_source_store_status'])+", store_page_url='"+str(data['store_page_url'])+"', store_page_url_hash=" \
"'"+str(data['store_page_url_hash'])+"',store_location='"+str(data['store_location'])+"', rce_reseller_id="+str(data['rce_reseller_id'])+", " \
"updatedat=GETDATE(), rce_source_id="+str(data['rce_source_id'])+" where store_page_url = '"+ str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('reseller_store_tab')+" (auditid,id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id) select "+id_aud+", id,rce_source_store_status," \
"store_page_url,store_page_url_hash,store_location,rce_reseller_id,createdat,updatedat,rce_source_id from " \
""+self.config.get('crawler_schema')+"."+self.config.get('reseller_store_tab')+" where store_page_url='"+str(res[3])+"'"
#logging.info(sql)
self.cur.execute(sql)
def rce_ratings_reviews(self, data):
sql = f"""
select * from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
"""
self.cur.execute(sql)
res = self.cur.fetchone()
data['username'] = data['username'].replace("'","")
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('review_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('review_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('review_tab')}(id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating)
values({id_main},{data['rce_product_id']},'{data['username']}','{data['review']}','{data['img_url']}',{data['review_like_count']},'{data['user_tier']}',{data['shop_id']},'{data['video_url']}',{data['rating']})
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat)
select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
"""
logging.info(sql)
self.cur.execute(sql)
else:
if str(data['rce_product_id'])==str(res[1]) and str(data['username'])==str(res[2]) and str(data['review'])==str(res[3]) and \
str(data['img_url'])==str(res[4]) and str(data['review_like_count'])==str(res[5]) and str(data['user_tier'])==str(res[6]) and \
str(data['shop_id'])==str(res[7]) and str(data['video_url'])==str(res[8]) and str(data['rating'])==str(res[9]):
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set updatedat=GETDATE() " \
"where rce_product_id = "+ str(res[1])+" and username ='"+res[2]+"'"
logging.info(sql)
self.cur.execute(sql)
sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tab')+" a set updatedat=b.updatedat " \
"from "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" b where a.id=b.id and b.id = "+str(res[0])
logging.info(sql)
self.cur.execute(sql)
else:
sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tab')+" set rce_product_id="+str(data['rce_product_id'])+", " \
"username='"+str(data['username'])+"', review='"+str(data['review'])+"', img_url=" \
"'"+str(data['img_url'])+"',review_like_count="+str(data['review_like_count'])+", user_tier='"+str(data['user_tier'])+"', " \
"shop_id="+str(data['shop_id'])+", video_url='"+str(data['video_url'])+"', rating='"+str(data['rating'])+"', updatedat=GETDATE() " \
"where rce_product_id = "+ str(res[1])+" and username ='"+str(data['username'])+"'"
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('review_tab')}(auditid,id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat)
select {id_aud},id,rce_product_id,username,review,img_url,review_like_count,user_tier,shop_id,video_url,rating,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('review_tab')}
where rce_product_id = {data['rce_product_id']} and username = '{data['username']}'
"""
logging.info(sql)
self.cur.execute(sql)
def rce_seo(self, data):
sql = f"""
select * from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
"""
self.cur.execute(sql)
res = self.cur.fetchone()
id_main = self.get_id(self.config.get('crawler_schema'), self.config.get('seo_tab'))
id_aud = self.get_aud_id(self.config.get('crawler_schema'), "aud_" + self.config.get('seo_tab'))
if not res:
sql = f"""
insert into {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}(id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots)
values({id_main},{data['rce_product_id']},{data['rce_source_id']},'{data['seo_title']}','{data['seo_description']}','{data['seo_url']}','{data['seo_url_hash']}','{data['seo_image']}',{data['seo_price_amount']},'{data['seo_price_currency']}','{data['seo_product_band']}','{data['seo_product_availability']}','{data['seo_product_category']}',
'{data['seo_product_condition']}',{data['seo_product_retailer_item_id']},'{data['seo_product_robots']}')
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat)
select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
else:
if (str(data['rce_product_id']) == str(res[1]) and str(data['rce_source_id']) == str(res[2]) and str(data['seo_title']) == str(res[3]) and \
str(data['seo_description']) == str(res[4]) and str(data['seo_url']) == str(res[5]) and str(data['seo_url_hash']) == str(res[6]) and \
str(data['seo_image']) == str(res[7]) and str(data['seo_price_amount']) == str(res[8]) and str(data['seo_price_currency']) == str(res[9]) and \
str(data['seo_product_band']) == str(res[10])) and str(data['seo_product_availability']) == str(res[11]) and str(data['seo_product_category']) == str(res[12]) and \
str(data['seo_product_condition']) == str(res[13]) and str(data['seo_product_retailer_item_id']) == str(res[14]) and str(data['seo_product_robots']) == str(res[15]):
sql = "update " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " set updatedat=GETDATE() " \
"where rce_product_id = " + str(res[1]) + " and rce_source_id =" + str(data['rce_source_id'])
logging.info(sql)
self.cur.execute(sql)
sql = "update " + self.config.get('crawler_schema') + ".aud_" + self.config.get('seo_tab') + " a set updatedat=b.updatedat " \
"from " + self.config.get('crawler_schema') + "." + self.config.get('seo_tab') + " b where a.id=b.id and b.id = " + str(res[0])
logging.info(sql)
self.cur.execute(sql)
else:
sql = f"""
update {self.config.get('crawler_schema')}.{self.config.get('seo_tab')} set rce_product_id={data['rce_product_id']}, rce_source_id={data['rce_source_id']}, seo_title='{data['seo_title']}', seo_description='{data['seo_description']}',
seo_url='{data['seo_url']}', seo_url_hash='{data['seo_url_hash']}', seo_image='{data['seo_image']}', seo_price_amount='{data['seo_price_amount']}', seo_price_currency='{data['seo_price_currency']}', seo_product_band='{data['seo_product_band']}',
seo_product_availability='{data['seo_product_availability']}', seo_product_category='{data['seo_product_category']}', seo_product_condition='{data['seo_product_condition']}', seo_product_retailer_item_id={data['seo_product_retailer_item_id']},
seo_product_robots='{data['seo_product_robots']}' where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
sql = f"""
insert into {self.config.get('crawler_schema')}.aud_{self.config.get('seo_tab')}(auditid,id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat)
select {id_aud},id,rce_product_id,rce_source_id,seo_title,seo_description,seo_url,seo_url_hash,seo_image,seo_price_amount,seo_price_currency,seo_product_band,seo_product_availability,seo_product_category,seo_product_condition,seo_product_retailer_item_id,seo_product_robots,createdat,updatedat from {self.config.get('crawler_schema')}.{self.config.get('seo_tab')}
where rce_product_id = {data['rce_product_id']} and rce_source_id = {data['rce_source_id']}
"""
logging.info(sql)
self.cur.execute(sql)
# def rce_ratings_reviews_productmodels(self,data):
#
# sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id = "+str(data['rce_rating_id'])
# self.cur.execute(sql)
# res = self.cur.fetchone()
#
#
# if not res:
#
# sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" (rce_rating_id,model_id) " \
# "values("+str(data['rce_rating_id'])+",'"+str(data['model_id'])+"')"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
# "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(data['rce_rating_id'])+""
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# else:
#
# if str(data['rce_rating_id'])==str(res[1]) and str(data['model_id'])==str(res[2]):
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set updatedat=GETDATE() " \
# "where rce_rating_id = "+ str(res[1])
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" a set updatedat=b.updatedat " \
# "from "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" b where a.id=b.id and b.id = "+str(res[0])
# #logging.info(sql)
# self.cur.execute(sql)
# else:
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" set model_id="+str(data['model_id'])+", " \
# "updatedat=GETDATE() where rce_source_store_id = "+ str(res[1])
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_productmodels_tab')+" (id,rce_rating_id,model_id," \
# "createdat,updatedat) select id,rce_rating_id,model_id,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_productmodels_tab')+" where rce_rating_id="+str(res[1])+""
# #logging.info(sql)
#
# self.cur.execute(sql)
#
#
# def rce_tags(self,data):
#
# sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description = '"+str(data['description'])+"'"
# self.cur.execute(sql)
# res = self.cur.fetchone()
#
#
# if not res:
#
# sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" (id,description) " \
# "values("+str(data['id'])+",'"+str(data['description'])+"')"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
# "createdat,updatedat) select id,description,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(data['description'])+"'"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# else:
#
# if str(data['description'])==str(res[1]):
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set updatedat=GETDATE() " \
# "where description = '"+ str(res[1])+"'"
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" a set updatedat=b.updatedat " \
# "from "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
# #logging.info(sql)
# self.cur.execute(sql)
# else:
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" set description='"+str(data['description'])+"', " \
# "updatedat=GETDATE() where description = "+ str(res[1])
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_tags_tab')+" (id,description," \
# "createdat,updatedat) select id,description,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_tags_tab')+" where description='"+str(res[1])+"'"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
#
# def rce_ratings_reviews_producttags(self,data):
#
# sql = "select * from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id = '"+str(data['rce_rating_id'])+"'"
# self.cur.execute(sql)
# res = self.cur.fetchone()
#
#
# if not res:
#
# sql = "insert into "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" (rce_rating_id,tag_ids) " \
# "values("+str(data['rce_rating_id'])+",'"+str(data['tag_ids'])+"')"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
# "createdat,updatedat) select id,rce_rating_id,tag_ids,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where rce_rating_id='"+str(data['rce_rating_id'])+"'"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
# else:
#
# if str(data['rce_rating_id'])==str(res[1]):
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set updatedat=GETDATE() " \
# "where rce_rating_id = '"+ str(res[1])+"'"
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "update "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" a set updatedat=b.updatedat " \
# "from "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" b where a.id=b.id and b.id = "+str(res[0])
# #logging.info(sql)
# self.cur.execute(sql)
# else:
#
# sql = "update "+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" set rce_rating_id='"+str(data['rce_rating_id'])+"', " \
# "updatedat=GETDATE() where rce_rating_id = "+ str(res[1])
# #logging.info(sql)
# self.cur.execute(sql)
#
# sql = "insert into "+self.config.get('crawler_schema')+".aud_"+self.config.get('review_producttags_tab')+" (id,rce_rating_id,tag_ids," \
# "createdat,updatedat) select id,description,createdat,updatedat from " \
# ""+self.config.get('crawler_schema')+"."+self.config.get('review_producttags_tab')+" where description='"+str(res[1])+"'"
# #logging.info(sql)
#
# self.cur.execute(sql)
#
#