FB Marketplace crawler

This commit is contained in:
Shariar Imtiaz 2024-02-01 12:22:05 +04:00
parent 5d1e79452b
commit 9940025b2b
2 changed files with 60 additions and 48 deletions

View File

@ -9,7 +9,7 @@ logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%
config = {
"crawler_name": "raena_crawler_engine_fb_marketplace",
"crawler_schema": "test_spider_management",
"crawler_schema": "raena_spider_management",
"tracker_tab": "facebook_crawler_tracker",
"category_tab": "rce_category",
"product_tab": "rce_product",
@ -24,12 +24,12 @@ config = {
}
def run_pipeline ():
#marketplace_products = FbMarketplaceProducts(config=config)
#marketplace_products.getProducts()
marketplace_products = FbMarketplaceProducts(config=config)
marketplace_products.getProducts()
#url = "https://www.facebook.com/marketplace/item/266638329606593?ref=category_feed&referral_code=null&referral_story_type=listing&tracking=%7B%22qid%22%3A%22-2656048044644247513%22%2C%22mf_story_key%22%3A%2224843562218625057%22%2C%22commerce_rank_obj%22%3A%22%7B%5C%22target_id%5C%22%3A24843562218625057%2C%5C%22target_type%5C%22%3A0%2C%5C%22primary_position%5C%22%3A43%2C%5C%22ranking_signature%5C%22%3A6122896018789849685%2C%5C%22commerce_channel%5C%22%3A504%2C%5C%22value%5C%22%3A0.00044445870408613%2C%5C%22candidate_retrieval_source_map%5C%22%3A%7B%5C%2224843562218625057%5C%22%3A3001%7D%7D%22%2C%22ftmd_400706%22%3A%22111112l%22%7D"
product_info = FbMarketplaceProductInfo(config)
product_info.run_product_pipeline()
url = "https://www.facebook.com/marketplace/item/266638329606593?ref=category_feed&referral_code=null&referral_story_type=listing&tracking=%7B%22qid%22%3A%22-2656048044644247513%22%2C%22mf_story_key%22%3A%2224843562218625057%22%2C%22commerce_rank_obj%22%3A%22%7B%5C%22target_id%5C%22%3A24843562218625057%2C%5C%22target_type%5C%22%3A0%2C%5C%22primary_position%5C%22%3A43%2C%5C%22ranking_signature%5C%22%3A6122896018789849685%2C%5C%22commerce_channel%5C%22%3A504%2C%5C%22value%5C%22%3A0.00044445870408613%2C%5C%22candidate_retrieval_source_map%5C%22%3A%7B%5C%2224843562218625057%5C%22%3A3001%7D%7D%22%2C%22ftmd_400706%22%3A%22111112l%22%7D"
#product_info = FbMarketplaceProductInfo(config)
#product_info.run_product_pipeline()

View File

@ -16,7 +16,17 @@ logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%
class FbMarketplaceProducts:
def __init__(self, config):
self.config = config
self.url = "https://www.facebook.com/marketplace/hochiminhcity/beauty-products"
self.url = [
"https://www.facebook.com/marketplace/hochiminhcity/beauty-products",
"https://www.facebook.com/marketplace/106388046062960/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/111711568847056/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/107751605926185/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/108121405876426/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/109205905763791/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/109477335744202/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/114668461883395/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/112933135384816/beauty-products/?exact=false"
]
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
password=self.config.get('db_pass'), host=self.config.get('db_host'),
@ -29,7 +39,7 @@ class FbMarketplaceProducts:
###### S3 bucket information
self.S3_KEY = "AKIAR2YL57QCWO34OE4J"
self.S3_SECRET = "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"
self.S3_BUCKET = "s3://raena-prod-data-engineering/mongo_order/"
self.S3_BUCKET = "s3://raena-prod-data-engineering/crawler_engine/fb-marketplace/"
self.version_main = 120
self.driver = Chrome(options=op, version_main=self.version_main)
@ -38,53 +48,55 @@ class FbMarketplaceProducts:
self.driver.quit()
def getProducts(self):
self.driver.get(self.url)
self.driver.execute_script("document.body.style.zoom='25%'")
time.sleep(10)
##### Scrolling
#smartScroll(self.driver, stopAtBorder=True, distancePerSecond=2000, humanBreaks=True, timeout=10, stopWhenDocHeightNotChangedSince=60)
for url in self.url:
logging.info("Getting products of {}".format(url))
self.driver.get(url)
self.driver.execute_script("document.body.style.zoom='25%'")
time.sleep(10)
##### Scrolling
#smartScroll(self.driver, stopAtBorder=True, distancePerSecond=2000, humanBreaks=True, timeout=10, stopWhenDocHeightNotChangedSince=60)
# Scroll down using Page Down key
# html_element = self.driver.find_element(By.TAG_NAME, 'html')
# for _ in range(1000):
# html_element.send_keys(Keys.PAGE_DOWN)
# time.sleep(random.randint(1,3))
######## Scroll down using Page Down key
# html_element = self.driver.find_element(By.TAG_NAME, 'html')
# for _ in range(1000):
# html_element.send_keys(Keys.PAGE_DOWN)
# time.sleep(random.randint(1,3))
product_elements = self.driver.find_elements(By.CSS_SELECTOR,".x9f619.x78zum5.x1r8uery.xdt5ytf.x1iyjqo2.xs83m0k.x1e558r4.x150jy0e.x1iorvi4.xjkvuk6.xnpuxes.x291uyu.x1uepa24")
product_elements = self.driver.find_elements(By.CSS_SELECTOR,".x9f619.x78zum5.x1r8uery.xdt5ytf.x1iyjqo2.xs83m0k.x1e558r4.x150jy0e.x1iorvi4.xjkvuk6.xnpuxes.x291uyu.x1uepa24")
products = []
for element in product_elements:
try:
name = element.find_element(By.CSS_SELECTOR, ".x1lliihq.x6ikm8r.x10wlt62.x1n2onr6").text
price = element.find_element(By.CSS_SELECTOR, ".x193iq5w.xeuugli.x13faqbe.x1vvkbs.xlh3980.xvmahel.x1n0sxbx.x1lliihq.x1s928wv.xhkezso.x1gmr53x.x1cpjm7i.x1fgarty.x1943h6x.x4zkp8e.x3x7a5m.x1lkfr7t.x1lbecb7.x1s688f.xzsf02u").text
url = element.find_element(By.TAG_NAME, 'a').get_attribute("href")
city = element.find_element(By.CSS_SELECTOR, ".x1lliihq.x6ikm8r.x10wlt62.x1n2onr6.xlyipyv.xuxw1ft").text
flag = 0
product = [name, price, url, city, flag]
products.append(product)
except:
pass
products = []
for element in product_elements:
try:
name = element.find_element(By.CSS_SELECTOR, ".x1lliihq.x6ikm8r.x10wlt62.x1n2onr6").text
price = element.find_element(By.CSS_SELECTOR, ".x193iq5w.xeuugli.x13faqbe.x1vvkbs.xlh3980.xvmahel.x1n0sxbx.x1lliihq.x1s928wv.xhkezso.x1gmr53x.x1cpjm7i.x1fgarty.x1943h6x.x4zkp8e.x3x7a5m.x1lkfr7t.x1lbecb7.x1s688f.xzsf02u").text
url = element.find_element(By.TAG_NAME, 'a').get_attribute("href")
city = element.find_element(By.CSS_SELECTOR, ".x1lliihq.x6ikm8r.x10wlt62.x1n2onr6.xlyipyv.xuxw1ft").text
flag = 0
product = [name, price, url, city, flag]
products.append(product)
except:
pass
print(str(len(products)) + " products found")
df = pd.DataFrame(products, columns=["Name", "Price", "Url", "City", "Flag"])
df = df.drop_duplicates()
df['Price'] = df['Price'].replace('Free', '0')
df['Price'] = df['Price'].str.replace('', '')
df['Price'] = df['Price'].str.replace(',', '')
print(str(len(products)) + " products found")
df = pd.DataFrame(products, columns=["Name", "Price", "Url", "City", "Flag"])
df = df.drop_duplicates()
df['Price'] = df['Price'].replace('Free', '0')
df['Price'] = df['Price'].str.replace('', '')
df['Price'] = df['Price'].str.replace(',', '')
filename = 'facebook_crawler_tracker' + str(datetime.now().strftime('%Y%m%d%H%M%S')) + ".csv"
filename = 'facebook_crawler_tracker' + str(datetime.now().strftime('%Y%m%d%H%M%S')) + ".csv"
df.to_csv(self.S3_BUCKET + filename,
storage_options={'key': self.S3_KEY, 'secret': self.S3_SECRET},
index=False,
header=False,
mode='a')
df.to_csv(self.S3_BUCKET + filename,
storage_options={'key': self.S3_KEY, 'secret': self.S3_SECRET},
index=False,
header=False,
mode='a')
table = f"""{self.config.get('crawler_schema')}.{self.config.get('tracker_tab')}"""
table = f"""{self.config.get('crawler_schema')}.{self.config.get('tracker_tab')}"""
sql = "COPY " + table + " FROM '" + self.S3_BUCKET + filename + \
"' credentials 'aws_access_key_id=" + self.S3_KEY + ";aws_secret_access_key=" + self.S3_SECRET + "' CSV"
sql = "COPY " + table + " FROM '" + self.S3_BUCKET + filename + \
"' credentials 'aws_access_key_id=" + self.S3_KEY + ";aws_secret_access_key=" + self.S3_SECRET + "' CSV"
print("Syncing SQL is: {}".format(sql))
self.cur.execute(sql)
print("Syncing SQL is: {}".format(sql))
self.cur.execute(sql)