FB Marketplace crawler

This commit is contained in:
Shariar Imtiaz 2024-02-01 12:22:05 +04:00
parent 5d1e79452b
commit 9940025b2b
2 changed files with 60 additions and 48 deletions

View File

@ -9,7 +9,7 @@ logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%
config = {
"crawler_name": "raena_crawler_engine_fb_marketplace",
"crawler_schema": "test_spider_management",
"crawler_schema": "raena_spider_management",
"tracker_tab": "facebook_crawler_tracker",
"category_tab": "rce_category",
"product_tab": "rce_product",
@ -24,12 +24,12 @@ config = {
}
def run_pipeline ():
#marketplace_products = FbMarketplaceProducts(config=config)
#marketplace_products.getProducts()
marketplace_products = FbMarketplaceProducts(config=config)
marketplace_products.getProducts()
#url = "https://www.facebook.com/marketplace/item/266638329606593?ref=category_feed&referral_code=null&referral_story_type=listing&tracking=%7B%22qid%22%3A%22-2656048044644247513%22%2C%22mf_story_key%22%3A%2224843562218625057%22%2C%22commerce_rank_obj%22%3A%22%7B%5C%22target_id%5C%22%3A24843562218625057%2C%5C%22target_type%5C%22%3A0%2C%5C%22primary_position%5C%22%3A43%2C%5C%22ranking_signature%5C%22%3A6122896018789849685%2C%5C%22commerce_channel%5C%22%3A504%2C%5C%22value%5C%22%3A0.00044445870408613%2C%5C%22candidate_retrieval_source_map%5C%22%3A%7B%5C%2224843562218625057%5C%22%3A3001%7D%7D%22%2C%22ftmd_400706%22%3A%22111112l%22%7D"
product_info = FbMarketplaceProductInfo(config)
product_info.run_product_pipeline()
url = "https://www.facebook.com/marketplace/item/266638329606593?ref=category_feed&referral_code=null&referral_story_type=listing&tracking=%7B%22qid%22%3A%22-2656048044644247513%22%2C%22mf_story_key%22%3A%2224843562218625057%22%2C%22commerce_rank_obj%22%3A%22%7B%5C%22target_id%5C%22%3A24843562218625057%2C%5C%22target_type%5C%22%3A0%2C%5C%22primary_position%5C%22%3A43%2C%5C%22ranking_signature%5C%22%3A6122896018789849685%2C%5C%22commerce_channel%5C%22%3A504%2C%5C%22value%5C%22%3A0.00044445870408613%2C%5C%22candidate_retrieval_source_map%5C%22%3A%7B%5C%2224843562218625057%5C%22%3A3001%7D%7D%22%2C%22ftmd_400706%22%3A%22111112l%22%7D"
#product_info = FbMarketplaceProductInfo(config)
#product_info.run_product_pipeline()

View File

@ -16,7 +16,17 @@ logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%
class FbMarketplaceProducts:
def __init__(self, config):
self.config = config
self.url = "https://www.facebook.com/marketplace/hochiminhcity/beauty-products"
self.url = [
"https://www.facebook.com/marketplace/hochiminhcity/beauty-products",
"https://www.facebook.com/marketplace/106388046062960/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/111711568847056/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/107751605926185/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/108121405876426/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/109205905763791/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/109477335744202/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/114668461883395/beauty-products/?exact=false",
"https://www.facebook.com/marketplace/112933135384816/beauty-products/?exact=false"
]
self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
password=self.config.get('db_pass'), host=self.config.get('db_host'),
@ -29,7 +39,7 @@ class FbMarketplaceProducts:
###### S3 bucket information
self.S3_KEY = "AKIAR2YL57QCWO34OE4J"
self.S3_SECRET = "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"
self.S3_BUCKET = "s3://raena-prod-data-engineering/mongo_order/"
self.S3_BUCKET = "s3://raena-prod-data-engineering/crawler_engine/fb-marketplace/"
self.version_main = 120
self.driver = Chrome(options=op, version_main=self.version_main)
@ -38,13 +48,15 @@ class FbMarketplaceProducts:
self.driver.quit()
def getProducts(self):
self.driver.get(self.url)
for url in self.url:
logging.info("Getting products of {}".format(url))
self.driver.get(url)
self.driver.execute_script("document.body.style.zoom='25%'")
time.sleep(10)
##### Scrolling
#smartScroll(self.driver, stopAtBorder=True, distancePerSecond=2000, humanBreaks=True, timeout=10, stopWhenDocHeightNotChangedSince=60)
# Scroll down using Page Down key
######## Scroll down using Page Down key
# html_element = self.driver.find_element(By.TAG_NAME, 'html')
# for _ in range(1000):
# html_element.send_keys(Keys.PAGE_DOWN)