Patch summary (text before the first "diff --git" header is ignored by git apply / patch)

  fb_marketplace_main.py
    * crawler_schema: "test_spider_management" -> "raena_spider_management".
    * run_pipeline() now runs FbMarketplaceProducts.getProducts() and comments out
      the FbMarketplaceProductInfo pipeline.
  fb_marketplace_products.py
    * self.url becomes a list of nine Marketplace beauty-products feed URLs and
      getProducts() loops over it, writing one tracker CSV + COPY per feed.
    * S3_BUCKET prefix moves from mongo_order/ to crawler_engine/fb-marketplace/.

Review changes made to the added ("+") lines, hunk line counts unchanged:
    * The COPY statement is no longer printed: it embeds the AWS secret key.
    * Bare "except: pass" is narrowed to "except Exception" and logged at DEBUG.
    * The per-listing href is stored in product_url so it no longer rebinds the
      "for url in self.url" loop variable.

Open review items NOT changed by this patch:
    * SECURITY: an AWS access key / secret pair is hard-coded in __init__ (context
      lines of the "S3 bucket information" hunk). Rotate it and load it from
      config or the environment in a follow-up change.
    * run_pipeline() assigns "url" but nothing reads it while the product-info
      calls stay commented out.

NOTE(review): the source of this patch had its indentation and blank lines
collapsed. They are reconstructed here to match the hunk line counts; confirm
with "git apply --check" (or --ignore-whitespace) before relying on it. The
"index" post-image hashes predate the review changes above.

diff --git a/fb_marketplace/fb_marketplace_main.py b/fb_marketplace/fb_marketplace_main.py
index 1a415ef..9e51ef5 100644
--- a/fb_marketplace/fb_marketplace_main.py
+++ b/fb_marketplace/fb_marketplace_main.py
@@ -9,7 +9,7 @@ logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%
 
 config = {
     "crawler_name": "raena_crawler_engine_fb_marketplace",
-    "crawler_schema": "test_spider_management",
+    "crawler_schema": "raena_spider_management",
     "tracker_tab": "facebook_crawler_tracker",
     "category_tab": "rce_category",
     "product_tab": "rce_product",
@@ -24,12 +24,12 @@ config = {
 }
 
 def run_pipeline ():
-    #marketplace_products = FbMarketplaceProducts(config=config)
-    #marketplace_products.getProducts()
+    marketplace_products = FbMarketplaceProducts(config=config)
+    marketplace_products.getProducts()
 
-    #url = "https://www.facebook.com/marketplace/item/266638329606593?ref=category_feed&referral_code=null&referral_story_type=listing&tracking=%7B%22qid%22%3A%22-2656048044644247513%22%2C%22mf_story_key%22%3A%2224843562218625057%22%2C%22commerce_rank_obj%22%3A%22%7B%5C%22target_id%5C%22%3A24843562218625057%2C%5C%22target_type%5C%22%3A0%2C%5C%22primary_position%5C%22%3A43%2C%5C%22ranking_signature%5C%22%3A6122896018789849685%2C%5C%22commerce_channel%5C%22%3A504%2C%5C%22value%5C%22%3A0.00044445870408613%2C%5C%22candidate_retrieval_source_map%5C%22%3A%7B%5C%2224843562218625057%5C%22%3A3001%7D%7D%22%2C%22ftmd_400706%22%3A%22111112l%22%7D"
-    product_info = FbMarketplaceProductInfo(config)
-    product_info.run_product_pipeline()
+    url = "https://www.facebook.com/marketplace/item/266638329606593?ref=category_feed&referral_code=null&referral_story_type=listing&tracking=%7B%22qid%22%3A%22-2656048044644247513%22%2C%22mf_story_key%22%3A%2224843562218625057%22%2C%22commerce_rank_obj%22%3A%22%7B%5C%22target_id%5C%22%3A24843562218625057%2C%5C%22target_type%5C%22%3A0%2C%5C%22primary_position%5C%22%3A43%2C%5C%22ranking_signature%5C%22%3A6122896018789849685%2C%5C%22commerce_channel%5C%22%3A504%2C%5C%22value%5C%22%3A0.00044445870408613%2C%5C%22candidate_retrieval_source_map%5C%22%3A%7B%5C%2224843562218625057%5C%22%3A3001%7D%7D%22%2C%22ftmd_400706%22%3A%22111112l%22%7D"
+    #product_info = FbMarketplaceProductInfo(config)
+    #product_info.run_product_pipeline()
 
 
 
diff --git a/fb_marketplace/fb_marketplace_products.py b/fb_marketplace/fb_marketplace_products.py
index 353fe26..d114db2 100644
--- a/fb_marketplace/fb_marketplace_products.py
+++ b/fb_marketplace/fb_marketplace_products.py
@@ -16,7 +16,17 @@ logging.basicConfig(format=format, level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%
 class FbMarketplaceProducts:
     def __init__(self, config):
         self.config = config
-        self.url = "https://www.facebook.com/marketplace/hochiminhcity/beauty-products"
+        self.url = [
+            "https://www.facebook.com/marketplace/hochiminhcity/beauty-products",
+            "https://www.facebook.com/marketplace/106388046062960/beauty-products/?exact=false",
+            "https://www.facebook.com/marketplace/111711568847056/beauty-products/?exact=false",
+            "https://www.facebook.com/marketplace/107751605926185/beauty-products/?exact=false",
+            "https://www.facebook.com/marketplace/108121405876426/beauty-products/?exact=false",
+            "https://www.facebook.com/marketplace/109205905763791/beauty-products/?exact=false",
+            "https://www.facebook.com/marketplace/109477335744202/beauty-products/?exact=false",
+            "https://www.facebook.com/marketplace/114668461883395/beauty-products/?exact=false",
+            "https://www.facebook.com/marketplace/112933135384816/beauty-products/?exact=false"
+        ]
 
         self.conn = psycopg2.connect(database=self.config.get('database'), user=self.config.get('db_user'),
                                      password=self.config.get('db_pass'), host=self.config.get('db_host'),
@@ -29,7 +39,7 @@ class FbMarketplaceProducts:
         ###### S3 bucket information
         self.S3_KEY = "AKIAR2YL57QCWO34OE4J"
         self.S3_SECRET = "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"
-        self.S3_BUCKET = "s3://raena-prod-data-engineering/mongo_order/"
+        self.S3_BUCKET = "s3://raena-prod-data-engineering/crawler_engine/fb-marketplace/"
 
         self.version_main = 120
         self.driver = Chrome(options=op, version_main=self.version_main)
@@ -38,53 +48,55 @@ class FbMarketplaceProducts:
         self.driver.quit()
 
     def getProducts(self):
-        self.driver.get(self.url)
-        self.driver.execute_script("document.body.style.zoom='25%'")
-        time.sleep(10)
-        ##### Scrolling
-        #smartScroll(self.driver, stopAtBorder=True, distancePerSecond=2000, humanBreaks=True, timeout=10, stopWhenDocHeightNotChangedSince=60)
+        for url in self.url:
+            logging.info("Getting products of {}".format(url))
+            self.driver.get(url)
+            self.driver.execute_script("document.body.style.zoom='25%'")
+            time.sleep(10)
+            ##### Scrolling
+            #smartScroll(self.driver, stopAtBorder=True, distancePerSecond=2000, humanBreaks=True, timeout=10, stopWhenDocHeightNotChangedSince=60)
 
-        # Scroll down using Page Down key
-        # html_element = self.driver.find_element(By.TAG_NAME, 'html')
-        # for _ in range(1000):
-        #     html_element.send_keys(Keys.PAGE_DOWN)
-        #     time.sleep(random.randint(1,3))
+            ######## Scroll down using Page Down key
+            # html_element = self.driver.find_element(By.TAG_NAME, 'html')
+            # for _ in range(1000):
+            #     html_element.send_keys(Keys.PAGE_DOWN)
+            #     time.sleep(random.randint(1,3))
 
-        product_elements = self.driver.find_elements(By.CSS_SELECTOR,".x9f619.x78zum5.x1r8uery.xdt5ytf.x1iyjqo2.xs83m0k.x1e558r4.x150jy0e.x1iorvi4.xjkvuk6.xnpuxes.x291uyu.x1uepa24")
+            product_elements = self.driver.find_elements(By.CSS_SELECTOR,".x9f619.x78zum5.x1r8uery.xdt5ytf.x1iyjqo2.xs83m0k.x1e558r4.x150jy0e.x1iorvi4.xjkvuk6.xnpuxes.x291uyu.x1uepa24")
 
-        products = []
-        for element in product_elements:
-            try:
-                name = element.find_element(By.CSS_SELECTOR, ".x1lliihq.x6ikm8r.x10wlt62.x1n2onr6").text
-                price = element.find_element(By.CSS_SELECTOR, ".x193iq5w.xeuugli.x13faqbe.x1vvkbs.xlh3980.xvmahel.x1n0sxbx.x1lliihq.x1s928wv.xhkezso.x1gmr53x.x1cpjm7i.x1fgarty.x1943h6x.x4zkp8e.x3x7a5m.x1lkfr7t.x1lbecb7.x1s688f.xzsf02u").text
-                url = element.find_element(By.TAG_NAME, 'a').get_attribute("href")
-                city = element.find_element(By.CSS_SELECTOR, ".x1lliihq.x6ikm8r.x10wlt62.x1n2onr6.xlyipyv.xuxw1ft").text
-                flag = 0
-                product = [name, price, url, city, flag]
-                products.append(product)
-            except:
-                pass
+            products = []
+            for element in product_elements:
+                try:
+                    name = element.find_element(By.CSS_SELECTOR, ".x1lliihq.x6ikm8r.x10wlt62.x1n2onr6").text
+                    price = element.find_element(By.CSS_SELECTOR, ".x193iq5w.xeuugli.x13faqbe.x1vvkbs.xlh3980.xvmahel.x1n0sxbx.x1lliihq.x1s928wv.xhkezso.x1gmr53x.x1cpjm7i.x1fgarty.x1943h6x.x4zkp8e.x3x7a5m.x1lkfr7t.x1lbecb7.x1s688f.xzsf02u").text
+                    product_url = element.find_element(By.TAG_NAME, 'a').get_attribute("href")  # not "url": that name is the feed being iterated
+                    city = element.find_element(By.CSS_SELECTOR, ".x1lliihq.x6ikm8r.x10wlt62.x1n2onr6.xlyipyv.xuxw1ft").text
+                    flag = 0
+                    product = [name, price, product_url, city, flag]
+                    products.append(product)
+                except Exception:  # still best-effort per card, but no longer swallows KeyboardInterrupt/SystemExit
+                    logging.debug("Skipping listing card with missing fields", exc_info=True)
 
-        print(str(len(products)) + " products found")
-        df = pd.DataFrame(products, columns=["Name", "Price", "Url", "City", "Flag"])
-        df = df.drop_duplicates()
-        df['Price'] = df['Price'].replace('Free', '0')
-        df['Price'] = df['Price'].str.replace('₫', '')
-        df['Price'] = df['Price'].str.replace(',', '')
+            print(str(len(products)) + " products found")
+            df = pd.DataFrame(products, columns=["Name", "Price", "Url", "City", "Flag"])
+            df = df.drop_duplicates()
+            df['Price'] = df['Price'].replace('Free', '0')
+            df['Price'] = df['Price'].str.replace('₫', '')
+            df['Price'] = df['Price'].str.replace(',', '')
 
-        filename = 'facebook_crawler_tracker' + str(datetime.now().strftime('%Y%m%d%H%M%S')) + ".csv"
+            filename = 'facebook_crawler_tracker' + str(datetime.now().strftime('%Y%m%d%H%M%S')) + ".csv"
 
-        df.to_csv(self.S3_BUCKET + filename,
-                  storage_options={'key': self.S3_KEY, 'secret': self.S3_SECRET},
-                  index=False,
-                  header=False,
-                  mode='a')
+            df.to_csv(self.S3_BUCKET + filename,
+                      storage_options={'key': self.S3_KEY, 'secret': self.S3_SECRET},
+                      index=False,
+                      header=False,
+                      mode='a')
 
-        table = f"""{self.config.get('crawler_schema')}.{self.config.get('tracker_tab')}"""
+            table = f"""{self.config.get('crawler_schema')}.{self.config.get('tracker_tab')}"""
 
-        sql = "COPY " + table + " FROM '" + self.S3_BUCKET + filename + \
-              "' credentials 'aws_access_key_id=" + self.S3_KEY + ";aws_secret_access_key=" + self.S3_SECRET + "' CSV"
+            sql = "COPY " + table + " FROM '" + self.S3_BUCKET + filename + \
+                  "' credentials 'aws_access_key_id=" + self.S3_KEY + ";aws_secret_access_key=" + self.S3_SECRET + "' CSV"
 
-        print("Syncing SQL is: {}".format(sql))
-        self.cur.execute(sql)
+            logging.info("Copying %s into %s", filename, table)  # never log `sql`: it embeds the AWS secret key
+            self.cur.execute(sql)
 