raena-crawler-engine/oliveyoung_crawler/spiders/oliveyoung_bk.py

# oliveyoung.py
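"""Scrapy spider for the Olive Young Global homepage.

Crawls the configured homepage sections (currently only "Best Sellers") and
yields brand name, product name, price, and section for each product. A
module-level helper for fetching TikTok hashtag view counts is included, but
its call sites are commented out.
"""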
import scrapy
import requests


class OliveYoungSpider(scrapy.Spider):
    name = 'oliveyoung_bk'
    start_urls = [
        'https://global.oliveyoung.com/?gad=1&gclid=CjwKCAjwq4imBhBQEiwA9Nx1Bi5w7mSF9wgKTFqfX37hyG_c3ocYHldGoXbIX1XfYKQQFxLOPECJCxoCxpEQAvD_BwE'
    ]

    def parse(self, response):
        sections = {
            "Best Sellers": "//div[@class='slick-slider-customized']/div[contains(@class,'slick-slide')]",
            # "MD's Pick": "//section[@id='md_pick']/div[@class='item']/div[@class='product-item']",
            # "Featured Brands": "//section[@id='brand_list']/div[@class='product-item']",
            # "K-Pop": "//section[@id='kpop_list']/div[@class='product-item']",
            # "INNISFREE": "//section[@id='brand_zone']/div[contains(@class,'brand-inn-store')]//div["
            #              "@class='product-item']",
            # "Recommendation": "//section[@id='recommendation']/div[contains(@class,'product-item')]",
        }
        # Extract data from each section
        for section_name, section_xpath in sections.items():
            products = response.xpath(section_xpath)
            for product in products:
                brand_name = product.xpath(".//span[@class='brand']/text()").get()
                product_name = product.xpath(".//span[@class='name']/text()").get()
                price = product.xpath(".//span[@class='num']/text()").get()
                if brand_name:
                    yield {
                        "brand_name": brand_name.strip(),
                        # Guard against missing nodes so .strip() is never called on None
                        "product_name": (product_name or "").strip(),
                        "price": (price or "").strip(),
                        "section": section_name,
                    }
                    # # Generate hashtags for each brand name
                    # hashtags = [word.lower() for word in brand_name.split()]
                    # hashtags = '#'.join(hashtags)
                    # yield {
                    #     "brand_name": brand_name.strip(),
                    #     "hashtags": f"#{hashtags}",
                    # }
                    #
                    # # Fetch views data from TikTok API using tiktok_api.py
                    # views_all, views = get_hashtag_views(hashtags)
                    # yield {
                    #     "brand_name": brand_name.strip(),
                    #     "hashtags": f"#{hashtags}",
                    #     "views_all": views_all,
                    #     "views": views,
                    # }


def get_hashtag_views(hashtag):
    """Return (video_views_all, video_views) for a hashtag from TikTok's Creative Radar API."""
    url = f'https://ads.tiktok.com/creative_radar_api/v1/popular_trend/hashtag/detail?period=7&hashtag_name={hashtag}&country_code=IS'
    headers = {
        # Add the headers from the CURL request here
    }
    # Time-box the request and surface HTTP errors instead of parsing an error body
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()
    hashtag_data = data.get('hashtag', {})
    return hashtag_data.get('video_views_all', 0), hashtag_data.get('video_views', 0)
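
# Example usage (a sketch; assumes this file sits in a standard Scrapy project):
#   scrapy crawl oliveyoung_bk -o best_sellers.json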