-
Notifications
You must be signed in to change notification settings - Fork 0
/
utopia.py
39 lines (35 loc) · 1.48 KB
/
utopia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# -*- coding: utf-8 -*-
import scrapy
class UtopiaSpider(scrapy.Spider):
    """Spider that collects coffee reviews from utopia.de.

    The crawl starts at the "best coffees" listing, follows every product
    link found there, and emits one item per review paragraph found on a
    page. Each item is a dict with the keys ``brand``, ``rating`` and
    ``review``.
    """

    name = 'utopia'
    allowed_domains = ['utopia.de']
    # Entry point: the listing of the best coffees.
    start_urls = ['https://utopia.de/bestenlisten/bio-kaffee-fair-trade-kaffee/']

    # Serialized markup of a single star icon. A rating is the number of
    # times this exact string occurs inside one ``div.static-rating`` block.
    _STAR_MARKUP = '<i class="fa fa-star" aria-hidden="true">'

    def parse(self, response):
        """Handle a listing or product page.

        Yields, in order:

        * a ``scrapy.Request`` for every product link in the listing
          headlines (handled by this same callback),
        * one ``dict`` per review paragraph with the page's ``h1`` text as
          ``brand``, the star count as ``rating`` and the cleaned paragraph
          as ``review``,
        * a ``scrapy.Request`` for the first pagination link, if any.

        :param response: the downloaded page.
        """
        product_links = response.css(
            "h3.product-listing__headline a::attr(href)"
        ).extract()
        for link in product_links:
            # hrefs may be relative; Request needs an absolute URL.
            # urljoin leaves already-absolute URLs unchanged.
            yield scrapy.Request(response.urljoin(link), callback=self.parse)

        # Brand name is taken from the first <h1> text node (None if absent).
        brand = response.css("h1::text").extract_first()

        # Raw HTML of every review paragraph on the page.
        reviews = response.css('div.commenttext > p').extract()

        # Star count per rating block, computed once instead of once per
        # review.
        ratings = [
            rating_html.count(self._STAR_MARKUP)
            for rating_html in response.css("div.static-rating").extract()
        ]

        # NOTE(review): pairing by position assumes the i-th rating block
        # belongs to the i-th review paragraph - confirm against the page
        # markup. A count mismatch is reported rather than silently ignored.
        if reviews and len(ratings) != len(reviews):
            self.logger.warning(
                "Found %d reviews but %d rating blocks on %s; "
                "unmatched reviews get rating 0",
                len(reviews), len(ratings), response.url,
            )

        for index, review in enumerate(reviews):
            # Strip the paragraph tags and turn line breaks into spaces.
            text = (
                review.replace('<br>\n', ' ')
                .replace('<p>', '')
                .replace('</p>', '')
            )
            yield {
                "brand": brand,
                # 0 is the fallback when no rating block matches this review.
                "rating": ratings[index] if index < len(ratings) else 0,
                "review": text,
            }

        # Follow pagination.
        # NOTE(review): this takes the FIRST link inside the pagination list;
        # confirm that it is the "next page" link on the target site.
        next_page_url = response.css(
            "ul.pagination > li > a::attr(href)"
        ).extract_first()
        if next_page_url:
            yield scrapy.Request(
                url=response.urljoin(next_page_url), callback=self.parse
            )