generated from oracle-devrel/repo-template
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtrending_spider.py
51 lines (38 loc) · 1.46 KB
/
trending_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import scrapy
def build_link(i):
'''
Build the XPath for the link based on the index.
Args:
i (int): Index of the article.
Returns:
str: XPath for the link.
'''
return '/html/body/div[1]/div[4]/main/div[3]/div/div[2]/article[{}]/h2/a/@href'.format(i)
class GithubTrendingSpider(scrapy.Spider):
'''
Spider to crawl GitHub trending page and extract links.
'''
name = 'github_trending'
start_urls = ['https://github.com/trending']
# by default, GitHub will give us 25 results in the trending tab.
def parse(self, response):
'''
Parse the response and extract the links.
Args:
response (scrapy.http.Response): The response received from the request.
'''
link_list = list()
for x in range(1, 26):
link = response.xpath(build_link(x)).getall()
yield {
'link': link
}
link_list.append(link[0])
print(link_list)
with open('output.txt', 'w') as file:
# trick to avoid the last iteration to write \n in our output file.
for x in range(len(link_list)):
if x+1 != len(link_list): # checking if we are in the last iteration
file.write(link_list[x] + '\n')
else: # if we are, just write the link with no newline at the end.
file.write(link_list[x])