scraper.py
import pandas as pd
import requests
from bs4 import BeautifulSoup
import argparse
from pathlib import Path


def scrape(url):
    """
    Accepts a blog post link and creates/appends a CSV database that maps each
    commenting user to his/her comment, along with the underlying blog name.
    """
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, 'html.parser')  # parse object
    # reading text
    comments = []   # contains every comment
    usernames = []  # contains every user who commented
    # search for comment elements with class 'comment-body' in the html page
    comment_elem = soup.find_all(attrs={'class': 'comment-body'})
    # search for username elements with class 'fn' in the html page
    user_elem = soup.find_all(attrs={'class': 'fn'})
    for item in comment_elem:  # store comments of users
        com_text = item.find_all('p')
        text = ''
        for p in com_text:
            text = text + p.text + ' '
        comments.append(text)
    for user in user_elem:  # store name of users
        usernames.append(user.text)
    usernames.pop(0)  # drop the first captured name so usernames line up with comments
    # dictionary mapping every user name to his/her comment (not used further below)
    Dict = {usr: com for usr, com in zip(usernames, comments)}
    # saving data into csv file
    db = pd.DataFrame({'Blog': url[47:-1].replace('-', ' ').capitalize(),
                       'User': usernames,
                       'Comment': comments})
    if Path('database.csv').is_file():
        with open('database.csv', 'a') as f:
            db.to_csv(f, header=False)
    else:
        db.to_csv('database.csv')
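

# Illustrative use (a sketch, not part of the original script): scrape() assumes
# a full post URL of the form .../ie/YYYY/MM/DD/slug/, since the blog name is
# taken from character 47 onward. The URL below is hypothetical.
#
#   scrape('https://blogs.msdn.microsoft.com/ie/2015/01/01/example-post/')
#
# Each call appends one row per commenter to database.csv.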


def scrape_blog(url):
    """
    Accepts a link and scrapes the blog text into a separate text file, for
    all the blogs up to the specified page limit.
    """
    html_page = requests.get(url)
    soup = BeautifulSoup(html_page.text, 'html.parser')
    blog = soup.find_all(attrs={'class': 'entry-content single'})
    with open(url[47:-1] + '.txt', 'w+') as file:
        for para in blog:
            para_text = para.find_all(['p', 'li'])
            text = ''
            for p in para_text:
                text = text + p.text + ' '
            file.write(text)
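

# Illustrative use (again with a hypothetical URL): scrape_blog() writes the
# post body to a text file named after the URL slug, so the call below would
# produce 'example-post.txt'.
#
#   scrape_blog('https://blogs.msdn.microsoft.com/ie/2015/01/01/example-post/')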


def scrapeParent():
    """
    Pre-processes the blog index page and scrapes the posts found there
    through scrape() and scrape_blog().
    """
    url = 'https://blogs.msdn.microsoft.com/ie/page/'
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-p',
        '--page',
        type=int,
        help='IEBlog database up to a page number'
    )
    parser.add_argument(
        '-b',
        '--blog',
        action='store_true',
        help='Scrape each blog into a txt file'
    )
    args = parser.parse_args()
    blog_url = []
    if args.page:
        response = requests.get(url + str(args.page) + '/')
        data = response.text
        soup = BeautifulSoup(data, 'lxml')
        # every post on the index page is linked with rel="bookmark"
        tags = soup.find_all('a', attrs={'rel': 'bookmark'})
        bookmarks = [tag.get('href') for tag in tags]
        if args.page == 1:
            bookmarks.pop(0)
        comment_count = []
        nums = soup.find_all(attrs={'class': 'comments-link'})
        for num in nums:
            comment_count.append(num.text)
        # threshold for comment extraction of a given blog
        for link, count in zip(bookmarks, comment_count):
            if int(count) > 70:
                blog_url.append(link)
                scrape(link)
    if args.blog:
        for url in blog_url:
            scrape_blog(url)


if __name__ == '__main__':
    scrapeParent()
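

# Example invocation (a sketch; the page number is arbitrary):
#
#   python scraper.py --page 3 --blog
#
# This fetches index page 3 of the IEBlog, appends the comments of every post
# there with more than 70 comments to database.csv, and, because --blog is
# set, also dumps each of those posts' body text into a <slug>.txt file.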