-
Notifications
You must be signed in to change notification settings - Fork 0
/
courseswebscraper.py
173 lines (130 loc) · 7.03 KB
/
courseswebscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import codecs
import bs4 #importing the beautifulsoup library i.e. the webscraping parser
from urllib.request import urlopen as uReq #module that opens URLs
from bs4 import BeautifulSoup as soup #renaming beautiful soup into something easier to type
from string import ascii_uppercase
import string
import csv
#major and requirements scraper
# NOTE(review): this base uses http:// while coursecodescraper uses https:// —
# both appear to resolve via redirect, but confirm and unify.
_CATALOG_BASE = "http://catalog.registrar.ucla.edu/"

def _fetch_page(url):
    """Download *url* and return the parsed BeautifulSoup tree."""
    # context manager guarantees the connection is closed even if read() raises
    with uReq(url) as client:
        return soup(client.read(), 'html.parser')

def _required_text(page):
    """Concatenate the text of every <p> on *page* that mentions 'Required'."""
    # join() instead of repeated string += (quadratic on long pages)
    return "".join(p.text for p in page.findAll('p') if "Required" in p.text)

def majorscraper(catalog_url, majors, requirements):
    """Scrape the UCLA catalog index at *catalog_url*.

    Appends each BA/BS major's page header to *majors* and the concatenated
    text of that page's 'Required' paragraphs to *requirements* (the two
    lists stay parallel: one requirements entry per appended major).
    """
    catalog_page = _fetch_page(catalog_url)
    # every <a class="main"> on the index links to a major/department page
    for major in catalog_page.findAll('a', {'class': "main"}):
        major_page = _fetch_page(_CATALOG_BASE + major.get("href"))
        header = major_page.find('div', {'class': "main-text"}).h1.text
        if " BA" in header or " BS" in header:
            # page is itself a BA/BS major page
            majors.append(header)
            requirements.append(_required_text(major_page))
        else:
            # umbrella department page: follow its BA/BS sub-links one level down
            for sublink in major_page.findAll("a", {"class": "main"}):
                if " BA" in sublink.text or " BS" in sublink.text:
                    sub_page = _fetch_page(_CATALOG_BASE + sublink.get("href"))
                    sub_header = sub_page.find('div', {'class': "main-text"}).h1.text
                    if " BA" in sub_header or " BS" in sub_header:
                        majors.append(sub_header)
                        requirements.append(_required_text(sub_page))
#course name scraper
def coursecodescraper(catalog_url, course_names, course_codes):
    """Scrape department course listings from the catalog page at *catalog_url*.

    Appends each department name (text of '... Courses' links, suffix
    stripped) to *course_names*, and fills *course_codes* with every
    candidate course number: "1".."198" plus letter-suffixed ("1A") and
    letter-prefixed ("A1") variants.
    """
    # context managers close the connections even if read() raises
    with uReq(catalog_url) as client:
        catalog_page = soup(client.read(), 'html.parser')
    # renamed from `list`, which shadowed the builtin
    links = catalog_page.findAll('a', {'class': 'main'})
    for course in links[2:]:  # the first two index links are not listing pages
        with uReq("https://catalog.registrar.ucla.edu/" + course.get("href")) as client:
            link_page = soup(client.read(), 'html.parser')
        for course_page in link_page.findAll('a', {'class': 'main'}):
            course_text = course_page.text
            if "Courses" in course_text:
                course_names.append(course_text.replace(' Courses', ''))
    # Generate candidate course codes once, after scraping.
    # NOTE(review): range(1, 199) stops at 198 — confirm 199 was not intended.
    for x in range(1, 199):
        course_number = str(x)
        course_codes.append(course_number)
        for c in ascii_uppercase:
            course_codes.append(course_number + c)  # e.g. "31A"
            course_codes.append(c + course_number)  # e.g. "M51"
# --- script entry: run both scrapers, then emit the CSV reports ---
majors = []
requirements = []
course_names = []
course_codes = []
majorscraper('https://catalog.registrar.ucla.edu/ucla-catalog18-19-4.html', majors, requirements)
coursecodescraper('https://catalog.registrar.ucla.edu/ucla-catalog18-19-271.html', course_names, course_codes)
# Each `with` block closes its file on exit; the original's trailing
# f.close() calls were redundant and have been removed.

# (major, required department) pairs
with codecs.open("major_requirements.csv", 'w', 'utf8') as f:
    for requirement, major in zip(requirements, majors):
        for course_name in course_names:
            if requirement.find(course_name) != -1:
                f.write('"{}","{}"\n'.format(major, course_name))

# per-major count of required departments
with codecs.open("major_statistics.csv", 'w', 'utf8') as f:
    for requirement, major in zip(requirements, majors):
        required_courses = [name for name in course_names
                            if requirement.find(name) != -1]
        f.write('"{}","{}"\n'.format(major, len(required_courses)))

# per-department count of majors that require it
with codecs.open("course_statistics.csv", 'w', 'utf8') as f:
    for course_name in course_names:
        majors_require = [major for requirement, major in zip(requirements, majors)
                          if requirement.find(course_name) != -1]
        f.write('"{}","{}"\n'.format(course_name, len(majors_require)))

# (department, major) pairs — transpose of the first report
with codecs.open("majors_requiring_course.csv", 'w', 'utf8') as f:
    for course_name in course_names:
        for requirement, major in zip(requirements, majors):
            if requirement.find(course_name) != -1:
                f.write('"{}","{}"\n'.format(course_name, major))

# (major, department, course code) triples
with codecs.open("major_course_codes.csv", 'w', 'utf8') as f:
    for require, major in zip(requirements, majors):
        # strip punctuation so names and codes match as plain words
        requirement = require.translate(str.maketrans('', '', string.punctuation))
        # positions where each department name appears; the span between two
        # consecutive positions is treated as that department's listing
        indexes = [requirement.index(name) for name in course_names
                   if requirement.find(name) != -1]
        indexes.append(len(requirement) - 1)  # sentinel closing the last span
        # BUG FIX: positions were in course_names order, not text order, so
        # spans could be inverted or empty — sort them first.
        indexes.sort()
        # BUG FIX: was range(0, len(indexes) - 2), which dropped the final
        # span; the old compensating search from the sentinel position to
        # len(requirement) - 1 was a degenerate (empty) range and never matched.
        for x in range(len(indexes) - 1):
            for course_name in course_names:
                if requirement.find(course_name, indexes[x], indexes[x + 1]) != -1:
                    for course_code in course_codes:
                        space_course_code = " " + course_code + " "
                        if requirement.find(space_course_code, indexes[x], indexes[x + 1]) != -1:
                            f.write('"{}","{}","{}"\n'.format(major, course_name, course_code))