-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
130 lines (96 loc) · 4.6 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from asyncore import read
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import sys
import time
import get_data
import click_functions
import parse_inputs
import unzip_and_parse
import os
from flask import Flask, render_template, request, send_file
from threading import Thread
# years = ["2019"]
# Module-level flag; nothing in the visible code reads or updates it.
# NOTE(review): presumably intended to gate deletion of the generated file
# after it has been sent - confirm or remove.
ready_to_delete = False
# Flask application object; the @app.route decorators below register on it.
app = Flask(__name__)
@app.route('/')
def index():
    """Serve the landing page by rendering the ``index.html`` template."""
    landing_page = render_template("index.html")
    return landing_page
def _split_form_list(raw_value):
    """Split a comma-separated form value into a list of non-empty items.

    Whitespace around each item is removed and blank items are dropped, so
    an empty field yields ``[]`` (``"".split(", ")`` would yield ``[""]``)
    and both ``"a,b"`` and ``"a, b"`` yield ``["a", "b"]``.
    """
    pieces = (piece.strip() for piece in raw_value.split(","))
    return [piece for piece in pieces if piece]


@app.route('/get_big_ten_data', methods=['POST'])
def get_big_ten_data():
    """Handle the form POST: collect the requested data and send it back.

    Reads the comma-separated ``unitIDs`` and ``years`` form fields, passes
    them to ``rest_of_code`` and returns the file it reports as a download
    attachment.
    """
    unit_ids = _split_form_list(request.form['unitIDs'])
    years = _split_form_list(request.form['years'])
    filename = rest_of_code(unit_ids, years)
    # NOTE: the generated file is not removed after it has been sent, so
    # files accumulate on disk with every request.
    return send_file(filename, as_attachment=True)
def _clean_years(years):
    """Return the requested years as a list of digit-only strings.

    Surrounding whitespace is stripped and blank entries (for example the
    ``[""]`` produced by splitting an empty string) are dropped, so they
    count as "no year given".

    Raises:
        ValueError: if an entry contains anything other than the digits 0-9.
            Each year is later spliced into an XPath expression and passed
            to ``int()``, so it is rejected here, before any browser starts.
    """
    cleaned = []
    for raw_year in years or []:
        year = str(raw_year).strip()
        if not year:
            continue
        if not all(ch in "0123456789" for ch in year):
            raise ValueError("Invalid year: {!r}".format(raw_year))
        cleaned.append(year)
    return cleaned


def rest_of_code(unitIDs, years):
    """Drive the IPEDS data center in Chrome and collect the requested data.

    Args:
        unitIDs: institution UnitIDs to search for; a list/tuple of IDs or a
            ready-made string.
        years: year strings to fetch. If it contains no usable entry, the
            first year link offered by the site is used instead.

    Returns:
        Whatever ``unzip_and_parse.get_all_data`` returns for the number of
        CSV links that were clicked; the Flask route uses it as the path of
        the file to send.

    Raises:
        ValueError: if an entry in ``years`` is not made up of digits only.
    """
    print(unitIDs)

    wanted_years = _clean_years(years)
    if not wanted_years:
        print("List of years not given, will search latest year")

    # A list handed to execute_script arrives as a JavaScript array, which
    # setAttribute stringifies as its items joined by ","; build that text
    # explicitly instead of relying on the implicit conversion.
    if isinstance(unitIDs, (list, tuple)):
        unit_id_text = ",".join(str(unit_id) for unit_id in unitIDs)
    else:
        unit_id_text = unitIDs

    # temporarily installs correct chromedriver needed for selenium
    chromedriver_autoinstaller.install()
    # defines driver variable for use throughout
    driver = webdriver.Chrome()
    try:
        # opens chrome to the following url
        driver.get('https://nces.ed.gov/ipeds/datacenter/default.aspx?gotoReportId=5&fromIpeds=true')

        # find textbox for searching for institution
        inst_tb = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="tbInstitutionSearch"]')))
        # enter the UnitIDs into the textbox
        driver.execute_script(
            "arguments[0].setAttribute('value',arguments[1])", inst_tb, unit_id_text)
        # click "select" button to search for the UnitIDs
        driver.find_element(By.ID, 'contentPlaceHolder_ibtnSelectInstitutions').click()

        # waits until continue button has loaded in
        continue_btn = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="contentPlaceHolder_ibtnContinue"]')))
        # clicks Check All link
        driver.find_element(By.XPATH, '//span[text()="Check All"]').click()
        # click continue button
        continue_btn.click()

        # click the image link inside the instructions table once it is clickable
        WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable(
                (By.XPATH, '//*[@id="contentPlaceHolder_divInstructions"]/table/tbody/tr/td[2]/a/img'))).click()

        ## finding data matching table as shown here: https://btaa-sitefinity.azurewebsites.net/docs/default-source/research-data/at-a-glance-2019-ipeds-with-2020-rankings-btaa.pdf?sfvrsn=393cfc1b_2
        if not wanted_years:
            # no year requested: click the first year link the page offers
            avail_years = driver.find_elements(By.XPATH, "//a[contains(text(), '20')]")
            avail_years[0].click()
            # the exact value is not important here, it just has to be at least 2019
            get_data.get_data(2021, driver)
        else:
            # get data for every requested year
            for year in wanted_years:
                click_functions.click_something("//a[contains(text(), '" + year + "')]", driver)
                time.sleep(1)
                get_data.get_data(int(year), driver)
                # jump back to the top of the page before handling the next year
                driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.HOME)
                time.sleep(1)

        # click continue
        click_functions.click_something('//*[@id="imgContinueButton"]', driver)

        # find all CSV links that contain all the data for a particular year
        csv_links = driver.find_elements(By.XPATH, '//a[contains(@href,"singleFile=true&command=csv")]')
        # click each year's full data link, pausing after each click
        for csv_link in csv_links:
            csv_link.click()
            time.sleep(5)
        download_count = len(csv_links)
    finally:
        # always close the browser, even when a step above fails, so no
        # Chrome/chromedriver process is left running
        driver.quit()

    return unzip_and_parse.get_all_data(download_count)


# Start the Flask server only when this file is executed as a script. The
# guard sits below every definition so that rest_of_code already exists by
# the time the (blocking) server starts handling requests.
if __name__ == "__main__":
    app.run()