-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunzip_and_parse.py
136 lines (119 loc) · 5.74 KB
/
unzip_and_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import shutil
import os
from pathlib import Path
import csv
import numpy as np
import pandas as pd
import time as time
def get_all_data(num_files):
all_data = []
file_offset = 0
downloads_path = Path.home() / "Downloads"
files = sorted(downloads_path.iterdir(), key=os.path.getmtime)
# collect data from each csv file (each file is data for a single year)
for j in range(0, num_files):
# below code makes sure that the file to unzip is one that was downloaded from IPEDs
# and also a file that has not been unzipped previously
file_count = file_offset + 1
file_name = ""
file_dir = ""
while ".zip" not in file_name and "CSV" not in file_name:
file_dir = str(files[len(files) - (file_count)])
file_name = file_dir[file_dir.rfind("\\") + 1:]
file_count += 1
file_offset = file_count - 1
# below code gets the name of the csv that was extracted from zip
# also makes sure file naming is correct
cwd = os.getcwd()
shutil.unpack_archive(file_dir, cwd)
if " " in file_name:
file_name = file_name[:file_name.index(" ")] + file_name[file_name.index("."):]
file_no_ext = file_name.replace(".zip", "")
csv_name = file_no_ext + ".csv"
# opens csv file and puts text into data variable
with open(csv_name, newline='') as f:
reader = csv.reader(f)
data = list(reader)
# defining variables to be used as indexes for data to be referenced later
private_expenses_idx = -1
region_idx = -1
land_grant_idx = -1
research_expenses_idx = -1
# changes each column header from original csv to have correct name
for idx, column in enumerate(data[0]):
if "institution name" in column:
data[0][idx] = "Institution Name"
if "year" in column:
data[0][idx] = "Year"
if "BEA" in column:
data[0][idx] = "Geographic Region per IPEDs"
region_idx = idx
if "Land Grant Institution" in column:
data[0][idx] = "Land Grant Institution"
land_grant_idx = idx
if "Total enrollment" in column:
data[0][idx] = "Total Students (Full-Time & Part-Time)"
if "Full-time enrollment" in column:
data[0][idx] = "Full-Time Students"
if "Full-time undergraduate enrollment" in column:
data[0][idx] = "Full-Time Undergraduates"
if "Full-time graduate enrollment" in column:
data[0][idx] = "Full-Time Graduates"
if "Doctor's degree - research/scholarship" in column:
data[0][idx] = "Research PhDs Awarded"
if "Instructional, research and public service FTE" in column or "FTE staff" in column:
data[0][idx] = "Full-Time Faculty (Instruction, Research, and Public Service)"
if "Instructional FTE" in column:
data[0][idx] = "Full-Time Faculty (Instruction Only)"
if "Research-Total amount" in column:
data[0][idx] = "Research Expenditures (Private School)"
private_expenses_idx = idx
if "Research - Current year total" in column:
data[0][idx] = "Research Expenditures"
research_expenses_idx = idx
# modifies values for data to be what we are looking for
for i in range(1, len(data)):
private_expenses = data[i][private_expenses_idx]
region = data[i][region_idx]
land_grant = data[i][land_grant_idx]
# checks to see if private expense was left blank
# if so, then school is public, otherwise private
# also moves value of private school research expenses column to column for public school, then later deletes private school column
if private_expenses == '':
data[i].append("Public")
else:
data[i].append("Private")
data[i][research_expenses_idx] = private_expenses
# only shows important value for region of school
data[i][region_idx] = region[:region.index("(")].rstrip()
# changes land grant values to just be yes or no
if "Not" in land_grant:
data[i][land_grant_idx] = "No"
else:
data[i][land_grant_idx] = "Yes"
# adds another column to csv for public/private university data
data[0].append("Public/Private University")
# converts data so that entire column/row can be deleted
n = len(max(data, key=len))
data = [x + [None]*(n-len(x)) for x in data]
data = np.array(data)
# deletes private expenses column
data = np.delete(data, private_expenses_idx, axis=1)
# deletes UnitID column
data = np.delete(data, 0, axis=1)
# sets up and adds to variable that stores data from all collected years
if j != 0:
# deletes the first row if adding to data from other years, only need column headers 1 time
data = np.delete(data, 0, 0)
# adds data for currently parsed year to all data
all_data = np.vstack((all_data, data))
else:
# instantiates variable to store all data
all_data = data
# deletes html and csv file that were extracted from zip file, no longer needed
os.remove(file_no_ext + ".csv")
os.remove(file_no_ext + ".html")
# stores all data as a csv
filename = "file" + str(time.time()) + ".csv"
pd.DataFrame(all_data).to_csv(filename,index=False,header=False)
return filename