-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsocs_queries.py
209 lines (165 loc) · 7.85 KB
/
socs_queries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import xml.etree.ElementTree as ET
import pandas as pd
from parse_example_scos_message import get_queries_from_scos
"""
Author: John Levander
Description:
The purpose of this program is to extract data from a dataframe using a simulator count output specification as the
query specification. This script is in it's early stages of development. As it sits right now, the program will
load a dataset of simulator output and filter out all rows that are about LATENT or INFECTIOUS MALES.
Log:
01/15/2016: Simple example to create dataframe queries given categorical variables. Next step is to support
integer variables and age ranges.
01/19/2016: Started to implement age ranges. Passing this off to Nick Millett for now.
"""
# Sample Simulator Count Output Specifications
all_sick_scos = {}
all_sick_scos["simulator_count_variables"] = {}
all_sick_scos["simulator_count_variables"]['infection_state'] = ["LATENT", "INFECTIOUS"]
all_sick_scos["output_options"] = {}
all_sick_scos["output_options"]['axes'] = {'simulator_time'}
all_sick_by_sex = {}
all_sick_by_sex["simulator_count_variables"] = {}
all_sick_by_sex["simulator_count_variables"]['infection_state'] = ["LATENT", "INFECTIOUS"]
all_sick_by_sex["simulator_count_variables"]['sex'] = ["M"]
all_sick_by_sex_by_age_range = {}
#this is the where clause
all_sick_by_sex_by_age_range["simulator_count_variables"] = {}
all_sick_by_sex_by_age_range["simulator_count_variables"]['infection_state'] = ["LATENT", "INFECTIOUS"]
all_sick_by_sex_by_age_range["simulator_count_variables"]['sex'] = ["M", "F"]
all_sick_by_sex_by_age_range["simulator_count_variables"]['age_range'] = {}
all_sick_by_sex_by_age_range["simulator_count_variables"]['age_range']['driving_age'] = [16,999]
all_sick_by_sex_by_age_range["simulator_count_variables"]['age_range']['voting_age'] = [18,999]
all_sick_by_sex_by_age_range["simulator_count_variables"]['age_range']['retirement_age'] = [65,999]
#this is the select clause
all_sick_by_sex_by_age_range["output_options"] = {}
all_sick_by_sex_by_age_range["output_options"]['axes'] = {'simulator_time', 'age_range'}
all_sick_by_sex_by_age = {}
#this is the where clause
all_sick_by_sex_by_age["simulator_count_variables"] = {}
all_sick_by_sex_by_age["simulator_count_variables"]['infection_state'] = ["LATENT", "INFECTIOUS"]
all_sick_by_sex_by_age["simulator_count_variables"]['sex'] = ["M", "F"]
all_sick_by_sex_by_age["simulator_count_variables"]['integer_age'] = {"min_age" : 30, "max_age" : 40}
#this is the select clause
all_sick_by_sex_by_age["output_options"] = {}
all_sick_by_sex_by_age["output_options"]['axes'] = {'simulator_time', 'integer_age'}
"""
TODO: Add support for these in next version.
all_sick_by_sex_by_age = {}
all_sick_by_sex_by_age["simulator_count_variables"] = {}
all_sick_by_sex_by_age["simulator_count_variables"]['infection_states'] = ["LATENT", "INFECTIOUS"]
all_sick_by_sex_by_age["simulator_count_variables"]['sex'] = ["M", "F"]
all_sick_by_sex_by_age["simulator_count_variables"]['integer_age'] = True
all_sick_by_sex_by_age_state = {}
all_sick_by_sex_by_age_state["simulator_count_variables"] = {}
all_sick_by_sex_by_age_state["simulator_count_variables"]['infection_state'] = ["LATENT", "INFECTIOUS"]
all_sick_by_sex_by_age_state["simulator_count_variables"]['sex'] = ["M", "F"]
all_sick_by_sex_by_age_state["simulator_count_variables"]['integer_age'] = "all"
all_sick_by_sex_by_age_state["simulator_count_variables"]['location_admin1'] = "all"
"""
m_inf = -float('inf')
p_inf = float('inf')
def is_number(s):
try:
float(s)
return True
except ValueError:
return False
def filter_ranges(scos, df):
column_name = ''
if 'age_range' in scos["simulator_count_variables"]:
variable = 'age_range'
column_name = 'integer_age'
elif 'household_median_income' in scos["simulator_count_variables"]:
variable = 'household_median_income'
column_name = 'household_median_income'
else:
return df;
def filter_ranges_min_max(df, min, max):
if min == float('-inf') and max == float('inf'):
# no filtering required
return df
if min == float('-inf'):
query = "(" + column_name + " <= " + str(max) + ")"
elif max == float('inf'):
query = "(" + column_name + " >= " + str(min) + ")"
else:
query = "(" + column_name + " >= " + str(min) + ") & (" + column_name + " <= " + str(max) + ")"
# remove rows with age below min or above max
df = df.query(query)
return df
ranges = scos["simulator_count_variables"][variable]
# get min and max from all ranges
min = float("inf")
max = -float("inf")
for age_range in ranges:
if ranges[age_range]['range'][0] < min:
min = ranges[age_range]['range'][0]
if ranges[age_range]['range'][1] > max:
max = ranges[age_range]['range'][1]
# filter out all ages outside min and max
df = filter_ranges_min_max(df, min, max)
count = 0
for age_range in ranges:
print (age_range + " bin is " + str(ranges[age_range]['range'][0]) + " to " + str(ranges[age_range]['range'][1]))
# get copy of dataframe to filter
dfcopy = df.copy()
dfcopy = filter_ranges_min_max(dfcopy, ranges[age_range]['range'][0], ranges[age_range]['range'][1])
# add the age range column to the data frame
dfcopy['age_range'] = age_range
if count == 0:
newdf = dfcopy
else:
newdf = pd.concat([newdf, dfcopy], axis=0)
count = count + 1
return newdf
"""
This function creates queries in the form of: 'b == ["a", "b", "c"]'
The query selects all rows in the dataframe where column b is equal to the value a b or c.
"""
def create_category_query(df, col_name, vals_to_keep):
query = df.columns[df.columns.get_loc(col_name)] + ' == ['
for val in vals_to_keep:
# see if val is an integer
if is_number(val):
query += val + ","
else:
query += "'" + val + "',"
query = query[:-1]
query += "]"
return query
"""
Here we enforce the WHERE clause. We filter out rows that we do not want, want based on the simulator_count_variables.
For example if the user
only wanted to see the rows that contain data for MALES, it would be specified in the simulator_count_variables, and we
would filter the FEMALES out in this function.
"""
def filter_df(df, scos):
#TODO: deal with age_range categories and integer categories
for simulator_count_variable in scos["simulator_count_variables"]:
if simulator_count_variable != "age_range" and simulator_count_variable != "household_median_income":
query = create_category_query(df, simulator_count_variable, scos['simulator_count_variables'][simulator_count_variable])
df = df.query(query)
return df
def process_output_options(df, socs):
output_axes = scos["output_options"]['axes']
df = df.groupby(list(output_axes))['count'].aggregate(sum)
df = df.to_frame("count")
df = df.reset_index()
return df
if __name__ == '__main__':
tree = ET.parse('/Users/nem41/Documents/code_projects/apollo_projects/example-scos-messages/new_E_total_vacc_by_household_income.xml')
root = tree.getroot()
queries = get_queries_from_scos(root)
#scos = all_sick_by_sex_by_age_range
scos = queries[0]
#load the simulator output into a dataframe
line_listing = pd.read_csv('http://research.rods.pitt.edu/line_listing_new.csv')
# line_listing['species'] = line_listing['species'].astype(str)
#Filter out rows from the dataframe that we don't want
df = filter_df(line_listing, scos)
df = filter_ranges(scos, df)
df = process_output_options(df, scos)
df.to_csv('/Users/nem41/Documents/apollo/python_output_test.csv', sep=',', index=False)
#TODO: Add aggregation options next, once data is filtered
print (df)