# -*- coding: utf-8 -*-
"""
Created on Thu Jul 3 12:52:31 2014

@author: brian
"""
# Note: this script uses Python 2 syntax (tuple parameter unpacking in computeMonth(),
# csv reader .next()).
import csv
import os
from collections import defaultdict
from multiprocessing import Pool

# Project-local modules; these wildcard imports are expected to provide Trip and logMsg,
# which are used below but not defined in this file.
from grid import *
from regions import *

NUM_PROCESSORS = 8
# Class which represents a discrete histogram of some particular feature.
# The bins are specified upfront, and counts are incremented via the record() method.
class Histogram:
    # A simple constructor for the histogram which specifies the name and the bins
    # Arguments:
    #   name - a descriptive name for this histogram
    #   granularity - the bin size
    #   lower_bound - the lowest possible value for this histogram to track
    #   upper_bound - the highest possible value for this histogram to track
    # Note: the total number of bins will be (upper_bound - lower_bound) / granularity
    def __init__(self, name, granularity, lower_bound=float('-inf'), upper_bound=float('inf')):
        self.name = name
        self.granularity = granularity
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.counts = defaultdict(int)

    # Record a value in the histogram by incrementing the appropriate bin
    # Arguments:
    #   value - the feature value to record
    def record(self, value):
        # Truncate the value to a multiple of the granularity to find the appropriate bin,
        # then clamp it to the configured bounds
        rounded = int(value / self.granularity) * self.granularity
        rounded = max(self.lower_bound, rounded)
        rounded = min(self.upper_bound, rounded)
        # Increment that bin
        self.counts[rounded] += 1
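
    # A worked example of the binning above (hypothetical values): with granularity 5, a value
    # of 612 truncates to the 610 bin (int(612 / 5) * 5 = 610), and with lower_bound=0 a
    # negative value such as -3 is clamped up into the 0 bin.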

    # Saves the histogram to a CSV file - the first column contains values, the second contains frequencies
    # Arguments:
    #   filename - the name of the file to output
    def saveToFile(self, filename):
        w = csv.writer(open(filename, 'w'))
        w.writerow([self.name, 'frequency'])
        # Loop through values in order and output their frequencies
        for val in sorted(self.counts):
            w.writerow([val, self.counts[val]])
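
# A minimal usage sketch of the Histogram class (made-up values, not part of the pipeline below).
# With granularity .01 and bounds [-80, -70] there are (-70 - (-80)) / .01 = 1000 possible bins:
#   h = Histogram('lon', .01, lower_bound=-80, upper_bound=-70)
#   h.record(-73.987)   # lands in the -73.98 bin
#   h.record(-73.984)   # also lands in the -73.98 bin
#   h.record(-120.0)    # clamped to the lower bound, so it lands in the -80 bin
#   h.saveToFile('lon_example.csv')   # writes two columns: lon, frequency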


# A simple iterator which gives the (year, month) tuples in the data range
def monthIterator():
    for year in range(2010, 2014):
        for month in range(1, 13):
            yield (year, month)
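
# For reference, the iterator above yields 48 pairs: (2010, 1), (2010, 2), ..., (2013, 12).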


# Computes feature histograms for a given month of the dataset
# Arguments:
#   (year, month) - see monthIterator()
# Returns:
#   a tuple of Histograms - one for each type of feature
def computeMonth((year, month)):
    # Initialize the histograms with reasonable ranges and granularities
    # (judging by the bin ranges, times appear to be in seconds and distances in miles)
    hist_lon = Histogram('lon', .01, lower_bound=-80, upper_bound=-70)
    hist_lat = Histogram('lat', .01, lower_bound=35, upper_bound=45)
    hist_straightline = Histogram('straightline', .01, lower_bound=0, upper_bound=100)
    hist_time = Histogram('time', 5, lower_bound=0, upper_bound=10 * 3600)
    hist_minutes = Histogram('minutes', 60, lower_bound=0, upper_bound=10 * 3600)
    hist_dist = Histogram('distance', .1, lower_bound=0, upper_bound=100)
    hist_miles = Histogram('miles', 1, lower_bound=0, upper_bound=100)
    hist_winding = Histogram('winding', .01, lower_bound=0, upper_bound=100)
    hist_pace = Histogram('pace', 5, lower_bound=0, upper_bound=10 * 3600)

    # The name of the trip file that will be read
    filename = "../new_chron/FOIL" + str(year) + "/trip_data_" + str(month) + ".csv"
    logMsg("Reading file " + filename)
    r = csv.reader(open(filename, "r"))
    # Read the first line from the input file and use it to initialize the header
    Trip.initHeader(r.next())

    i = 0
    for line in r:
        try:
            # Parse the line into a trip
            trip = Trip(line)

            # Record longitudes for both the start and end of the trip
            hist_lon.record(trip.fromLon)
            hist_lon.record(trip.toLon)
            # Record latitudes for both the start and end of the trip
            hist_lat.record(trip.fromLat)
            hist_lat.record(trip.toLat)
            # Record the straight-line distance
            hist_straightline.record(trip.straight_line_dist)
            # Record the trip time (and the trip time binned to whole minutes)
            hist_time.record(trip.time)
            hist_minutes.record(trip.time)
            # Record the trip distance (and the trip distance binned to whole miles)
            hist_dist.record(trip.dist)
            hist_miles.record(trip.dist)
            # Record the winding factor
            hist_winding.record(trip.winding_factor)
            # Record the pace (time per unit distance) if it is defined
            if trip.dist > 0:
                hist_pace.record(float(trip.time) / trip.dist)
        except ValueError:
            # Parse error - move on to the next line
            pass

        # Periodic progress output
        i += 1
        if i % 1000000 == 0:
            logMsg(filename + " read " + str(i) + " rows")

    # Return the tuple of histograms
    return (hist_lon, hist_lat, hist_straightline, hist_time, hist_minutes, hist_dist, hist_miles, hist_winding, hist_pace)
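
# A worked example of the per-trip recording above, with made-up numbers and units as assumed
# in the comment inside computeMonth() (times in seconds, distances in miles): a 2-mile trip
# taking 600 seconds lands in the 600 bin of hist_time (5 s bins) and hist_minutes (60 s bins),
# the 2.0 bin of hist_dist (0.1-mile bins), the 2 bin of hist_miles (1-mile bins), and the
# 300 bin of hist_pace, since its pace is 600 / 2 = 300 seconds per mile.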


# This function merges the tuples of histograms generated by computeMonth()
# The general idea is that each month can be computed in parallel, and the answers can be merged here.
# Arguments:
#   histTupleList - a list of tuples of histograms. Essentially, a list of outputs from many calls of computeMonth()
# Returns:
#   A single histogram tuple
def mergeHistogramTuples(histTupleList):
    # Initialize a tuple of empty histograms - all counts start at 0
    mergedHistList = []
    for hist in histTupleList[0]:  # Mirror the first tuple in the histTupleList
        histCopy = Histogram(hist.name, hist.granularity, hist.lower_bound, hist.upper_bound)
        mergedHistList.append(histCopy)
    mergedHistTuple = tuple(mergedHistList)

    # Iterate through all tuples in the list
    for histTuple in histTupleList:
        # Iterate through all histograms in the tuple
        for i in range(len(histTuple)):
            # Update the counts of the corresponding merged histogram
            # by summing in the values from the smaller histograms
            for val in histTuple[i].counts:
                mergedHistTuple[i].counts[val] += histTuple[i].counts[val]
    return mergedHistTuple
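
# A small illustration of the merge with hypothetical counts: if January's 'lon' histogram holds
# {-73.98: 5, -73.97: 2} and February's holds {-73.98: 3}, the merged 'lon' histogram ends up
# with {-73.98: 8, -73.97: 2}. Bins are summed, and a bin missing from one month contributes
# nothing because counts is a defaultdict(int).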
logMsg("Computing histograms for each month")
#Create a multiprocessing pool
pool = Pool(NUM_PROCESSORS)
#Compute the tuple of feature histograms for each month (slice)
slicedHists = pool.map(computeMonth, monthIterator())
logMsg("Merging histograms across months")
#Merge the months into one single feature histogram tuple
histTuple = mergeHistogramTuples(slicedHists)
#unpack the tuple into individual histograms
(hist_lon, hist_lat, hist_straightline, hist_time, hist_minutes, hist_dist, hist_miles, hist_winding, hist_pace) = histTuple
#Save each histogram to a file
logMsg('Saving...')
try:
os.mkdir("hist_results")
except:
pass
hist_lon.saveToFile('hist_results/lon.csv')
hist_lat.saveToFile('hist_results/lat.csv')
hist_straightline.saveToFile('hist_results/straightline.csv')
hist_time.saveToFile('hist_results/time.csv')
hist_minutes.saveToFile('hist_results/minutes.csv')
hist_dist.saveToFile('hist_results/dist.csv')
hist_miles.saveToFile('hist_results/miles.csv')
hist_winding.saveToFile('hist_results/winding.csv')
hist_pace.saveToFile('hist_results/pace.csv')
logMsg('Done.')