# -*- coding: utf-8 -*-
"""
Created on Thu Jul 3 12:52:31 2014

@author: brian
"""
# Note: this script uses Python 2 syntax (tuple parameter unpacking in computeMonth(),
# csv reader .next()).
import csv
import os
from collections import defaultdict
from multiprocessing import Pool

# Project-local modules; these wildcard imports are expected to provide Trip and logMsg,
# which are used below but not defined in this file.
from grid import *
from regions import *

NUM_PROCESSORS = 8
# Class which represents a discrete histogram of some particular feature.
# The bins are specified upfront, and counts are incremented via the record() method.
class Histogram:
    # A simple constructor for the histogram which specifies the name and the bins
    # Arguments:
    #   name - a descriptive name for this histogram
    #   granularity - the bin size
    #   lower_bound - the lowest possible value for this histogram to track
    #   upper_bound - the highest possible value for this histogram to track
    # Note: the total number of bins will be (upper_bound - lower_bound) / granularity
    def __init__(self, name, granularity, lower_bound=float('-inf'), upper_bound=float('inf')):
        self.name = name
        self.granularity = granularity
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.counts = defaultdict(int)

    # Record a value in the histogram by incrementing the appropriate bin
    # Arguments:
    #   value - the feature value to record
    def record(self, value):
        # Truncate the value to a multiple of the granularity to find the appropriate bin,
        # then clamp it to the configured bounds
        rounded = int(value / self.granularity) * self.granularity
        rounded = max(self.lower_bound, rounded)
        rounded = min(self.upper_bound, rounded)
        # Increment that bin
        self.counts[rounded] += 1
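
    # A worked example of the binning above (hypothetical values): with granularity 5, a value
    # of 612 truncates to the 610 bin (int(612 / 5) * 5 = 610), and with lower_bound=0 a
    # negative value such as -3 is clamped up into the 0 bin.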

    # Saves the histogram to a CSV file - the first column contains values, the second contains frequencies
    # Arguments:
    #   filename - the name of the file to output
    def saveToFile(self, filename):
        w = csv.writer(open(filename, 'w'))
        w.writerow([self.name, 'frequency'])
        # Loop through values in order and output their frequencies
        for val in sorted(self.counts):
            w.writerow([val, self.counts[val]])
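
# A minimal usage sketch of the Histogram class (made-up values, not part of the pipeline below).
# With granularity .01 and bounds [-80, -70] there are (-70 - (-80)) / .01 = 1000 possible bins:
#   h = Histogram('lon', .01, lower_bound=-80, upper_bound=-70)
#   h.record(-73.987)   # lands in the -73.98 bin
#   h.record(-73.984)   # also lands in the -73.98 bin
#   h.record(-120.0)    # clamped to the lower bound, so it lands in the -80 bin
#   h.saveToFile('lon_example.csv')   # writes two columns: lon, frequency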


# A simple iterator which gives the (year, month) tuples in the data range
def monthIterator():
    for year in range(2010, 2014):
        for month in range(1, 13):
            yield (year, month)
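
# For reference, the iterator above yields 48 pairs: (2010, 1), (2010, 2), ..., (2013, 12).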


# Computes feature histograms for a given month of the dataset
# Arguments:
#   (year, month) - see monthIterator()
# Returns:
#   a tuple of Histograms - one for each type of feature
def computeMonth((year, month)):
    # Initialize the histograms with reasonable ranges and granularities
    # (judging by the bin ranges, times appear to be in seconds and distances in miles)
    hist_lon = Histogram('lon', .01, lower_bound=-80, upper_bound=-70)
    hist_lat = Histogram('lat', .01, lower_bound=35, upper_bound=45)
    hist_straightline = Histogram('straightline', .01, lower_bound=0, upper_bound=100)
    hist_time = Histogram('time', 5, lower_bound=0, upper_bound=10 * 3600)
    hist_minutes = Histogram('minutes', 60, lower_bound=0, upper_bound=10 * 3600)
    hist_dist = Histogram('distance', .1, lower_bound=0, upper_bound=100)
    hist_miles = Histogram('miles', 1, lower_bound=0, upper_bound=100)
    hist_winding = Histogram('winding', .01, lower_bound=0, upper_bound=100)
    hist_pace = Histogram('pace', 5, lower_bound=0, upper_bound=10 * 3600)

    # The name of the trip file that will be read
    filename = "../new_chron/FOIL" + str(year) + "/trip_data_" + str(month) + ".csv"
    logMsg("Reading file " + filename)
    r = csv.reader(open(filename, "r"))
    # Read the first line from the input file and use it to initialize the header
    Trip.initHeader(r.next())

    i = 0
    for line in r:
        try:
            # Parse the line into a trip
            trip = Trip(line)

            # Record longitudes for both the start and end of the trip
            hist_lon.record(trip.fromLon)
            hist_lon.record(trip.toLon)
            # Record latitudes for both the start and end of the trip
            hist_lat.record(trip.fromLat)
            hist_lat.record(trip.toLat)
            # Record the straight-line distance
            hist_straightline.record(trip.straight_line_dist)
            # Record the trip time (and the trip time binned to whole minutes)
            hist_time.record(trip.time)
            hist_minutes.record(trip.time)
            # Record the trip distance (and the trip distance binned to whole miles)
            hist_dist.record(trip.dist)
            hist_miles.record(trip.dist)
            # Record the winding factor
            hist_winding.record(trip.winding_factor)
            # Record the pace (time per unit distance) if it is defined
            if trip.dist > 0:
                hist_pace.record(float(trip.time) / trip.dist)
        except ValueError:
            # Parse error - move on to the next line
            pass

        # Periodic progress output
        i += 1
        if i % 1000000 == 0:
            logMsg(filename + " read " + str(i) + " rows")

    # Return the tuple of histograms
    return (hist_lon, hist_lat, hist_straightline, hist_time, hist_minutes, hist_dist, hist_miles, hist_winding, hist_pace)
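
# A worked example of the per-trip recording above, with made-up numbers and units as assumed
# in the comment inside computeMonth() (times in seconds, distances in miles): a 2-mile trip
# taking 600 seconds lands in the 600 bin of hist_time (5 s bins) and hist_minutes (60 s bins),
# the 2.0 bin of hist_dist (0.1-mile bins), the 2 bin of hist_miles (1-mile bins), and the
# 300 bin of hist_pace, since its pace is 600 / 2 = 300 seconds per mile.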


# This function merges the tuples of histograms generated by computeMonth()
# The general idea is that each month can be computed in parallel, and the answers can be merged here.
# Arguments:
#   histTupleList - a list of tuples of histograms. Essentially, a list of outputs from many calls of computeMonth()
# Returns:
#   A single histogram tuple
def mergeHistogramTuples(histTupleList):
    # Initialize a tuple of empty histograms - all counts start at 0
    mergedHistList = []
    for hist in histTupleList[0]:  # Mirror the first tuple in the histTupleList
        histCopy = Histogram(hist.name, hist.granularity, hist.lower_bound, hist.upper_bound)
        mergedHistList.append(histCopy)
    mergedHistTuple = tuple(mergedHistList)

    # Iterate through all tuples in the list
    for histTuple in histTupleList:
        # Iterate through all histograms in the tuple
        for i in range(len(histTuple)):
            # Update the counts of the corresponding merged histogram
            # by summing in the values from the smaller histograms
            for val in histTuple[i].counts:
                mergedHistTuple[i].counts[val] += histTuple[i].counts[val]
    return mergedHistTuple
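
# A small illustration of the merge with hypothetical counts: if January's 'lon' histogram holds
# {-73.98: 5, -73.97: 2} and February's holds {-73.98: 3}, the merged 'lon' histogram ends up
# with {-73.98: 8, -73.97: 2}. Bins are summed, and a bin missing from one month contributes
# nothing because counts is a defaultdict(int).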
logMsg("Computing histograms for each month")
#Create a multiprocessing pool
pool = Pool(NUM_PROCESSORS)
#Compute the tuple of feature histograms for each month (slice)
slicedHists = pool.map(computeMonth, monthIterator())
logMsg("Merging histograms across months")
#Merge the months into one single feature histogram tuple
histTuple = mergeHistogramTuples(slicedHists)
#unpack the tuple into individual histograms
(hist_lon, hist_lat, hist_straightline, hist_time, hist_minutes, hist_dist, hist_miles, hist_winding, hist_pace) = histTuple
#Save each histogram to a file
logMsg('Saving...')
try:
os.mkdir("hist_results")
except:
pass
hist_lon.saveToFile('hist_results/lon.csv')
hist_lat.saveToFile('hist_results/lat.csv')
hist_straightline.saveToFile('hist_results/straightline.csv')
hist_time.saveToFile('hist_results/time.csv')
hist_minutes.saveToFile('hist_results/minutes.csv')
hist_dist.saveToFile('hist_results/dist.csv')
hist_miles.saveToFile('hist_results/miles.csv')
hist_winding.saveToFile('hist_results/winding.csv')
hist_pace.saveToFile('hist_results/pace.csv')
logMsg('Done.')