diff --git a/.gitignore b/.gitignore index 2f1c545..a6d5175 100644 --- a/.gitignore +++ b/.gitignore @@ -1,25 +1,12 @@ -.DS_Store -*.aux -*.bbl -*.blg -*.log -*.fdb_latexmk -*.gz -~$* -*.m~ +*generated* +*/_build/* +build/* +dist/* +demo/* +_private/* *.swp -*.swo *.pyc -tags -.#* -projects/sample_script/*results.txt -code/convert_mat_to_xy.pyc -logfile.txt -projects/compare_sad/*0.png -projects/compare_sad/*0.csv -projects/compare_sad/*0.txt -projects/compare_sar/*.png -projects/compare_sar/*.csv -projects/sample_script/*0.png -projects/sample_script/*0.csv -projects/sample_script/*0.txt +*.DS_Store +*.pdf +*egg-info* + diff --git a/LICENSE.md b/LICENSE.txt similarity index 50% rename from LICENSE.md rename to LICENSE.txt index af4f37c..9b0d6e6 100644 --- a/LICENSE.md +++ b/LICENSE.txt @@ -1,22 +1,22 @@ -Copyright (c) 2013, The Regents of the University of California +Copyright (c) 2012-2014, The Regents of the University of California All rights reserved. -Redistribution and use in source and binary forms, with or without +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -- Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. -- Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. +- Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. +- Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/README.md b/README.md deleted file mode 100644 index 4eed332..0000000 --- a/README.md +++ /dev/null @@ -1,32 +0,0 @@ -macroeco -======== - -## For the most up-to-date and unittested version of `macroeco` please use the develop branch - -Overview --------- - -macroeco is a Python package for pattern-based ecological analysis. The package -was developed at UC Berkeley by Justin Kitzes, Mark Wilber, and Chloe Lewis, -and is maintained by Justin Kitzes. - -There is no separate documentation available yet, although the docstrings for -various classes and functions are relatively complete. Refer also to the -software [ecopattern](http://github.com/jkitzes/ecopattern) for examples of the -package in use. - -Installation ------------- - -Simply clone the macroeco directory to a location on your PYTHONPATH, including -as a subdirectory of your current work folder. A `setup.py` file is coming in a -future release. - -Credits -------- -- Authors: Justin Kitzes, Mark Wilber, Chloe Lewis -- Copyright: Copyright 2012, Regents of the University of California -- License: BSD 2-clause -- Maintainer: Justin Kitzes -- Email: jkitzes@berkeley.edu -- Status: Development diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..333c18c --- /dev/null +++ b/README.rst @@ -0,0 +1,8 @@ +Macroeco: Ecological pattern analysis in Python +----------------------------------------------- + +Macroeco is a Python package that provides a comprehensive set of functions for analyzing empirical patterns in ecological data, predicting patterns from theory, and comparing empirical results to theory. Many major macroecological patterns can be analyzed using this package, including the species abundance distribution, the species and endemics area relationships, several measures of beta diversity, and many others. + +Extensive documentation for macroeco, including detailed installation instructions, tutorials, and a reference guide, is available at http://macroeco.org. The most recent stable version of the macroeco package can be installed from PyPI (``pip install macroeco``). For users who do not program in Python, a standalone application called Macroeco Desktop, which provides most of the functionality of macroeco through a simple interface that requires no programming, is also available. + +The current version of macroeco is developed and maintained by Justin Kitzes (UC Berkeley) and Mark Wilber (UC Santa Barbara). Other contributors include Chloe Lewis and Ethan White. The development of macroeco has been supported by the National Science Foundation, the Gordon and Betty Moore Foundation, and the Berkeley Institute for Data Science. diff --git a/__init__.py b/__init__.py deleted file mode 100644 index 5f9b4d5..0000000 --- a/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -__author__ = "Justin Kitzes, Mark Wilber, Chloe Lewis" -__copyright__ = "Copyright 2012, Regents of University of California" -__credits__ = [] -__license__ = "BSD 2-clause" -__version__ = "0.2" -__maintainer__ = "Justin Kitzes" -__email__ = "jkitzes@berkeley.edu" -__status__ = "Development" - -import compare -import data -import empirical -import output -import utils.workflow as workflow diff --git a/compare.py b/compare.py deleted file mode 100644 index d4bd508..0000000 --- a/compare.py +++ /dev/null @@ -1,1569 +0,0 @@ -#!/usr/bin/python - -''' This module contains classes and functions for comparing empirical and -predicted macroecological metrics. 
- -Classes ------- -CompareDistribution : Base class for CompareSAD, CompareSSAD, CompareIED, -CompareSED, CompareASED - -CompareSAD : Compares predicted species abundance distributions (SAD) with -empirical SADs - -CompareSSAD : Compares predicted species-level spatial abundance distributions -(SSAD) with empirical SSADs - -CompareSAR : Compares predicted species-area relationship (SAR) curves with -empirical SAR curves - -CompareIED : Compares predicted individual energy distributions (IED) with -empirical IEDs - -CompareSED : Compares predicted species-level energy distributions (SED) with -empirical SEDs - -CompareASED : Compares predicted average species-level energy distributions -(ASED) with empirical ASEDs. - -Functions --------- --`empirical_cdf` -- Empirical cdf for given data --`aic` -- Calculate AIC value --`aicc` -- Calculate corrected AIC value --`aic_weights` -- Calculate AIC weights for models --`ks_two_sample_test` -- Kolmogorov-Smirnov two sample test --`likelihood_ratio` -- Calculates the likelihood ratio for nested models --`variance` -- Calculates the variance for given datasets --`skew` -- Calculates the skew for given datasets --`kurtosis` -- Calculates the kurtosis for given data sets --`bootstrap` -- Get bootstrapped samples from a dataset -- `bootstrap_moment` -- Gives a bootstrap confidence interval for a comparison of - first three moments of two distributions --`mean_squared_error` -- Calculates the MSE between observed and predicted data sets - - -''' - -from __future__ import division -import numpy as np -import scipy.stats as stats -from distributions import * -import copy -import random -import time -import logging - - -class CompareDistribution(object): - ''' - Comparison object compares a list of data to any number of distributions - - ''' - - #TODO: Error Checking - def __init__(self, data_list, dist_list, observed_index): - ''' - Parameters - ---------- - data_list : list of iterables or list of tuples of iterables - data_list is any list of iterables or list of tuples of iterables - that will be passed to the fit functions of the distribution - objects in dist_list. data_list will be passed to fit functions for - each distribution. data_list undergoes no validation in __init__ - dist_list : list - List of distribution objects or strings that have the same name as - a distribution object. If they are strings, they will be evaled - observed_index : int - The index of the desired observed metric in the tuples within - data_list. If 0, data_list can be a list of data - rather than a list of tuples of data. The index specified by - observed_index will be considered the observed data. - - Notes - ----- - All distribution objects are fit in the __init__ method. 
- - ''' - - self.dist_list = make_dist_list(dist_list) - - # Fit the distributions objects - [dist.fit(data_list) for dist in self.dist_list] - - # Set the observed data - if observed_index == 0 and np.all([type(dt) != type((1,)) for dt in - data_list]): - self.observed_data = [np.array(dt) for dt in data_list] - elif np.all([type(dt) == type((1,)) for dt in data_list]): - self.observed_data = [np.array(dt[observed_index]) for dt in - data_list] - else: - self.observed_data = [np.array(dt) for dt in data_list] - - # Set this in __init__ so other methods can check if compare_rads() has - # been called - self.rads = None - self.cdfs = None - - # If attributes have not been instantiated, set to None - try: - self.sad_spp_list - except: - self.sad_spp_list = None - try: - self.criteria - except: - self.criteria = None - - def compare_mse(self, mse_base='cdf'): - ''' - This function compares the mean squared error (mse) for each distribution - against the observed data, self.observed_data. Perfect predicted data - would yield a mse of 0. The lower the mse the better the predicted - values fit the data. If mse_base='cdf' the mse is calculated from the - cdf. If mse_base='rad', the mse is calculated from the rank_abundance - distribution. - - Parameters - ----------- - mse_base : str - Either 'cdf' or 'rad'. If 'cdf' the mse values are computed - from the cumulative density function. It 'rad' the mse values are - computed from the rank abundance distribution. Default is 'cdf' - - Returns - ------- - : dict - A dictionary of length self.dist_list with keywords being the - distribution names. Each keyword looks up a list of length - self.observed_data in which are the mse values comparing that - distribution's predicted values (cdf or rad) to the corresponding - observed values. - - Notes - ----- - Calculating the mse from the cdf is the least bias approximater - - ''' - if mse_base == 'cdf': - if self.cdfs == None: - vals = self.compare_cdfs() - else: - vals = self.cdfs - elif mse_base == 'rad': - if self.rads == None: - vals = self.compare_rads() - else: - vals = self.rads - else: - raise NameError('%s value for mse_base not recognized' % mse_base) - - - mse = {} - for kw in vals.iterkeys(): - if kw != 'observed': - if not np.all([len(j) == 0 for j in vals[kw]]): - mse[kw] = [mean_squared_error(vals['observed'][i], - vals[kw][i]) for i in xrange(len(vals[kw]))] - else: - logging.warning('MSE values for %s set to NaN' % kw) - mse[kw] = [np.NaN for i in xrange(len(self.observed_data))] - return mse - - - def compare_aic(self, crt=False): - ''' - Get the aic or aicc values for every data set and for every - distribution - - Parameters - ---------- - crt : bool - If True, calculates the corrected AIC for the given data. If False, - calculates AIC. - - Returns - ------- - : list - A list of arrays. The list has length = to number of data sets in - self.observed_data. Each array within list has the length of - self.dist_list. The first element of the array corresponds to the - first distribution in dist_list, the second corresponds to the - second distribution, etc. - - ''' - aic_vals = [] - for dist in self.dist_list: - - try: - nlls = nll(dist.pmf(self.observed_data)) - except NotImplementedError: - try: - nlls = nll(dist.pdf(self.observed_data)) - except NotImplementedError: - logging.warning('%s has neither a PMF nor a PDF. 
AIC set' - % get_name(dist) + ' to infinity') - nlls = np.repeat(np.inf, len(self.observed_data)) - - #NOTE: dist.par_num is the number of parameters of distribution - k = np.repeat(dist.par_num, len(nlls)) - if crt: - obs = np.array([len(data) for data in self.observed_data]) - aic_vals.append(aicc(nlls, k, obs)) - else: - aic_vals.append(aic(nlls, k)) - return list(np.array(aic_vals).T) - - def compare_aic_measures(self, crt=False): - ''' - Compare AIC weights, delta_AIC, and AIC values across the different - models. Output is a three item tuple where each item is a list of - arrays with each array having length equal to the number of models - proposed and the length of the list is the length of self.observed_data. - See Returns for tuple description. - - Parameters - ---------- - crt : bool - If True, calculates the corrected AIC weights for the given data. - If False, calculates AIC weights. - - Returns - ------- - : tuple - The first element is a list of arrays with each array having length - equal to the number of models proposed and the length of the list - is the length of self.observed_data. The first element contains - the AIC weights. The second element is the delta AIC values in the - same format as the first tuple object. The third object are the AIC - values in the same format as the output of the compare_aic method. - - Notes - ----- - The given AIC values in each array correspond to the distributions in - self.dist_list. - - ''' - aic_vals = self.compare_aic(crt=crt) - aic_wghts = []; delta_aic = [] - for mods_aic in aic_vals: - taic_wghts, tdelta_aic = aic_weights(mods_aic) - aic_wghts.append(taic_wghts) - delta_aic.append(tdelta_aic) - return aic_wghts, delta_aic, aic_vals - - def compare_rads(self): - ''' - Compares rank abundance distributions for all data in data_list and to - the given distributions - - Returns - ------- - : dict - Has len(self.dist_list) + 1. All the distribution class names - passed to the constructor are key words as well as 'observed' which - references the observed data, self.observed_data. Each keyword looks up - a list of arrays. Each list is len(self.observed_data) long and - contains the predicted rads for the empirical data sets for the - given distribution. - - Note - ---- - If self.rads has already been set in another method (i.e. is not None). - This method will not overwrite it. To reset self.rads, set self.rads - = None and then run self.compare_rads(). - - ''' - if self.rads == None: - rads_dict = {} - rads_dict['observed'] = copy.deepcopy(self.observed_data) - for i, dist in enumerate(self.dist_list): - #Different Identifier? - rads_dict[get_name(dist)] = dist.rad() - - self.rads = rads_dict - return self.rads - - def compare_cdfs(self): - ''' - Compares cdfs for all data in data_lists and to the empirical cdfs - - Returns - ------- - :dict - Has len(self.dist_list) + 1. All the distribution class names - passed to the constructor are key words as well 'observed' which - references the observed data, self.observed_data. Each keyword looks up - a list of arrays. Each list is len(self.observed_data) long and - contains the predicted cdfs for the empirical data sets for the - given distribution. 
- - - ''' - if self.cdfs == None: - - cdfs_dict = {} - cdfs_dict['observed'] = [empirical_cdf(data) for data in - self.observed_data] - for i, dist in enumerate(self.dist_list): - try: - cdfs_dict[get_name(dist)] = dist.cdf(self.observed_data) - except NotImplementedError: - logging.warning('CDF method not implemented for %s' % - get_name(dist)) - cdfs_dict[get_name(dist)] = [np.array([]) for i in - xrange(len(self.observed_data))] - - self.cdfs = cdfs_dict - return self.cdfs - - - def compare_LRT(self, null_mdl): - ''' - Performs a likelihood ratio test (LRT) on the distributions with in - self.dist_list with the parameter nll_mdl as the null model. While this - function will generate output on non-nested models, the models must be - nested for the output to be meaningful. - - Parameters - ---------- - null_mdl : distribution object - The null distribution object to use in the LRT. - - Returns - ------- - : dict - A dictionary with keywords 'null_model, alternative model.' Each - keyword references a list of length len(self.observed_data) which - contains tuples that contain the output of the function - likelihood_ratio (chisquared, p-value). The LRT is performed on - each data set in self.observed_data for each given model pair. - - ''' - LRT_list = {} - null_mdl.fit(self.observed_data) - - try: - null_nlls = nll(null_mdl.pmf(self.observed_data)) - except: - null_nlls = nll(null_mdl.pdf(self.observed_data)) - for i, dist in enumerate(self.dist_list): - - try: - alt_nlls = nll(dist.pmf(self.observed_data)) - except: - alt_nlls = nll(dist.pdf(self.observed_data)) - - k = dist.par_num - null_mdl.par_num - df = np.repeat(k, len(alt_nlls)) - lrt = likelihood_ratio(null_nlls, alt_nlls, df) - comp_kw = get_name(null_mdl) + ", " + get_name(dist) - LRT_list[comp_kw] = lrt - return LRT_list - - def compare_rarity(self, mins_list): - ''' - This method takes in the output from self.compare_rads and a list of - minimum values against which to compare the observed and predicted - rads. and outputs a dictionary with length self.dist_list + 1 (all - distributions + observed). Each keyword in this dict looks up a dict - of len(mins_list) where the keywords are the values against which the - rads will be <=. Each one of these sub-dictionaries looks up a list - with len(self.observed_data). - - Parameters - ---------- - mins_list : array-like object - A list of numbers. Each number number will be used in the - following function: rad <= mins_list[i]. - - Returns - ------- - : dict - Returns a dictionary with length self.dist_list + 1 (all - distributions + observed). Each keyword in this dict looks up a dict - of len(mins_list) where the keywords are the values against which the - rads will be <=. Each one of these sub-dictionaries looks up a list - with len(self.observed_data). - - - ''' - - # Don't remake rads if they have already been made - if self.rads == None: - rads = self.compare_rads() - else: - rads = self.rads - - mins_list = make_array(mins_list) - - rarity = {} - keys = list(rads.viewkeys()) - for kw in keys: - rarity[kw] = {} - for mins in mins_list: - rarity[kw][mins] = [sum(data <= mins) for data in rads[kw]] - return rarity - - def compare_moments(self): - ''' - Compare the higher order moments (variance, skew, kurtosis) for the - given distributions and observed data. - - Returns - ------- - : dict - A dictionary with keywords variance, skew, and kurtosis. Each - keyword looks up a dictionary len(dist_list) + 1 keywords. The - keywords are 'observed' and the distribution object names. 
Each of - these keywords looks up a list of floats with the same length as - data_list. - - ''' - - if self.rads == None: - rads = self.compare_rads() - else: - rads = self.rads - - var = {} - skw = {} - kurt = {} - - for kw in rads.iterkeys(): - var[kw] = variance(rads[kw]) - skw[kw] = skew(rads[kw]) - kurt[kw] = kurtosis(rads[kw]) - moments = {} - moments['variance'] = var - moments['skew'] = skw - moments['kurtosis'] = kurt - - return moments - - def summary(self, mins_list=[10], crt=False): - ''' - Summarizes the given datasets and the predicted rads. Looks at - total balls sampled ('balls'), number of urns ('urns'), the max balls - in a given urn ('max'), number of urns with less than MIN balls ('tot - <= MIN'), and the fit of the distributions in self.dist_list to the - data in self.observed_data - - 'balls' is the sum of the observed data. For a Species Abundance - Distribution 'balls' would represent individuals. For an Individual - Energy Distribution 'balls' would represent energy. - - 'urns' is the length of the observed data. For a Species Abundance - Distribution 'urns' would represent species and for a Individual Energy - Distribution 'urns' would represent individuals. - - Parameters - ---------- - mins_list : list - Bins with balls less than or equal to 10 - crt : bool - If True, corrected AIC, if False, not - - Returns - ------- - : dict - Dictionary of dictionaries of length self.dist_list + 1. Each - sub-dictionary other than 'observed' contains the keywords balls, - urns, max, tot_min, aic, aic_d, aic_w, and par_num. Each of these - keywords contains a list that is the same length as the number of - sads under consideration. - - - urns = total number of items in self.observed_data. Could be - species (SAD, ASED), cells (SSAD), or individuals (IED, SED) - balls = Items that are placed in urns. Could be individuals (SAD, - SSAD), energy (ASED, IED, SED). - max = Maximum number of balls in an urn - tot_min = Total number of urns with with <= a given number of balls - aic = AIC - aic_d = Delta AIC - aic_w = AIC weights - par_num = Parameter number of the given distribution - tot_min = total counts less than or equal numbers in min_list - vars = Additional variables computed for the given distribution - - - ''' - summary = {} - - # Check that rads is already set, if not set it - if self.rads == None: - rads = self.compare_rads() - if type(rads) == type((1,)): - rads = rads[0] - else: - rads = self.rads - - rarity = self.compare_rarity(mins_list=mins_list) - for kw in rads.iterkeys(): - summary[kw] = {} - summary[kw]['balls'] = [np.sum(data) for data in rads[kw]] - summary[kw]['urns'] = [len(data) for data in rads[kw]] - summary[kw]['max'] = [np.max(data) for data in rads[kw]] - summary[kw]['tot_min'] = rarity[kw] - - aic_vals = self.compare_aic_measures(crt=crt) - names = [get_name(dist) for dist in self.dist_list] - for i, nm in enumerate(names): - summary[nm]['aic'] = list(np.array(aic_vals[2]).T)[i] - summary[nm]['aic_d'] = list(np.array(aic_vals[1]).T)[i] - summary[nm]['aic_w'] = list(np.array(aic_vals[0]).T)[i] - summary[nm]['par_num'] = np.repeat(self.dist_list[i].par_num, - len(list(np.array(aic_vals[2]).T)[i])) - summary[nm]['vars'] = self.dist_list[i].var - - return summary - -class CompareSAD(CompareDistribution): - ''' - Object inherits CompareDistribution and uses it to compare species - abundance distributions (SAD) - - Attributes - ---------- - self.observed_data : A list of arrays - Each array in this list is an SAD. 
Each of these SADs will be compared - to the distributions in self.dist_list - self.dist_list : a list of distribution objects - Each object is a distribution object to which the SADs in - self.observed_data will be compared. - self.criteria : a list of dictionaries or None - If not None, each dictionary specifies the divisions made on the plot - that generated each SAD in self.observed_data. self.criteria should be - the same length as self.observed_data - self.sad_spp_list : list of arrays or None - If not None, each array contains the species strings for the - corresponding SAD in self.observed_data. The length of - self.sad_spp_list should be the same length as self.observed_data and - the length of any array within self.sad_spp_list should be the same - length the corresponding array in self.observed_data. The index of any - species name within any array within self.sad_spp_list references the - species count with the same index in self.observed_data. - - ''' - - def __init__(self, data_list, dist_list, patch=False): - ''' - Parameters - ---------- - data_list : list of iterables or output from Patch.sad - List of np.arrays containing data - dist_list : list - List of distribution objects or strings that have the same name as - a distribution object. If they are strings, they will be evaled - patch : bool - If True, expects the output from the Patch.sad method and if False, - expects a list of iterables. Presumably, each iterable is an SAD. - - Notes - ----- - If data_list is a list of tuples containing iterables, the 1st entry - (0th element) in each tuple is considered the observed SADs - ''' - if patch == True: - self.criteria, sad_data, self.sad_spp_list = unpack(data_list) - super(CompareSAD, self).__init__(sad_data, dist_list, 0) - else: - super(CompareSAD, self).__init__(data_list, dist_list, 0) - -class CompareSSAD(CompareDistribution): - ''' - Object inherits CompareDistribution and uses it to compare species-level - spatial abundance distributions (SSAD) - - Attributes - ---------- - self.observed_data : A list of arrays - Each array in this list is an SSAD. Each of these SSADs will be - compared to the distributions in dist_list - self.dist_list : a list of distribution objects - Each object is a distribution object to which the SSADs in - self.observed_data will be compared. - self.criteria : a list of dictionaries or None - If not None, each dictionary specifies the divisions made on the plot - that generated each SAD in self.observed_data. self.criteria should be - the same length as self.observed_data - self.sad_spp_list : List of strings or None - If not None, self.sad_spp_list is a list of strings where each string - refers to a species. The length of self.sad_spp_list should be the same - length as self.observed_data. Each species string has the same index - within the list as its corresponding SSAD in self.observed_data. - - ''' - - def __init__(self, data_list, dist_list, patch=False): - ''' - Parameters - ---------- - data_list : list of iterables or output from Patch.ssad - List of np.arrays containing data - dist_list : list - List of distribution objects or strings that have the same name as - a distribution object. If they are strings, they will be evaled - patch : bool - If True, expects the output from the Patch.sad method and if False, - expects a list of iterables. Presumably, each iterable is an SSAD. 
- - - Notes - ----- - If data_list is a list of tuples containing iterables, the 1st entry - (0th element) in each tuple is considered the observed SSADs - ''' - if patch == True: - - self.sad_spp_list = list(data_list[1].viewkeys()) - ssad_data = [np.array(data_list[1][nm]) for nm in - self.sad_spp_list] - self.criteria = data_list[0] - - super(CompareSSAD, self).__init__(ssad_data, dist_list, 0) - else: - super(CompareSSAD, self).__init__(data_list, dist_list, 0) - - - -class CompareIED(CompareDistribution): - ''' - Class compares predicted individual energy distributions (IED) for the - entire community to observed IEDs - - Attributes - ---------- - self.observed_data : list of arrays - Observed individual energy distributions (IED) - self.ied_spp_lists : list of arrays - Each array contains species strings which pair to the values - contained in the corresponding array in self.ied_list. The length of - self.ied_spp_lists should be the same length as self.ied_list. - self.sad_spp_list : list of arrays - If not None, each array contains the species strings for the - corresponding SAD in self.sad_list. The length of self.sad_spp_list - should be the same length as self.sad_list and the length of any array - within self.sad_spp_list should be the same length the corresponding - array in self.sad_list. The index of any species name within any array - within self.sad_spp_list references the species count with the same - index in self.sad_list. - self.criteria : a list of dictionaries or None - If not None, each dictionary specifies the divisions made on the plot - that generated each SAD and IED in self.sad_list and self.ied_list. - self.criteria should be the same length as self.sad_list and - self.ied_list. - self.dist_list : a list of distribution objects - Each object is a distribution to which the IEDs in self.ied_list will - be compared. - - ''' - - def __init__(self, data_list, dist_list, patch=False): - ''' - Parameters - ---------- - data_list : list of tuples or output from Patch object - A list containing tuples of length two. The first object in a - tuple an iterable containing the community individual energy - distribution. The second object in a tuple is an iterable - containing the empirical species abundance distribution. - See patch argument for more information. - dist_list : list of strings or objects - Each string corresponds to a name of a psi distribution to which to - compare to the observed data. - patch: bool - If True, expects a tuple of length 2 with the first object being - the output from Patch.ied and the second element being the - output from Patch.sad. If False expects what argument data_list - describes. sads and energy should be made with the same criteria. - - Notes - ----- - The __init__ method always removes zeros from the SADs - - If data_list is a list of tuples containing iterables, the 1st entry - (0th element) in each tuple is considered the observed IEDs - ''' - - if patch == True: - # Unpack sad. 
Store spp_lists in items - sad_criteria, sad_list, self.sad_spp_list = \ - unpack(data_list[1]) - - # Unpack ied - ied_criteria, ied_list, self.ied_spp_lists = \ - unpack(data_list[0]) - self.criteria = sad_criteria - - super(CompareIED, self).__init__(zip(ied_list, sad_list), - dist_list, 0) - - else: - super(CompareIED, self).__init__(data_list, dist_list, 0) - self.ied_spp_lists = None - - - -class CompareSED(CompareDistribution): - ''' - Class compares predicted species-level energy distribution(s) with the - observed species-level energy distribution(s) - - Attributes - ---------- - self.observed_data : list of iterables - Observed species energy distributions (SED) - self.criteria : a list of dictionaries or None - If not None, each dictionary specifies the divisions made on the plot - that generated each SED, IED, and SAD and IED in self.sed_list, - self.ied_list, and self.sad_list. All self.criteria should have the - same length. - self.dist_list : a list of distribution objects - Each object is a distribution to which the IEDs in self.ied_list will - be compared. - self.sad_spp_list : list of strings or None - If not None, each string in self.spp_names is a species ID which - corresponds to an array in self.sed_list. - - ''' - - def __init__(self, data_list, dist_list, patch=False): - ''' - Parameters - ---------- - data_list : list of tuples or output from Patch object - A list of tuple where each tuple has length 3. The first object in - a tuple is an iterable containing the empirical species energy - distribution. The second object is a tuple is a community - individual energy distribution. The third object in a tuple is an - empirical species abundance distribution. - dist_list : list of strings or objects - Each string corresponds to a name of a psi distribution to which to - compare to the observed data. - patch : bool - If True, expects a tuple of length 3 with the first object being - the complete output from Patch.sed, the second object being the - output from Patch.ied and the third element being the output from - Patch.sad. If False expects what argument data_list describes. - Empirical sads and energy distributions should be made with the - same criteria (See Patch class for criteria explanation). - - Notes - ----- - If data_list is a list of tuples containing iterables, the 1st entry - (0th element) in each tuple is considered the observed SEDs. - ''' - - if patch: - # TODO: Check length of input objects! - - if not ((len(data_list[0]) == len(data_list[1])) and\ - (len(data_list[1]) == len(data_list[2]))): - raise IndexError('SED, IED, and SAD patch returns' +\ - ' must have the same length. 
Use the same criteria for' +\ - ' each.') - - #Sort species energy - sed_criteria = [] - sed_list = [] - spp_names = [] - for obj in data_list[0]: - spp = list(obj[1].viewkeys()); spp.sort() - spp_names.append(spp) - for kw in spp_names[-1]: - sed_list.append(obj[1][kw]) - sed_criteria.append(obj[0]) - - #Sort community energy - ied_criteria = [] - ied_list = [] - for i, obj in enumerate(data_list[1]): - - # For consistency I am copying the ied data for each species - num = len(spp_names[i]) - tcri = [obj[0] for i in xrange(num)] - ied_criteria += tcri - teng = [obj[1] for i in xrange(num)] - ied_list += teng - - #Sort sad - sad_criteria = [] - sad_list = [] - for i, obj in enumerate(data_list[2]): - - # Copy sad data for each species - num = len(spp_names[i]) - tcri = [obj[0] for i in xrange(num)] - sad_criteria += tcri - tsad = [obj[1] for i in xrange(num)] - sad_list += tsad - - self.sad_spp_list = [] - for i in xrange(len(spp_names)): - self.sad_spp_list += spp_names[i] - self.criteria = sad_criteria - - super(CompareSED, self).__init__(zip(sed_list, ied_list, sad_list), - dist_list, 0) - - else: - - super(CompareSED, self).__init__(data_list, dist_list, 0) - - def compare_rads(self, return_spp=False): - ''' - Comparison of species level energy distributions rank abundance - distributions. - - Parameters - ---------- - return_spp : bool - If True, the returns a tuple with a species list as the second - element. - - Returns - ------- - : dict - Has len(self.dist_list) + 1. All the distribution class names - passed to the constructor are key words as well as 'observed' which - references the observed data. Each keyword looks up - a list of arrays. Each list is len(self.ied_list) long and - contains the predicted reds for the empirical data sets for the - given distribution. - : list or None - Returns self.sad_spp_list which could be a list of lists or None. - These names are the species names that correspond numerically with - the arrays in within each distribution. Only returned if - return_spp=True. - - ''' - if return_spp: - return super(CompareSED, self).compare_rads(), self.sad_spp_list - else: - return super(CompareSED, self).compare_rads() - - - def compare_cdfs(self, return_spp=False): - ''' - Comparison of species level energy distributions cdfs - - Parameters - ---------- - return_spp : bool - If True, the returns a tuple with a species list as the second - element. - - Returns - ------- - : dict - Has len(self.dist_list) + 1. All the distribution class names - passed to the constructor are key words as well as 'observed' which - references the observed data. Each keyword looks up - a list of arrays. Each list is len(self.ied_list) long and - contains the predicted reds for the empirical data sets for the - given distribution. - : list or None - Returns self.sad_spp_list which could be a list of lists or None. - These names are the species names that correspond numerically with - the arrays within each distribution. Only returned if - return_spp=True. - - ''' - if return_spp: - return super(CompareSED, self).compare_cdfs(), self.sad_spp_list - else: - return super(CompareSED, self).compare_cdfs() - -class CompareASED(CompareDistribution): - ''' - Compares theoretical and observed ased's - - Attributes - ---------- - self.observed_data : list of arrays - Observed average species energy distributions (ASED) - self.sad_spp_list : list of arrays - If not None, each array contains the species strings for the - corresponding SAD in self.sad_list. 
The length of self.sad_spp_list - should be the same length as self.sad_list and the length of any array - within self.sad_spp_list should be the same length the corresponding array - in self.sad_list. The index of any species name within any array - within self.sad_spp_list references the species count with the same - index in self.sad_list. - self.criteria : a list of dictionaries or None - If not None, each dictionary specifies the divisions made on the plot - that generated each SAD and IED in self.sad_list and self.ied_list. - self.criteria should be the same length as self.sad_list and - self.ied_list. - self.dist_list : a list of distribution objects - Each object is a distribution to which the IEDs in self.ied_list will - be compared. - - ''' - - def __init__(self, data_list, dist_list, patch=False): - ''' - Parameters - ---------- - data_list : list of tuples or output from Patch object - A list containing tuples of length three. The first object in the - tuple is an iterable containing the average energy distribution. - The second object in a tuple an iterable containing the community - individual energy distribution. The third object in a tuple is an - iterable containing the empirical species abundance - distribution.See patch argument in this method for information - about Patch object output. - dist_list : list of strings or objects - Each string corresponds to a name of a ased distribution to which to - compare to the observed data. - patch : bool - If True, expects a tuple of length 3 with the first object being - the complete output from Patch.ased, the second object being - the output from Patch.ied and the third element being the - output from Patch.sad. If False expects what argument data_list - describes. Empirical sads and energy distributions should be made - with the same criteria. - - Notes - ----- - If data_list is a list of tuples containing iterables, the 1st entry - (0th element) in each tuple is considered the observed ASEDs. - ''' - - if patch: - - # Unpack sad. Store spp_lists in items - sad_criteria, sad_list, sad_spp_lists = \ - unpack(data_list[2]) - - # Unpack ased - ased_criteria, ased_list, ased_species = \ - unpack(data_list[0]) - - # Unpack ied - ied_criteria, ied_list, ied_spp = unpack(data_list[1]) - - self.criteria = sad_criteria - self.sad_spp_list = ased_species - - super(CompareASED, self).__init__(zip(ased_list, ied_list, - sad_list), dist_list, 0) - - - else: - super(CompareASED, self).__init__(data_list, dist_list, 0) - -class CompareSAR(object): - ''' - Object allows comparison between species-area relationships - - Attributes - ---------- - self.sar_list : list of arrays - A list of arrays in which each array is the number of species a - given areas. The areas are specified in self.a_list and correspond - exactly self.sar_list. - self.a_list : list of arrays - A list of arrays in which each array is the area (or area fraction) at - which the number of species specified in self.sar_list are found. - Indices correspond exactly with self.sar_list. - self.full_sad : list of arrays - A list of species abundance distributions (SAD) computed at the anchor - scale for each given SAR. The length of self.full_sad should equal the - length of self.sar_list and self.a_list. - self.curve_list : list of objects - A list of SAR curve objects to which the empirical SARs in - self.sar_list will be compared. 
- - ''' - - def __init__(self, sar_list, curve_list, full_sad, max_a=True, - patch=False): - ''' - Parameters - ---------- - sar_list : list of tuples or list of outputs from Patch().sar - A list of tuples where each tuple contains two array-like objects - of the same length. The first element in the tuple is the - area list and the second element is the species count for the sar. - The maximum area in the area list should be the anchor area from - which the full_sad was generated. If patch=True, accepts the - output from Patch.sar - curve_list : list - A list of SARCurve objects or list of SARCurve object names (str) - full_sad : list of array-like objects - List of complete sads. Each sad corresponds to an element in - sar_list. - max_a : bool - If max_a is True, compare sets all areas to fractions in area_list. - patch : bool - If True, sar_list should be a list of outputs from Patch().sar - ''' - - assert len(sar_list) == len(full_sad), "sar_list and full_sad must " \ - + " be the same length" - self.sar_list = [] - self.a_list = [] - if patch: - for sar_obj in sar_list: - unzipped_sar = unpack(sar_obj[0]) - self.sar_list.append(np.array(unzipped_sar[0])) - self.a_list.append(np.array(unzipped_sar[1])) - else: - unzipped_sar = unpack(sar_list) - self.a_list = [np.array(areas) for areas in unzipped_sar[0]] - self.sar_list = [np.array(sar) for sar in unzipped_sar[1]] - - # Set to area fractions if max_a is true - if max_a: - self.a_list = [ars / np.max(ars) for ars in self.a_list] - - self.full_sad = [np.array(sad) for sad in full_sad] - - self.curve_list = make_dist_list(curve_list) - - - def compare_curves(self, iter_vals=False, use_rad=False, form='sar'): - ''' - Method generates predicted SAR curves from the given observed data and - curve objects for comparison - - Parameters - ---------- - use_rad : bool - If False, uses the sad pmf to calculate the SAR. If True, uses the - sad rank abundance distribution to calculate the SAR. - iter_val : bool - If True, uses the iterative method to calculate SAR. If False uses - the one shot method. - form : string - Default value is 'sar' which calculates the SAR given the - parameters. You can also use 'ear' which calculates the EAR with - the given parameters. - - Returns - ------- - : list of dicts - The list is the same length self.sar_list and each dictionary is - the length of self.curve_list + 1. Each keyword in a dictionary - references either the observed SAR ('observed') or the SAR generate by - one of the curve objects. - - Notes - ----- - If possible, the SARs are computed using an iterative method. - Otherwise, they are calculated with a one-shot method. 
- ''' - pred_sar = [] - for sar, a, sad in zip(self.sar_list, self.a_list, self.full_sad): - psar = {} - psar['observed'] = np.array(zip(sar, a), dtype=[('items', np.float), - ('area', np.float)]) - for cur in self.curve_list: - cur.fit(sad, (a, sar)) - - if iter_vals: - try: - psar[cur.get_name()] = cur.iter_vals(a, - use_rad=use_rad, form=form) - except AttributeError: - psar[cur.get_name()] = cur.iter_vals(a, use_rad=True, - form=form) - else: - try: - psar[cur.get_name()] = cur.vals(a, use_rad=use_rad, - form=form) - except AttributeError: - psar[cur.get_name()] = cur.vals(a, use_rad=True, - form=form) - - for kw in psar.iterkeys(): - psar[kw].sort(order='area') - pred_sar.append(psar) - return pred_sar - -def nll(pdist): - ''' - Parameters - ---------- - pdist : list of arrays - List of pmf values on which to compute the negative log-likelihood - - Returns - ------- - :list - List of nll values - - ''' - return [-sum(np.log(dist)) for dist in pdist] - - - -def empirical_cdf(emp_data): - ''' - Generates an empirical cdf from empirical data - - Parameters - ---------- - emp_data : array-like object - Empirical data - - Returns - -------- - :ndarray - An empirical cdf - ''' - - emp_data = cnvrt_to_arrays(emp_data)[0] - unq_vals = np.unique(emp_data) - leng = len(emp_data) - cdf = np.empty(len(emp_data)) - count = 0 - for i in unq_vals: - loc = np.where((i == emp_data))[0] - count += len(loc) - cdf[loc] = count / leng - return cdf - -def aic(neg_L, k, loglik=True): - ''' - Calculates the AIC of a given model - - Parameters - ---------- - neg_L : array-like object - The negative log likelihood of the models or a list of pdfs/pmfs, - depending on nll - k : array-like object - The number of parameters of the model - loglik : bool - If True, assumes neg_L is an array-like object of negative log - likelihood. If False, assumes neg_L is a list of pdfs/pmfs. - - Returns - ------- - : float - AIC for a given model - ''' - if loglik: - neg_L, k = cnvrt_to_arrays(neg_L, k) - else: - neg_L = nll(neg_L) - neg_L, k = cnvrt_to_arrays(neg_L, k) - - assert len(k) == len(neg_L), "neg_L and k must have the same length" - aic = (2 * neg_L) + (2 * k) - return aic - -def aicc(neg_L, k, n=None, loglik=True): - ''' - Calculates the corrected AIC of a given model - - Parameters - ---------- - neg_L : array-like object - The negative log likelihood of models or list of pdfs/pmfs - k : array-like object - The number of parameters of models - n : array-like object - Number of observations for each model. Can be left as None if neg_L is - list of pdfs/pmfs and loglik = True - loglik : bool - If True, assumes neg_L is a array-like object of negative log - likelihood. If False, assumes neg_L is a list of pdfs/pmfs. 
- - Returns - ------- - : np.array - AICc for a given models - - ''' - if loglik: - assert n != None, 'n argument must be given if loglik is True' - neg_L, k, n = cnvrt_to_arrays(neg_L, k, n) - else: - n = np.array([len(tneg_L) for tneg_L in neg_L]) - neg_L = nll(neg_L) - neg_L, k = cnvrt_to_arrays(neg_L, k) - - assert len(neg_L) == len(k) and len(neg_L) == len(n) and len(k) == len(n),\ - "neg_L, k, and n must all have the same length" - aic_value = aic(neg_L, k) - return aic_value + ((2 * k * (k + 1)) / (n - k - 1)) - -def aic_weights(aic_values): - ''' - Calculates the aic_weights for a given set of models - - Parameters - ---------- - aic_values : array-like object - Array-like object containing AIC values from different models - - Returns - ------- - : tuple - First element contains the relative AIC weights, second element - contains the delta AIC values. - - Notes - ----- - AIC weights can be interpreted as the probability that a given model is the - best model in comparison to the other models - - ''' - aic_values = cnvrt_to_arrays(aic_values)[0] - aic_values = np.array(aic_values) - minimum = np.min(aic_values) - delta = np.array([x - minimum for x in aic_values]) - values = np.exp(-delta / 2) - weights = np.array([x / sum(values) for x in values]) - return weights, delta - -def ks_two_sample(data1, data2): - '''Function uses the Kolomogrov-Smirnov two-sample test to determine if the - two samples come from the same distribution. Note that the KS-test is only - valid for continuous distributions - - Parameters - ---------- - data1 : array-like object - Array-like object which contains a set of data to compare - data2 : array-like object - Array-like object which contains a set of data to compare - - Returns - ------- - : tuple - (D-statistic, two-sided p-value) - - ''' - data1, data2 = cnvrt_to_arrays(data1, data2) - data1 = np.array(data1) - data2 = np.array(data2) - return stats.ks_2samp(data1, data2) - -def likelihood_ratio(nll_null, nll_alt, df_list): - ''' - This functions compares of two nested models using the likelihood ratio - test. - - Parameters - ---------- - nll_null : array-like object - The negative log-likelihood of the null model - nll_alt : array-like object - The negative log-likelihood of the alternative model - df_list : array-like object - the degrees of freedom calculated as (number of free parameters in - alternative model) - (number of free parameters in null model) - - Returns - ------- - : list of tuples - (test_statistic, p-value) - - Notes - ----- - The LRT only applies to nested models. The variable test_stat is known as - the G^2 statistic. 
''' - - nll_null, nll_alt, df_list = cnvrt_to_arrays(nll_null, nll_alt, df_list) - assert len(nll_null) == len(nll_alt) and len(nll_null) == len(df_list) and\ - len(nll_alt) == len(df_list), "nll_null, nll_alt, and df_list " + \ - "must have the same length" - # Calculate G^2 statistic - ll_null = nll_null * -1; ll_alt = nll_alt * -1 - test_stat = 2 * (ll_null - ll_alt) - return [(ts, stats.chisqprob(ts, df)) for ts, df in zip(test_stat, df_list)] - -def variance(data_sets): - '''Calculates the variance of the given data_sets - - Parameters - ---------- - data_sets : list - A list of np.arrays on which the variance will be calculated - - Returns - ------- - : list - A list of variance values with the same length as data_sets - - ''' - - variance_list = [] - for data in data_sets: - variance_list.append(np.var(data, ddof=1)) - - return variance_list - -def skew(data_sets): - '''Calculates the skew of the given data_sets - - Parameters - ---------- - data_sets : list - A list of np.arrays on which the skew will be calculated - - Returns - ------- - : list - A list of skew values with the same length as data_sets - - ''' - - skewness_list = [] - for data in data_sets: - skewness_list.append(stats.skew(data)) - - return skewness_list - -def kurtosis(data_sets): - '''Calculates the kurtosis for the given list of - datasets - - Parameters - ---------- - data_sets : list - A list of np.arrays on which the kurtosis will be calculated - - Returns - ------- - : list - A list of kurtosis values with the same length as data_sets - - ''' - kurtosis_list = [] - for data in data_sets: - kurtosis_list.append(stats.kurtosis(data)) - - return kurtosis_list - -def bootstrap(data_sets, num_samp=1000): - '''Bootstrap each data set within data_sets num_samp times, with replacement - - Parameters - ---------- - data_sets : list - A list of np.arrays to be bootstrapped - num_samp : int - Number of bootstrap samples to take - - Returns - ------- - : a list - A list of lists of arrays. Each list contains num_samp bootstrapped - arrays - ''' - - random.seed(time.time()) - - bootstraps = [] - for data in data_sets: - bt_data = [] - n = len(data) - for j in xrange(num_samp): - bt_data.append(np.array([random.choice(data) for j in xrange(n)])) - bootstraps.append(bt_data) - - return bootstraps - -def bootstrap_moment(data1, data2, moment, CI=.95, num_samp=1000): - ''' - A bootstrap two-sample test of a moment. Returns the test_statistic - distribution and the confidence interval as specified by parameter CI. The - confidence interval is the difference of the moment from data1 minus the - moment from data2. - - Parameters - ---------- - data1 : array-like object - An array-like object containing data - data2 : array-like object - An array-like object containing data - moment : list - List of strings (mean, skew, kurtosis, and/or variance). - Will calculate the bootstrap CI's for all the moments in the list - CI : float - The desired confidence interval - num_samp : int - Number of bootstrap samples - - Returns - ------- - res : dict - A dictionary with keywords equivalent to the strings found in moment. - Each keyword looks up a tuple with two elements. The first element is - the observed difference between the moment of data1 and the moment of - data2. The second element is a tuple containing the confidence - interval (lower_bound, upper_bound) on the difference between the - specified moment of data1 and data2. 
- - Notes - ----- - From the returned confidence interval, one is CI confident that the - returned confidence interval contains the true difference between the - moment of data1 and data2. Therefore, if the confidence interval does not - contain 0 you can be CI confident that the moments are different. - - Bootstrapping in typically only appropriate for sample sizes >= 25. - - - ''' - - data1 = np.array(data1) - data2 = np.array(data2) - - # Bootstrap the data - data1_boot = bootstrap([data1], num_samp=num_samp)[0] - data2_boot = bootstrap([data2], num_samp=num_samp)[0] - - def calc_ci(stat1, stat2): - """ Calculate CI """ - - diff = stat1 - stat2 - lci = (1 - CI) / 2. - uci = 1 - lci - ci = (stats.scoreatpercentile(diff, 100 * lci),\ - stats.scoreatpercentile(diff, 100 * uci)) - return ci - - - res = {} - # Set the higher order moment - if 'skew' in moment: - - stat_1 = np.array(skew(data1_boot)) - stat_2 = np.array(skew(data2_boot)) - - stat_dist = skew([data1])[0] - skew([data2])[0] - ci = calc_ci(stat_1, stat_2) - - res['skew'] = (stat_dist, ci) - - if 'variance' in moment: - stat_1 = np.array(variance(data1_boot)) - stat_2 = np.array(variance(data2_boot)) - - stat_dist = variance([data1])[0] - variance([data2])[0] - ci = calc_ci(stat_1, stat_2) - - res['variance'] = (stat_dist, ci) - - if 'kurtosis' in moment: - stat_1 = np.array(kurtosis(data1_boot)) - stat_2 = np.array(kurtosis(data2_boot)) - - stat_dist = kurtosis([data1])[0] - kurtosis([data2])[0] - ci = calc_ci(stat_1, stat_2) - - res['kurtosis'] = (stat_dist, ci) - - if "mean" in moment: - stat_1 = np.array([np.mean(bs) for bs in data1_boot]) - stat_2 = np.array([np.mean(bs) for bs in data2_boot]) - - stat_dist = np.mean(data1) - np.mean(data2) - ci = calc_ci(stat_1, stat_2) - - res['mean'] = (stat_dist, ci) - - return res - -def mean_squared_error(obs, pred): - ''' - Calculates the mean squared error between observed and predicted data sets. - The data sets must be of the same length - - Parameters - ---------- - obs : array-like object - The observed data - pred : array-like object - The predicted data - - Returns - ------- - : float - The mean squared error - ''' - - if len(obs) != len(pred): - raise ValueError('obs and pred parameters must have the same length') - - obs, pred = cnvrt_to_arrays(obs, pred) - - return sum((pred - obs)**2) / len(obs) - - -def cnvrt_to_arrays(*args): - ''' - Converts all args to np.arrays - ''' - arg_list = [] - for arg in args: - try: - len(arg); arg = np.array(arg) - except: - arg = np.array([arg]) - arg_list.append(arg) - return tuple(arg_list) - -def get_name(obj): - ''' - Return the name of the object - ''' - return obj.__class__.__name__ - -def make_dist_list(dist_list): - ''' - If the dist_list is all strings, eval them. 
Else return as is - ''' - - if np.all([type(dist) == str for dist in dist_list]): - - ret_dist_list = np.empty(len(dist_list), dtype=object) - - for i, dist_obj in enumerate(dist_list): - - # Clean strings - dist_obj = dist_obj.strip() - try: - ret_dist_list[i] = eval(dist_obj + '()') - except: - # Do this if passing in a gen_sar sad and ssad - # Assumes the sad and ssad are separated by '-' - try: - sad, ssad = tuple(dist_obj.split('-')) - if sad.find('(') != 1 and sad.find(')') != -1: - sad_obj = eval(sad.strip()) - else: - sad_obj = eval(sad.strip() + '()') - if ssad.find('(') != 1 and ssad.find(')') != -1: - ssad_obj = eval(ssad.strip()) - else: - ssad_obj = eval(ssad.strip() + '()') - ret_dist_list[i] = gen_sar(sad_obj, ssad_obj) - except: - raise NameError("Could not evaluate '%s' as an object name" - % dist_obj + '. It may not exist or may be improperly' + - ' formatted. Please check your distribution list in ' - + 'your parameters.xml file or in the dist_list' + - " argument '%s'" % str(dist_list)) - - ret_dist_list = list(ret_dist_list) - else: - ret_dist_list = dist_list - - return ret_dist_list - -def unpack(zipped_data): - ''' - Unpacks zipped data - - ''' - - unzipped_data = zip(*zipped_data) - unzipped_data = [list(tup) for tup in unzipped_data] - return tuple(unzipped_data) - diff --git a/data.py b/data.py deleted file mode 100644 index a94229e..0000000 --- a/data.py +++ /dev/null @@ -1,371 +0,0 @@ -#!/usr/bin/python - -''' -Routines for loading census data and metadata. - -Classes -------- -- `DataTable` -- data and metadata for a single censused area -- `Metadata` -- load and parse EML metadata for data file -''' - -from __future__ import division -import os -import logging -import numpy as np -import xml.etree.ElementTree as etree -from matplotlib.mlab import csv2rec -import sqlite3 as lite -import pandas as pd - - -class DataTable: - ''' - Class to hold data table and metadata. - - Parameters - ---------- - data_path : str - Path to data - location of metadata determined from this path. - subset : str - An SQL query string - - Attributes - ---------- - asklist : list - A list of tuples of column name and attribute, e.g., [('x', - 'precision'), ('y', 'maximum')], that defines the columns and - parameters that are needed for analysis. Defined in data_load method. - table : recarray - Census data table. - meta : dict - Dictionary of metadata needed for analysis. Needed variables for each - column are defined in asklist - ''' - - def __init__(self, data_path, subset={}): - '''Initialize DataTable object. See class docstring.''' - - self.table, self.meta = self.data_load(data_path, subset=subset) - - - def data_load(self, data_path, subset={}): - ''' - Load data and metadata from files. - - Parameters - ---------- - data_path : str - Path to data table file. - - Returns - ------- - table : recarray - Census data table. - meta : dict - Dictionary of metadata associated with table. - ''' - end = data_path.split('.')[-1] - # Check that file is csv. If so, read in as rec array - if end == 'csv': - table = csv2rec(data_path) - # Load main table - dtype detected automatically - # Use panda to load and convert to records - #table = pd.read_csv(data_path) - # Check if there is a column named index, if so rename and delete - # it. Why? 
Index is a special word - #if hasattr(table, 'index'): - # table['index_1'] = table['index'] - # del table['index'] - - #table = table.to_records() - - elif end == 'db' or end == 'sql': - - if type(subset) == type({}): - raise ValueError('No SQL query string provided') - - table = db_table(data_path, subset) - else: - raise TypeError('Cannot handle file of type %s' % end) - - # Store asklist defining columns and fields needed for analysis. - # asklist is - self.asklist = [] - for name in table.dtype.names: - self.asklist.append((name, 'minimum')) - self.asklist.append((name, 'maximum')) - self.asklist.append((name, 'precision')) - self.asklist.append((name, 'type')) - - # Load metadata from file - meta = Metadata(data_path, self.asklist).meta_dict - - return table, meta - - - def get_subtable(self, subset): - ''' - Return subtable matching all conditions in subset. - - Parameters - ---------- - subset : dict - Dictionary of conditions for subsetting data (see description in - Patch Class docstring). - - Returns - ------- - subtable : ndarray - Subtable with records from table meeting requirements in subset. - - ''' - - # If no subset, return original table - if subset == {}: - return self.table - - # Declare array to track valid rows of table - valid = np.ones(len(self.table), dtype=bool) - - # TODO: Add ability to do logical or - and is just multiple subsets on - # same column. - for key, value in subset.iteritems(): - if type(value) is not type(['a']): # Make all iterables - value = [value] - - # Merge tuples into a string - merged_values = [] - for val in value: - try: # check if val[1] is a string - eval(str(val[1])) - merged_values.append(val[0] + str(val[1])) - except: - merged_values.append(val[0] + "'" + val[1] + "'") - - for this_value in merged_values: - if this_value != "=='whole'": - this_valid = eval("self.table[key]" + this_value) - valid = np.logical_and(valid, this_valid) - - subtable = self.table[valid] - return subtable - - -class Metadata: - ''' - Metadata values for any analysis stored using Ecological Metadata Language. - - Parameters - ---------- - data_path : str - Path to csv data file. Metadata file must be in same dir, with same - filename, but with .xml extension. - - Attributes - ---------- - valid_file : bool - Whether valid metadata file was found. - root : object - Root of Element Tree representation of metadata xml file. - meta_dict : dict - Dictionary of metadata with values given by asklist. - - ''' - - def __init__(self, data_path, asklist): - '''Initialize Metadata object. See class docstring.''' - - # Get path to metadata file - data_path, data_extension = os.path.splitext(data_path) - xml_path = os.path.abspath(os.path.join(data_path + '.xml')) - - # Determine if metadata file is valid and if so store self.root - self.valid_file = True - - try: - open(xml_path) - except: - logging.info('Missing or invalid metadata file at %s' % xml_path) - self.valid_file = False - - try: - self.root = etree.ElementTree(file=xml_path).getroot() - except: - logging.info('Error parsing metadata file at %s' % xml_path) - self.root = None - self.valid_file = False - - # Check if metadata file is missing or invalid, if so return None - if self.valid_file == False: - self.meta_dict = None - else: - self.meta_dict = self.get_meta_dict(asklist) - - - def get_meta_dict(self, asklist): - ''' - Parse metadata dictionary from xml file. 
- - Parameters - ---------- - asklist : list - A list of tuples of column name and attribute, e.g., [('x', - 'precision'), ('y', 'maximum')], that defines the columns and - parameters that are needed for analysis. - - Returns - ------- - meta_dict : dict - Dictionary of metadata values for each item in asklist, in form - {('column_name', 'element'): value}. column_name in data table is - equivalent to attribute in xml. - ''' - - # TODO: Column attribute will be None if either column entry does not - # exist in metadata or if column entry exists but attribute is missing. - # We may want to distinguish these, perhaps just with logging. - - # Populate dictionary of metadata values for asklist items - meta_dict = {} - - for item in asklist: - # Get list of all elements for this attribute - all_elements = self.get_all_elements(item[0]) - - # Get value of element for this attribute if it exists - if all_elements is None: - value = None - else: - value = self.get_element_value(all_elements, item[1], item[0]) - - # Eval value if possible and log outcome - try: - value = eval(value) - value_type = str(type(value)).split("'")[1] - logging.debug('Metadata value %s, %s evaluated to %s' % - (item[0], item[1], value_type)) - except: - logging.debug('Metadata value %s, %s left as string' % - (item[0], item[1])) - - # Store value for this item - meta_dict[item] = value - - return meta_dict - - - def get_all_elements(self, attribute): - '''Returns list of XML elements of type attribute for attribute.''' - - attributes = self.root.findall('.//dataTable/attributeList/attribute') - for a in attributes: - if a.find('.//attributeName').text == attribute: - return a - - - def get_element_value(self, all_elements, element_name, col_name): - '''Returns value of attribute_name from all_attributes list.''' - if element_name == 'type': - if len(all_elements.findall('.//dateTime')) == 1: - return 'ordinal' - elif len(all_elements.findall('.//interval')) == 1: - return 'interval' - elif len(all_elements.findall('.//ordinal')) == 1: - return 'ordinal' - elif len(all_elements.findall('.//nominal')) == 1: - return 'nominal' - elif len(all_elements.findall('.//ratio')) == 1: - return 'ratio' - else: - logging.warning("Could not find recognizable column type. " +\ - "Setting type of column name '%s' to ordinal." %\ - col_name) - return 'ordinal' - else: - try: - value = all_elements.find('.//%s' % element_name).text - return value - except AttributeError: - return None - - - def get_physical_coverage(self): - '''Returns a tuple of physical limits of the dataset (NESW).''' - coords = self.root.find('.//coverage/geographicCoverage/' + - 'boundingCoordinates') - bounds = [] - for d in ('north','east','south','west'): - bounds.append(float(coords.find('%sBoundingCoordinate'%d).text)) - return bounds - - - def get_title(self): - '''Extracts the title of the dataset. 
Not currently used.''' - return self.root.find('.//dataset/title').text - -def db_table(data_path, query_str): - '''Query a database and return query result as a recarray - - Parameters - ---------- - data_path : str - The data_path of the .db file - query_str : str - The SQL query string - - Returns - ------- - table : recarray - The database query as a recarray - - ''' - - end = data_path.split('.')[-1] - - if end == 'sql': - - def readData(): - f = open(data_path, 'r') - - with f: - data = f.read() - return data - - con = lite.connect(':memory:') - con.row_factory = lite.Row - - cur = con.cursor() - sql = readData() - cur.executescript(sql) - - elif end == 'db': - - con = lite.connect(data_path) - - con.row_factory = lite.Row - cur = con.cursor() - - cur.execute(query_str) - db_info = cur.fetchall() - try: - col_names = db_info[0].keys() - except IndexError: - raise lite.OperationalError("Query '%s' to database '%s' is empty" % - (query_str, data_path)) - - # Convert objects to tuples - converted_info = [tuple(x) for x in db_info] - - # NOTE: Using default value for Unicode: Seems better than checking - # lengths. Should we keep the type as unicode? - dtypes = [type(x) if type(x) != unicode else 'S150' for x in db_info[0]] - - table = np.array(converted_info, dtype=zip(col_names, dtypes)) - con.commit() - con.close() - - # Return a recarray for consistency - return table.view(np.recarray) - - diff --git a/demo/ANBO.csv b/demo/ANBO.csv new file mode 100644 index 0000000..1fa6598 --- /dev/null +++ b/demo/ANBO.csv @@ -0,0 +1,122 @@ +year,cell,row,column,spp,count +2010,1,3.0,3.0,cabr,3.0 +2010,1,3.0,3.0,caspi1,20.0 +2010,1,3.0,3.0,crcr,3.0 +2010,1,3.0,3.0,crsp2,1.0 +2010,1,3.0,3.0,gnwe,11.0 +2010,1,3.0,3.0,grass,11.0 +2010,1,3.0,3.0,lesp1,1.0 +2010,1,3.0,3.0,phdi,5.0 +2010,1,3.0,3.0,pypo,6.0 +2010,1,3.0,3.0,ticr,50.0 +2010,2,3.0,2.0,caspi1,17.0 +2010,2,3.0,2.0,comp1,2.0 +2010,2,3.0,2.0,crsp2,7.0 +2010,2,3.0,2.0,gnwe,4.0 +2010,2,3.0,2.0,grass,26.0 +2010,2,3.0,2.0,phdi,7.0 +2010,2,3.0,2.0,pypo,8.0 +2010,2,3.0,2.0,ticr,12.0 +2010,2,3.0,2.0,unsp1,1.0 +2010,3,3.0,1.0,arsp1,1.0 +2010,3,3.0,1.0,caspi1,9.0 +2010,3,3.0,1.0,crsp2,8.0 +2010,3,3.0,1.0,grass,120.0 +2010,3,3.0,1.0,mobe,4.0 +2010,3,3.0,1.0,phdi,14.0 +2010,3,3.0,1.0,pypo,12.0 +2010,3,3.0,1.0,ticr,7.0 +2010,3,3.0,1.0,unsp1,1.0 +2010,4,3.0,0.0,crcr,23.0 +2010,4,3.0,0.0,crsp2,13.0 +2010,4,3.0,0.0,gnwe,1.0 +2010,4,3.0,0.0,grass,160.0 +2010,4,3.0,0.0,magl,1.0 +2010,4,3.0,0.0,phdi,14.0 +2010,4,3.0,0.0,pypo,6.0 +2010,4,3.0,0.0,ticr,3.0 +2010,5,2.0,3.0,cabr,9.0 +2010,5,2.0,3.0,caspi1,12.0 +2010,5,2.0,3.0,crsp2,1.0 +2010,5,2.0,3.0,gnwe,13.0 +2010,5,2.0,3.0,grass,180.0 +2010,5,2.0,3.0,pypo,5.0 +2010,5,2.0,3.0,ticr,120.0 +2010,6,2.0,2.0,crsp2,15.0 +2010,6,2.0,2.0,grass,115.0 +2010,6,2.0,2.0,phdi,15.0 +2010,6,2.0,2.0,pypo,10.0 +2010,6,2.0,2.0,unsp1,15.0 +2010,7,2.0,1.0,crsp2,9.0 +2010,7,2.0,1.0,grass,12.0 +2010,7,2.0,1.0,phdi,42.0 +2010,8,2.0,0.0,arsp1,1.0 +2010,8,2.0,0.0,crcr,12.0 +2010,8,2.0,0.0,crsp2,6.0 +2010,8,2.0,0.0,grass,110.0 +2010,8,2.0,0.0,phdi,27.0 +2010,8,2.0,0.0,pypo,1.0 +2010,9,1.0,3.0,cabr,7.0 +2010,9,1.0,3.0,enfa,1.0 +2010,9,1.0,3.0,phdi,39.0 +2010,9,1.0,3.0,pypo,7.0 +2010,10,1.0,2.0,cabr,4.0 +2010,10,1.0,2.0,comp1,1.0 +2010,10,1.0,2.0,crcr,3.0 +2010,10,1.0,2.0,crsp2,3.0 +2010,10,1.0,2.0,gnwe,4.0 +2010,10,1.0,2.0,grass,20.0 +2010,10,1.0,2.0,phdi,10.0 +2010,10,1.0,2.0,pypo,6.0 +2010,11,1.0,1.0,comp1,1.0 +2010,11,1.0,1.0,crcr,6.0 +2010,11,1.0,1.0,crsp2,6.0 +2010,11,1.0,1.0,gnwe,3.0 +2010,11,1.0,1.0,grass,86.0 +2010,11,1.0,1.0,mesp,1.0 
+2010,11,1.0,1.0,phdi,8.0 +2010,11,1.0,1.0,pypo,2.0 +2010,11,1.0,1.0,ticr,7.0 +2010,11,1.0,1.0,unsh1,1.0 +2010,11,1.0,1.0,unsp3,1.0 +2010,12,1.0,0.0,cabr,5.0 +2010,12,1.0,0.0,cran,1.0 +2010,12,1.0,0.0,crcr,10.0 +2010,12,1.0,0.0,crsp2,7.0 +2010,12,1.0,0.0,gnwe,5.0 +2010,12,1.0,0.0,grass,88.0 +2010,12,1.0,0.0,phdi,14.0 +2010,12,1.0,0.0,pypo,1.0 +2010,12,1.0,0.0,ticr,70.0 +2010,13,0.0,3.0,cabr,1.0 +2010,13,0.0,3.0,cran,1.0 +2010,13,0.0,3.0,crcr,2.0 +2010,13,0.0,3.0,grass,60.0 +2010,13,0.0,3.0,phdi,4.0 +2010,13,0.0,3.0,pypo,4.0 +2010,13,0.0,3.0,ticr,80.0 +2010,13,0.0,3.0,unsp1,1.0 +2010,14,0.0,2.0,comp1,1.0 +2010,14,0.0,2.0,crcr,1.0 +2010,14,0.0,2.0,grass,60.0 +2010,14,0.0,2.0,mesp,2.0 +2010,14,0.0,2.0,ticr,140.0 +2010,15,0.0,1.0,cran,1.0 +2010,15,0.0,1.0,crcr,2.0 +2010,15,0.0,1.0,crsp2,3.0 +2010,15,0.0,1.0,grass,20.0 +2010,15,0.0,1.0,mesp,3.0 +2010,15,0.0,1.0,phdi,3.0 +2010,15,0.0,1.0,pypo,2.0 +2010,15,0.0,1.0,sasp,2.0 +2010,15,0.0,1.0,ticr,100.0 +2010,16,0.0,0.0,cabr,2.0 +2010,16,0.0,0.0,chst,1.0 +2010,16,0.0,0.0,cran,1.0 +2010,16,0.0,0.0,crcr,3.0 +2010,16,0.0,0.0,grass,42.0 +2010,16,0.0,0.0,phdi,8.0 +2010,16,0.0,0.0,plsp1,1.0 +2010,16,0.0,0.0,pypo,3.0 +2010,16,0.0,0.0,ticr,140.0 +2010,16,0.0,0.0,unsp4,1.0 diff --git a/demo/ANBO.txt b/demo/ANBO.txt new file mode 100644 index 0000000..ef1f5f3 --- /dev/null +++ b/demo/ANBO.txt @@ -0,0 +1,32 @@ +[Description] +name = Anzo Borrego +author = Mary Ellen Harte and John Harte +description = Vegetation census conducted at Anza-Borrego Desert State Park. Site in Indian Valley at N 32' 52.091", W 116' 14.447". Elevation 1195 feet. Census was conducted on a 4 m x 4 m grid, with 16 grid cells each 1 m2 in area. +citation = Unpublished + +datapath = ANBO.csv +cols = spp_col:spp; count_col: count; x_col: row; y_col: column + +[year] +description = Year of census + +[cell] +description = Unique cell identifier, from 0 to 15 (total of 16 cells) + +[row] +description = Row of cell in gridded plot +min = 0 +max = 3 +step = 1 + +[column] +description = Column of cell in gridded plot +min = 0 +max = 3 +step = 1 + +[spp] +description = Name of species + +[count] +description = Number of individuals of a species in a cell diff --git a/demo/parameters.txt b/demo/parameters.txt new file mode 100644 index 0000000..8ae3185 --- /dev/null +++ b/demo/parameters.txt @@ -0,0 +1,21 @@ +[Plognorm pmf] +analysis = plnorm.pmf + +x = np.arange(10) +mu = 2 +sigma = 1. + +[SAD ANBO Row > 2] +analysis = sad +metadata = ANBO.txt + +subset = row>=2 +log_y = True + +[Comm ANBO] +analysis = comm_grid +metadata = ANBO.txt + +cols = spp_col:spp; count_col:count; x_col:row; y_col:column +divs = 4,4; +models = power_law diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..e4230b9 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,170 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build +SUBDIR = generated + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext gh-pages + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* $(SUBDIR)/ + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/macroeco.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/macroeco.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/macroeco" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/macroeco" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." 
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +gh-pages: + +gh-pages: + git checkout gh-pages + rm -rf * ../macroeco ../*.* ../_* + cd ../; git checkout develop doc macroeco + make html + cp -r _build/html/* ../ + rm -rf _build generated ../macroeco + touch ../.nojekyll + git add --all :/ + git commit -m "Generated gh-pages for `git log develop -1 --pretty=short --abbrev-commit`" && git push origin gh-pages + rm -rf * ../*.* ../_* + git checkout develop + git checkout -- . \ No newline at end of file diff --git a/doc/_templates/autosummary/class.rst b/doc/_templates/autosummary/class.rst new file mode 100755 index 0000000..1d61b1b --- /dev/null +++ b/doc/_templates/autosummary/class.rst @@ -0,0 +1,27 @@ +{% extends "!autosummary/class.rst" %} + +{% block methods %} +{% if methods %} + .. This comment allows autosummary to build but does not display it + .. autosummary:: + :toctree: + {% for item in all_methods %} + {%- if not item.startswith('_') or item in ['__call__'] %} + {{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} +{% endif %} +{% endblock %} + +{% block attributes %} +{% if attributes %} + .. This comment allows autosummary to build but does not display it + .. autosummary:: + :toctree: + {% for item in all_attributes %} + {%- if not item.startswith('_') %} + {{ name }}.{{ item }} + {%- endif -%} + {%- endfor %} +{% endif %} +{% endblock %} diff --git a/doc/_templates/layout.html b/doc/_templates/layout.html new file mode 100644 index 0000000..786fba2 --- /dev/null +++ b/doc/_templates/layout.html @@ -0,0 +1,24 @@ +{% extends "!layout.html" %} + +{# Use root link even on first page #} +{%- block rootrellink %} +
  • {{ shorttitle|e }}
  • +{%- endblock %} + +{# Use our name in header #} +{%- block header %} +{% if theme_scipy_org_logo %} +
    +
    + + SciPy +
    +
    + +{% else %} +
    +
    +
    +
    +{% endif %} +{% endblock %} diff --git a/doc/about.rst b/doc/about.rst new file mode 100644 index 0000000..a78e349 --- /dev/null +++ b/doc/about.rst @@ -0,0 +1,9 @@ +============== +About Macroeco +============== + +The current version of Macroeco was developed at the University of California, Berkeley by Justin Kitzes and Mark Wilber and is maintained by Justin Kitzes. Other contributors to current and previous versions include Chloe Lewis and Ethan White. + +Comments, bugs, and feature requests can be submitted to the developers by creating a `new issue `_ in the Macroeco GitHub repo. If you are submitting a bug, please include as much information as possible so that we can reproduce it. + +The development of macroeco has been supported by the National Science Foundation, the Gordon and Betty Moore Foundation, and the Berkeley Institute for Global Change Biology. \ No newline at end of file diff --git a/doc/compare.rst b/doc/compare.rst new file mode 100644 index 0000000..dcd8cf8 --- /dev/null +++ b/doc/compare.rst @@ -0,0 +1 @@ +.. automodule:: macroeco.compare diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..1f95c5f --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,290 @@ +# -*- coding: utf-8 -*- +# +# macroeco documentation build configuration file, created by +# sphinx-quickstart on Sun Feb 16 21:19:54 2014. +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. +# +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import sys +import os + +sys.path.insert(0, os.path.abspath('..')) +from macroeco import __version__ + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax', + 'sphinx.ext.autosummary', 'numpydoc', 'sphinx.ext.intersphinx'] + +autosummary_generate = True +#autodoc_default_flags = ['inherited-members'] + +intersphinx_mapping = {'scipy': ('http://docs.scipy.org/doc/scipy/reference/', + None)} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'macroeco' +copyright = u'Justin Kitzes and Mark Wilber' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.3' +# The full version, including alpha/beta/rc tags. +release = '0.3' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. 
+#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build', '_ext', '_templates'] + +# The reST default role (used for this markup: `text`) to use for all documents. +default_role = 'py:obj' + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# Mock objects that RTC can't build/import +class Mock(object): + + __all__ = [] + + def __init__(self, *args, **kwargs): + pass + + def __call__(self, *args, **kwargs): + return Mock() + + @classmethod + def __getattr__(cls, name): + if name in ('__file__', '__path__'): + return '/dev/null' + elif name[0] == name[0].upper(): + mockType = type(name, (), {}) + mockType.__module__ = __name__ + return mockType + else: + return Mock() + +MOCK_MODULES = ['shapely', 'shapely.geometry'] +for mod_name in MOCK_MODULES: + sys.modules[mod_name] = Mock() + + +# -- Options for HTML output --------------------------------------------------- + +# Use local RTD theme if building locally +on_rtd = os.environ.get('READTHEDOCS', None) == 'True' +if not on_rtd: # only import and set the theme if we're building docs locally + import sphinx_rtd_theme + html_theme = 'sphinx_rtd_theme' + html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +#html_theme = 'scipy' + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = ['_theme'] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = { +# "edit_link": False, +# "rootlinks": [], +# "sidebar": "right", +# "scipy_org_logo": True, +# "navigation_links": True, +#} +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. 
+#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +html_sidebars = {'**': ['globaltoc.html', 'searchbox.html']} +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = 'macroecodoc' + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'macroeco.tex', u'Macroeco', + u'Justin Kitzes and Mark Wilber', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + ('index', 'macroeco', u'Macroeco', + [u'Justin Kitzes and Mark Wilber'], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ('index', 'macroeco', u'Macroeco', + u'Justin Kitzes and Mark Wilber', 'macroeco', + 'Ecological pattern analysis in Python', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. 
+#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' diff --git a/doc/empirical.rst b/doc/empirical.rst new file mode 100644 index 0000000..2c30f04 --- /dev/null +++ b/doc/empirical.rst @@ -0,0 +1 @@ +.. automodule:: macroeco.empirical diff --git a/doc/index.rst b/doc/index.rst new file mode 100644 index 0000000..8c5b6aa --- /dev/null +++ b/doc/index.rst @@ -0,0 +1 @@ +.. automodule:: macroeco diff --git a/doc/main.rst b/doc/main.rst new file mode 100644 index 0000000..8a345b8 --- /dev/null +++ b/doc/main.rst @@ -0,0 +1 @@ +.. automodule:: macroeco.main diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..9dd6c3f --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,190 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. 
+ goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\macroeco.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\macroeco.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +:end diff --git a/doc/misc.rst b/doc/misc.rst new file mode 100644 index 0000000..ee73cfa --- /dev/null +++ b/doc/misc.rst @@ -0,0 +1 @@ +.. automodule:: macroeco.misc diff --git a/doc/models.rst b/doc/models.rst new file mode 100644 index 0000000..a21b662 --- /dev/null +++ b/doc/models.rst @@ -0,0 +1 @@ +.. automodule:: macroeco.models diff --git a/doc/reference.rst b/doc/reference.rst new file mode 100644 index 0000000..8a25a0f --- /dev/null +++ b/doc/reference.rst @@ -0,0 +1,20 @@ +.. _reference: + +========= +Reference +========= + +The ``macroeco`` Python package is organized into five subpackages. + +The three core packages are empirical, models, and compare, which contain functions for analyzing empirical data, classes and objects for theoretical models, and functions for comparing the fits of models and data. + +The main subpackage contains the code for MacroecoDesktop, which allows a allows a user to specify all of the parameters for an analysis in a text file. The misc subpackage provides a set of miscellaneous functions. + +.. 
toctree:: + :maxdepth: 1 + + empirical + models + compare + misc + main diff --git a/doc/requirements.txt b/doc/requirements.txt new file mode 100644 index 0000000..8e4e869 --- /dev/null +++ b/doc/requirements.txt @@ -0,0 +1,9 @@ +scipy +numpy +matplotlib +pandas +mpmath +configparser +decorator +twiggy +numpydoc diff --git a/doc/tutorial_getting_started.rst b/doc/tutorial_getting_started.rst new file mode 100644 index 0000000..5e5bdeb --- /dev/null +++ b/doc/tutorial_getting_started.rst @@ -0,0 +1,120 @@ +=============== +Getting Started +=============== + +This tutorial provides an introduction to the basic use of Macroeco for ecological pattern analysis. + +The functionality of the software Macroeco can be accessed through two interfaces: the low-level Python package ``macroeco`` or the high-level MacroecoDesktop interface. + +The Python package ``macroeco`` is a scientific Python package that can be imported into a user's custom scripts and modules along with other scientific packages such as ``scipy`` or ``pandas``. + +The MacroecoDesktop interface is designed for users who wish to use the functionality of Macroeco but are not Python programmers. Instead of writing Python code, users of MacroecoDesktop create simple text files, known as parameters files, that describe an analysis and the type of desired output. MacroecoDesktop provides both a window-based graphical interface and a "headless" command line mode - the latter of these allows MacroecoDesktop to be called from other computing environments such as R. + + +.. _installation: + +Installation +============ + +For users with an existing scientific Python environment, the latest stable version of both ``macroeco`` and the MacroecoDesktop interface can be installed with ``pip install macroeco``. Several package dependencies may also be installed by this command. The latest development version of Macroeco can be found in the "develop" branch of the `Macroeco GitHub repo `_. + +Mac OS X users who wish only to use MacroecoDesktop can instead download the MacroecoDesktop application from `this link `_. After unzipping, drag the MacroecoDesktop application into the Applications folder. + +Windows and Linux users who wish to use MacroecoDesktop will need to set up a scientific Python environment. The developers recommend the free `Continuum Anaconda `_ scientific Python installation for new users. After downloading and installing Anaconda, run the command ``pip install macroeco`` from a Terminal window. + +The remainder of this tutorial uses demo data from a vegetation census in Anza-Borrego Desert State Park in southern California. This demo data can be downloaded at `this link `_. The file ANBO.csv contains the census data and the file ANBO.txt contains metadata describing the data table. This data may be freely shared and used for analysis so long as credit is given to the authors. + +.. _first-steps-macroeco: + +First steps: ``macroeco`` +========================= + +Users of MacroecoDesktop should skip this section and proceed below to :ref:`first-steps-macroeco-desktop`. 
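+
+Before going further, it can be helpful to confirm that the package imports correctly. A minimal check from a Python prompt is shown below as a sketch; the exact version string will depend on the installed release.
+
+    >>> import macroeco
+    >>> macroeco.__version__
+    '0.3'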
+ +The ``macroeco`` package contains three main subpackages of interest: + +* Empirical - loads data tables and performs empirical analysis of macroecological metrics, such as the species abundance distribution and species area relationship + +* Models - provides objects for distributions and curves predicted by macroecological theory, such as the logseries distributions and power law function + +* Compare - provides utility functions for comparing the fit of models to empirical metrics, such as AIC weights and r-squared statistics + +A common workflow involves loading data, calculating an empirical metric, fitting one or more models to the empirical metric, and evaluating the fit of the model to the metric. The following example calculates a simple species abundance distribution for the demo data. + +First, the ``Patch`` class from the empirical subpackage is used to create a Patch object that holds the data table and a metadata dictionary describing the data. ``Patch`` requires a path, absolute or relative, to a metadata file as a mandatory argument (see :ref:`own-data` for information on creating a metadata file for a new data set). + + >>> import macroeco as meco + >>> pat = meco.empirical.Patch('~/Desktop/demo/ANBO.txt') + +The empirical subpackage contains a number of functions that operate on patch objects and return macroecological metrics. Here we'll use the function ``sad`` to calculate a species abundance distribution. The first argument is the patch object to use, the second is a string specifying which column has the species names (spp_col) and which, if any, has a count of individuals at a particular location (count_col), and the third is a string specifying how to split the data (see the Reference guide for the functions in the empirical module for more information on input arguments). + + >>> sad = meco.empirical.sad(pat, 'spp_col:spp; count_col:count', '') + +All functions for macroecological metrics return their results as a list of tuples. Each tuple has two elements: a string describing how the data were split and a result table with a column ``y`` (for univariate distributions like the species abundance distribution) or columns ``y`` and ``x`` (for curves such as a species area relationship) giving the results of the analysis. Since the data were not split in this example, the list has only one tuple. + +Any number of distributions from the models subpackage can be fit to the resulting empirical metric. The code below fits the two parameters of the LINK upper truncated logseries distribution and uses the function ``AIC`` from the compare subpackage to calculate the AIC for this distribution and data. + + >>> p, b = meco.models.logser_uptrunc.fit_mle(sad[0][1]['y']) + >>> p, b + (0.9985394369365049, 2445.0) + >>> meco.compare.AIC(sad[0][1]['y'], meco.models.logser_uptrunc(p, b)) + 208.61902087378027 + +The two fitted parameters can be used to generate a rank abundance distribution of the same length as the empirical data. The empirical and predicted rank curves are plotted. + + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> plt.semilogy(meco.models.logser_uptrunc.rank(len(sad[0][1]),p,b)[::-1]) + >>> plt.semilogy(np.sort(sad[0][1]['y'])[::-1]) + >>> plt.show() + +For information on performing more complex analyses using ``macroeco``, see :ref:`using-macroeco`. + + +.. _first-steps-macroeco-desktop: + +First steps: MacroecoDesktop +============================= + +This section describes the MacroecoDesktop interface. 
Mac OS X users who have downloaded the MacroecoDesktop standalone application will have a choice between accessing MacroecoDesktop through a graphical program or at the command line. Other users will only have access to MacroecoDesktop through the command line interface. Both methods of interaction are described below. + +The purpose of MacroecoDesktop is to provide non-programmers an interface for accessing the functionality of Macroeco without the need to write Python code. Instead, the user creates a text file, called a parameters file, that contains the information and instructions needed by MacroecoDesktop to execute an analysis. + +This section gives a very brief overview of how to create a simple parameter file and use it to analyze a species abundance distribution (the analysis and output are identical to that shown above in :ref:`first-steps-macroeco`). More information on the structure of parameter files and how to customize them can be found in the tutorial XXXX. + +To create a simple parameter file, open a text editor of your choice. Windows users can use Notepad, which can be accessed through the Start Menu. Mac users can use the program TextEdit, which is located in Utilities folder inside of the Applications folder. + +IMPORTANT: Mac users who use TextEdit should open the program and immediately go to the Format menu and select the option Make Plain Text. This will need to be done every time TextEdit is used to create a new document. Alternatively, you might wish to download a better text editor such as the free program `TextWrangler `_. + +To get started, type the following text into your text editor. Save this file with the name "new_parameters.txt" in the demo directory containing the ANBO.txt and ANBO.csv files. :: + + [SAD-ANBO] + + analysis = sad + + metadata = ANBO.txt + + models = logser_uptrunc; lognorm + log_y = True + + +A single parameter file can contain multiple "runs", each of which is denoted by the name of the run written in brackets (this run is titled "SAD ANBO", as it will analyze the species abundance distribution for the Anza-Borrego demo data). + +Conceptually, the information required for a single run can be broken down into three parts. The first part tells MacroecoDesktop the type of analysis that's desired, in this case a species abundance distribution (any function contained in the empirical or models subpackage of ``macroeco`` can be listed here as an analysis). + +The second part contains the information that MacroecoDesktop needs to complete the core analysis. To generate an empirical species abundance distribution, the necessary inputs are the location of a metadata file that both points to a data table and provides information about the data and a variable called "cols" that tells MacroecoDesktop which column in the data table represents the name of the species and which (if any) gives the count of individuals at a location. + +The third part describes what, if any, theoretical models should be compared to the core empirical result and what options should be used for the comparison. The models variable gives a list of distribution names to compare to the empirical data. An additional variable log_y specifies that the y-axis of output graphs should be log transformed. + +Once the parameter file has been created and saved, MacroecoDesktop can be called either from the graphical MacroecoDesktop program or from the Terminal. + +For Mac users who have downloaded the standalone MacroecoDesktop application, double click to launch the program. 
Use the Open button near the top to find and open the new_parameters.txt file that you just created. The parameters file will appear, and it can be edited and saved here again if desired. Once the parameter file is opened, click the Run button near the bottom. When the line "Finished analysis successfully" appears in the bottom window, the analysis is complete and the results are available. The results will be found in a folder named "results" in the same location as the new_parameters.txt file. + +For users who wish to access MacroecoDesktop from the terminal and who have installed ``macroeco`` in their Python environment, simply run the command ``mecodesktop path/to/new_parameters.txt``. Output about the analysis progress will be printed in the Terminal window, and the results will eventually be saved in a folder named "results" in the same location as the new_parameters.txt file. + +Mac users who have downloaded the standalone MacroecoDesktop application can also access MacroecoDesktop from the command line if desired. Presuming that the MacroecoDesktop program has been placed in the Applications folder, the command to use is ``/Applications/MacroecoDesktop.app/Contents/MacOS/mecodesktop path/to/new_parameters.txt`` + +For information on performing more complex analyses using MacroecoDesktop, see :ref:`using-macroecodesktop`. + + + diff --git a/doc/tutorial_macroeco.rst b/doc/tutorial_macroeco.rst new file mode 100644 index 0000000..1bb507e --- /dev/null +++ b/doc/tutorial_macroeco.rst @@ -0,0 +1,9 @@ +.. _using-macroeco: + +============== +Using macroeco +============== + +This tutorial describes the basic usage of the ``macroeco`` Python package. Users who wish to use the high-level MacroecoDesktop interface should refer to the :ref:`using-macroecodesktop` tutorial. + +Coming soon. \ No newline at end of file diff --git a/doc/tutorial_macroeco_desktop.rst b/doc/tutorial_macroeco_desktop.rst new file mode 100644 index 0000000..2540916 --- /dev/null +++ b/doc/tutorial_macroeco_desktop.rst @@ -0,0 +1,184 @@ +.. _using-macroecodesktop: + +===================== +Using MacroecoDesktop +===================== + +This tutorial describes the basic usage of the the high-level MacroecoDesktop interface. Users who wish to use the ``macroeco`` Python package should refer to the :ref:`using-macroeco` tutorial. + +This tutorial builds on the :ref:`first-steps-macroeco-desktop` tutorial, which should be completed first. + +There are three basic types of analysis that can be completed using MacroecoDesktop: analysis of an empirical ecological pattern, fitting models to an empirical pattern, and exploration of a model of a macroecological pattern without empirical data. This tutorial describes these three types of analysis in turn. + +Analyzing Empirical Patterns +============================ + +The first step in analyzing an empirical data set is to prepare the table and metadata file for the empirical data set as described in the :ref:`own-data` tutorial. It is generally most convenient to place the data table (usually in csv format) and the metadata file in the same folder. + +The second step is to prepare a parameters file to describe the desired analysis. A parameter file has a section for each run that is part of an analysis. Each run is independent of the others, and multiple runs may be combined in a single parameter file for convenience (for example, several analyses may be run on a single data set, or a single metric may be calculated for many data sets). 
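+
+Schematically, a parameters file with two runs has the following shape. The run titles below are arbitrary placeholder labels, not a complete working analysis; each empirical run needs at least an ``analysis`` and a ``metadata`` entry. ::
+
+    [FirstRun]
+    analysis = sad
+    metadata = ANBO.txt
+
+    [SecondRun]
+    analysis = sar
+    metadata = ANBO.txt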
+
+An example of a run performing a species area analysis for the demo data set is shown below. ::
+
+    [SAR]
+    analysis = sar
+
+    metadata = ANBO.txt
+    subset = column >= 2
+    cols = spp_col:spp; count_col:count; x_col:row; y_col:column
+    splits = row:2
+    divs = 1,1; 1,2; 2,1; 2,2
+    ear = False
+
+Each run begins with a title that is enclosed in square brackets. The run title should not contain any spaces.
+
+The first section following the run title contains a single variable, ``analysis``, which gives the name of an empirical ecological pattern to analyze for this data set. The available empirical analyses in Macroeco |version| are
+
+.. currentmodule:: macroeco.empirical
+.. autosummary::
+   :toctree: generated/
+
+   sad
+   ssad
+   sar
+   comm_grid
+   o_ring
+
+The pages linked above for each analysis name describe the analysis and the input parameters required to complete it. Each of these input parameters can be specified here in the parameters file.
+
+For example, examining the ``sar`` metric above shows that this analysis takes five input parameters:
+
+* patch - a Patch object containing the empirical data
+* cols - a string associating column headers in the data table with their uses
+* splits - a string describing how and whether to split the data into multiple separate subsets before analysis
+* divs - the number of divisions along coordinate columns
+* ear - True or False, where True calculates an endemics area relationship and False calculates a species area relationship
+
+Each of these five input parameters is provided directly in the run shown above, with the exception of the ``patch`` parameter, which is described slightly differently. Although the descriptions below apply to the ``sar`` metric, many of the same input parameters are used by the other analysis metrics.
+
+In all empirical analyses in Macroeco, the first input parameter is a patch object. Instead of describing this object directly in MacroecoDesktop, the user first provides the ``metadata`` and, optionally, the ``subset`` input parameters.
+
+The first parameter, ``metadata``, gives the relative path to the metadata file from the parameter file (if the parameter file and metadata file are in the same folder, this is just the name of the metadata file).
+
+The second parameter, ``subset``, takes a subset of the empirical data for further analysis. Any logical statement beginning with a column name and ending with a value can be used here. For example, setting ``subset`` to ``year==2010; row < 2; spp=='cabr'`` would perform all subsequent calculations only for data in which the year column is 2010, the row column is less than 2, and the species column is equal to 'cabr'. Multiple conditions are separated by semicolons. In the example run above, the SAR will be calculated only for columns 2 and 3 of the data.
+
+The next input parameter for an SAR analysis is ``cols``, which is a string describing which columns in the data table correspond to the "special columns" used in the analysis. The five possible special columns are
+
+- spp_col - Unique species identifiers
+- count_col - Number of individuals at a location
+- x_col - x coordinate of location
+- y_col - y coordinate of location
+- energy_col - Energetic requirements of individual(s) at a location
+
+Analyses that do not have a spatial component (like a species abundance distribution without subsets or splits) require only spp_col and count_col (if a count column exists; if not, each record is taken to represent one individual).
+Spatial analyses, such as the species-area relationship, also require x_col and y_col. Energy metrics require energy_col.
+
+The ``cols`` parameter can also be set in the Description section of the metadata file, in which case it is not required here. If ``cols`` is set both in a run and in a metadata file, the value from the run takes precedence.
+
+The next input parameter is ``splits``, which provides a convenient way to divide a data set into separate analyses. The value ``year:split; row:2``, for example, would split the data set into unique years and also into two subplots along the row column, each of equal length. The value before the ``:`` is a column name, and the value after is either a number (if a numeric column is to be split into equal-sized divisions) or the word "split" (if a column is to be split among all unique values).
+
+This parameter is particularly useful if a column for plot ID, family name, functional group, etc. is present in the data table, in which case splitting on that column would perform an identical analysis for each different plot, family, group, etc. ``splits`` can also be used, for example, to split a plot into four subplots along two coordinate axes and perform a species area analysis within each one.
+
+The next input parameter is ``divs``, which gives the number of divisions to perform along the x and y columns. For example, ``3,2;`` will divide a plot into six subplots, with three "columns" formed by splitting the x axis into three parts and two "rows" formed by splitting the y axis into two parts. Here, ``1,1; 1,2; 2,1; 2,2`` will analyze the species area relationship for the entire plot, half plots (split in both directions), and quarter plots.
+
+The final input parameter, ``ear``, determines whether a species area or an endemics area relationship should be calculated. This is a boolean value that can be either True (endemics area relationship) or False (species area relationship).
+
+Once the parameters file has been created and saved, it can be executed using MacroecoDesktop by following the instructions at the end of the :ref:`first-steps-macroeco-desktop` tutorial.
+
+A sample parameter file containing runs that complete many of the above empirical analyses can be found in :ref:`recipes`.
+
+
+Fitting and Comparing Models of Empirical Patterns
+==================================================
+
+MacroecoDesktop can also be used to fit models to empirical data patterns, to analyze the goodness of fit of these models, and to compare the fits of multiple models. This process is identical to that described above for analyzing empirical patterns, except that one additional set of input parameters is added to a run in the parameters file. ::
+
+    [SAR]
+    analysis = sar
+
+    metadata = ANBO.txt
+    subset = column >= 2
+    cols = spp_col:spp; count_col:count; x_col:row; y_col:column
+    splits = row:2
+    divs = 1,1; 1,2; 2,1; 2,2
+    ear = False
+
+    models = power_law
+    log_x = true
+    log_y = true
+
+The third portion of this run begins with the input parameter ``models``, which can be set equal to one or several of the models within the ``macroeco`` package. If the metric is a curve, such as the species area relationship, the following models may be used.
+
+.. currentmodule:: macroeco.models
+.. autosummary::
+   :toctree: generated/
+
+   power_law
+   mete_sar
+   mete_sar_iterative
+   mete_ear
+
+If the metric is a probability distribution, the following models may be used (note that some are discrete and some continuous).
+
+.. autosummary::
+   :toctree: generated/
+
+   expon
+   expon_uptrunc
+   lognorm
+   geom
+   geom_uptrunc
+   nbinom
+   cnbinom
+   logser_uptrunc
+   plnorm
+   plnorm_ztrunc
+
+More information about these models can be found by clicking on their names above. Some of these models have additional optional parameters that can be provided here (see the Methods section of the page for each individual model).
+
+Two special input parameters, ``log_x`` and ``log_y``, are used to log-transform the x and y axes of the output plots created by MacroecoDesktop.
+
+As another example, the run below will calculate a species abundance distribution for the demo data set, fit both a lognormal and an upper-truncated logseries distribution to the empirical data, and compare their fits. ::
+
+    [SAD]
+    analysis = sad
+
+    metadata = ANBO.txt
+
+    models = logser_uptrunc; lognorm
+    log_y = True
+
+Note that no subsets or splits are given here, so the entire data table is used for the analysis. The ``cols`` parameter is also not given, and the value of this parameter from the metadata file is used as a result.
+
+Exploring Models
+================
+
+Finally, MacroecoDesktop may also be used to explore the behavior of models without specific reference to empirical data. Given a set of model parameters, the "y" values of curves may be calculated for any "x" values, and the probability density, cumulative density, random variates, and many other values may be calculated for probability distributions.
+
+To see the possible options for exploring models, choose a model from the lists above and refer to the Methods section of that page. Any model and any method may be used with MacroecoDesktop so long as all of the input parameters required by that method are provided. Note that although the ``loc`` and ``scale`` parameters are listed for distributions, these are not used by Macroeco and should not be entered in a parameters file.
+
+For example, the parameter file below contains runs that calculate the pmf of a geometric distribution with a known shape parameter ``p``, calculate the ``p`` parameter of an upper-truncated geometric distribution from the distribution mean and upper limit ``b``, fit the parameters of a lognormal distribution to a small data set, and draw 10 random variates from a conditioned negative binomial distribution. ::
+
+    [Geom-pmf]
+    analysis = geom.pmf
+
+    x = 0,1,2,3,4,5
+    p = 0.5
+
+    [GeomUptrunc-p]
+    analysis = geom_uptrunc.translate_args
+
+    mu = 5
+    b = 20
+
+    [Lognorm-fit]
+    analysis = lognorm.fit_mle
+
+    data = 2,2,5,8,4,3
+
+    [Cnbinom-random]
+    analysis = cnbinom.rvs
+
+    mu = 10
+    k_agg = 2
+    b = 15
+    size = 10
+
diff --git a/doc/tutorial_own_data.rst b/doc/tutorial_own_data.rst
new file mode 100644
index 0000000..ba558dc
--- /dev/null
+++ b/doc/tutorial_own_data.rst
@@ -0,0 +1,88 @@
+.. _own-data:
+
+==============
+Preparing Data
+==============
+
+Both data tables and metadata must be provided to MacroecoDesktop and the package ``macroeco`` for empirical analyses. Data should be placed in a csv file following the basic structure described below. Metadata must also be prepared to describe features of the data table that cannot be inferred from the table itself (for example, the minimum and maximum values of the extent of a census, as these may be smaller and larger, respectively, than the minimum and maximum coordinates of recorded individuals).
+
+.. note::
+    To avoid the possibility of errors, the names of the data table and metadata files should not contain any spaces.
Additionally, the column headers within the data table must not contain any spaces. + +Preparing Data Tables +--------------------- + +Data should be prepared as a csv (comma separated values) file. The first row should contain column names, and each subsequent row should refer to a single record, most commonly a combination of a species identifier and a coordinate location. For point census data, each record will identify a single individual, while gridded census data will generally have an additional "count" column that gives the number of individuals of a species found in a grid cell. + +Other columns, such as those identifying genera, plot ID, etc., may also be included. The ``splits`` argument used by the empirical data analysis functions can be easily used to divide the analysis according to the values found in any provided column. For example, splitting a species area analysis on a column containing a plot ID will perform a separate species area analysis for each unique plot. + +The demo data file ANBO.csv provides an example of a correctly formatted data table file. + +Preparing a Metadata File +------------------------- + +Macroeco requires a metadata file to be provided along with each data table file (both MacroecoDesktop and ``macroeco`` require the user to provide the location of a metadata file, not the data table itself). The metadata file contains basic descriptive information about the data table as well as parameter values that are necessary for empirical data analysis. + +The format of the metadata file is very similar to the parameters files used to describe analyses for MacroecoDesktop. A metadata file has an initial section called Description, followed by a section containing information for each column in the data table. + +The metadata file ANBO.txt is shown here. :: + + [Description] + name = Anza Borrego + author = Mary Ellen Harte and John Harte + description = Vegetation census conducted at Anza-Borrego Desert State Park. Site in Indian Valley at N 32° 52.091', W 116° 14.447'. Elevation 1195 feet. Census was conducted on a 4 m x 4 m grid, with 16 grid cells each 1 m2 in area. + citation = Unpublished + + datapath = ANBO.csv + cols = spp_col:spp; count_col:count; x_col:row; y_col:column + + [year] + description = Year of census + + [cell] + description = Unique cell identifier, from 0 to 15 (total of 16 cells) + + [row] + description = Row of cell in gridded plot + min = 0 + max = 3 + step = 1 + + [column] + description = Column of cell in gridded plot + min = 0 + max = 3 + step = 1 + + [spp] + description = Name of species + + [count] + description = Number of individuals of a species in a cell + +The initial section, Description, begins with a number of variables providing basic information on the data table. + +The ``datapath`` variable in this section gives the location of the data table file relative to the metadata file. If the data file and metadata file are in the same directory, as is usually the case, then datapath is simply the name of the data table file. + +The ``cols`` variable here provides an opportunity to identify the columns in the data table that indicate different values used in empirical data analysis.
The four special columns shown here, which are common to most data tables, are + +* spp_col - the species identifier +* count_col - optional column with the number of individuals of a species at a point (if count_col is not given, each row is taken to represent a single individual) +* x_col - the x coordinate of the record location +* y_col - the y coordinate of the record location + +The value of ``cols`` can also be specified separately in individual runs in MacroecoDesktop or when calling individual functions in ``macroeco``. The values given here in the metadata file are the defaults, which are used if ``cols`` is not specified in these other locations. + +The remaining sections each refer to a column in the data table. Each section begins with a short description of the data in that column. Additionally, numeric columns (any column that can be split or subset by a numeric value) must have a minimum and maximum value and a step size giving the precision of the census. These are most commonly used with coordinate columns where, for example, the min and max values give the extent of the census and the step gives the minimum distance between two individuals. + +The demo metadata file ANBO.txt contains the metadata shown above. + +Using Data Files with Macroeco +------------------------------ + +Once the data and metadata files are prepared, they can be used with both MacroecoDesktop and ``macroeco``. + +In MacroecoDesktop, each run that involves empirical data analysis must contain the variable ``metadata``, which should indicate the path of the metadata file relative to the parameters file. If the parameters file and the metadata file are in the same folder, this is simply the name of the metadata file. + +In ``macroeco``, the absolute path to the metadata file (or the relative path from the present working directory) is a required argument to the Patch class of the empirical subpackage. Patch objects are required for all empirical pattern analysis using the functions in empirical. + diff --git a/doc/tutorial_recipes.rst b/doc/tutorial_recipes.rst new file mode 100644 index 0000000..634384d --- /dev/null +++ b/doc/tutorial_recipes.rst @@ -0,0 +1,108 @@ +.. _recipes: + +======================= +MacroecoDesktop Recipes +======================= + +To provide a "jump start" on setting up analyses for MacroecoDesktop, the sample parameter file below contains a variety of runs that perform different types of calculations on the demo dataset provided with Macroeco. This file, or individual runs from this file (consisting of a run title in square brackets and all subsequent lines until the next run title), can be copied and pasted into parameters files and modified as needed. + +The lines beginning with the ``#`` symbol are comments. They are purely for information and are ignored by MacroecoDesktop. In some cases below, lines containing variables are prefaced by the ``#`` symbol, indicating that they are "commented out" and will not affect the analysis. Removing the ``#`` at the start of these lines will have the effect described in the associated comment for that line. :: + + # The runs below provide examples of empirical data analysis, some with + # model comparisons.
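+
+    # An illustrative extra run: endemics area relationship for the full
+    # plot, following the same pattern as the species area run further below
+    [EAR ANBO]
+    analysis = sar
+
+    metadata = ANBO.txt
+    divs = 1,1;1,2;2,1;2,2;2,4;4,4
+    ear = True
+
+    models = mete_ear
+    log_y = True
+    log_x = True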
+ + # A simple species abundance distribution for the full plot + [SAD] + analysis = sad + + metadata = ANBO.txt + + models = logser_uptrunc; lognorm + log_y = True # Log transform the y axis of output plots + + # Four separate SAD's for the four quadrants of the plot + # cols is only required if it is not set in the metadata file + [SAD4] + analysis = sad + + metadata = ANBO.txt + #cols = spp_col:spp; count_col:count; x_col:row; y_col:column + splits = row:2; column:2 + clean = True # Remove species with 0 individuals from SADs + + models = logser_uptrunc; lognorm + log_y = True # Log transform the y axis of output plots + + # Empirical spatial abundance distribution for all 16 cells + [SSAD] + analysis = ssad + + metadata = ANBO.txt + splits = row: 4; column: 4 + + # Species area relationship + [SAR ANBO] + analysis = sar + + metadata = ANBO.txt + divs = 1,1;1,2;2,1;2,2;2,4;4,4 + + models = mete_iterative_sar + #ear = True # Endemics area relationship instead of species area + log_y = True + log_x = True + + # Gridded commonality, calculating Sorensen index for each pair of cells + [Commonality] + analysis = comm_grid + + metadata = ANBO.txt + #subset = row>=2;column>=2 # Use only cells in rows 2-3 and columns 2-3 + cols = spp_col:spp; count_col:count; x_col:row; y_col:column + #splits = row:2 # Perform analysis once for rows 0-1 and again for 2-3 + divs = 2,2; + #metric = Jaccard # Use Jaccard instead of Sorensen index + + models = power_law + + # O ring measure of distance decay + # This measure is best suited to point count census data + [Oring] + analysis = o_ring + + metadata = ANBO.txt + cols = spp_col:spp; count_col:count; x_col:row; y_col:column + spp = 'crcr' + bin_edges = 0, 1, 2, 3, 4 + + # The runs below provide examples of model exploration + + # pmf of geometric distribution + [Geom-pmf] + analysis = geom.pmf + + x = range(10) # x values from 0 to 9 + p = 0.5 + + # Shape parameter of upper truncated geometric distribution + [GeomUptrunc-p] + analysis = geom_uptrunc.translate_args + + mu = 5 + b = 20 + + # Fit parameters of lognormal to a small data set + [Lognorm-fit] + analysis = lognorm.fit_mle + + data = 2,2,5,8,4,3 + + + # Draw random variates from a conditioned negative binomial distribution + [Cnbinom-random] + analysis = cnbinom.rvs + + mu = 10 + k_agg = 2 + b = 15 + size = 10 diff --git a/doc/tutorial_with_r.rst b/doc/tutorial_with_r.rst new file mode 100644 index 0000000..e9e2b0d --- /dev/null +++ b/doc/tutorial_with_r.rst @@ -0,0 +1,43 @@ +=========================== +MacroecoDesktop for R users +=========================== + +Users who primarily work in R can access the functionality of Macroeco through the command line MacroecoDesktop interface. + +First, install a working copy of MacroecoDesktop by following the installation instructions in :ref:`installation`. Windows and Linux users will need to install a Python environment and the ``macroeco`` package, while Mac users can instead install the standalone MacroecoDesktop program. Follow the :ref:`first-steps-macroeco-desktop` tutorial to create the "new_parameters.txt" file and ensure that your copy of MacroecoDesktop is working properly. + +For all platforms and installation options, the basic idea will be to call MacroecoDesktop from an R script using the command line interface, wait for the analysis to complete, and then read in any output tables saved by MacroecoDesktop that will be used for further analysis. 
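+
+In its simplest form, and assuming that a parameters file has already been
+prepared, this amounts to a single ``system`` call followed by ``read.csv``
+calls for the saved output tables (the run name and paths below are
+illustrative)::
+
+    # Run MacroecoDesktop on an existing parameters file
+    system("mecodesktop ~/Desktop/demo/new_parameters.txt")
+
+    # Output tables for each run are saved in a results folder next to the
+    # parameters file and can be read back into R
+    sad_results <- read.csv("~/Desktop/demo/results/SAD/1_data_models.csv")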
+ +As an example, the script below completes the following steps: + +* Writes a "new_parameters.txt" file describing a desired MacroecoDesktop analysis +* Uses the ``system`` command within R to execute the MacroecoDesktop analysis specified in "new_parameters.txt" +* Reads in the resulting data tables +* Plots gridded distance decay data with a best fit power law curve +* Prints out the R2 value for the power law fit to the data:: + + param_dir <- "~/Desktop/demo/" + param_file <- "new_parameters.txt" + + cat(" + [DistanceDecay] + + analysis = comm_grid + + metadata = ANBO.txt + cols = spp_col: spp; count_col: count; y_col: row; x_col: column + divs = 4,4; + models = power_law + ",file=paste(param_dir,param_file,sep=""), sep="\n") + + system(paste("mecodesktop ", param_dir, param_file, sep="")) + + data_models <- read.csv(paste(param_dir, "results/DistanceDecay/1_data_models.csv", sep="")) + test_statistics <- read.csv(paste(param_dir, "results/DistanceDecay/1_test_statistics.csv", sep="")) + + plot(data_models$x, data_models$empirical) + lines(data_models$x, data_models$power_law) + + test_statistics$R2 + +Mac users who installed the standalone MacroecoDesktop program should replace ``"mecodesktop "`` above with ``"/Applications/MacroecoDesktop.app/Contents/MacOS/mecodesktop "``. diff --git a/doc/tutorials.rst b/doc/tutorials.rst new file mode 100644 index 0000000..981dd74 --- /dev/null +++ b/doc/tutorials.rst @@ -0,0 +1,13 @@ +========= +Tutorials +========= + +.. toctree:: + :maxdepth: 1 + + tutorial_getting_started + tutorial_macroeco + tutorial_macroeco_desktop + tutorial_own_data + tutorial_with_r + tutorial_recipes diff --git a/empirical.py b/empirical.py deleted file mode 100644 index 520e472..0000000 --- a/empirical.py +++ /dev/null @@ -1,811 +0,0 @@ -#!/usr/bin/python - -''' -Calculating macroecological metrics for empirical or theoretical patch. Patch -is interpreted broadly as any temporally and spatially defined census. - -Classes -------- -- `Patch` -- empirical metrics for census data - -Patch Methods -------------- -- `sad` -- calculate species abundance distribution (grid or sample) -- `sar` -- calculate species-area relationship (grid or sample) -- `universal_sar` -- calculates the universal sar curve -- `ear` -- calculate endemics-area relationship (grid or sample) -- `comm` -- calculate commonality between sub-patches (grid) -- `ssad` -- calculate species-level spatial abundance distrib (grid or sample) -- `sed` -- calculate species energy distribution (grid or sample) -- `ied` -- calculate the community (individual) energy distribution -- `ased` -- calculate the average species energy distribution - -- `get_sp_centers` -- -- 'get_div_areas' -- return list of areas made by div_list - -Misc functions --------------- -- `distance` -- return Euclidean distance between two points -''' - -from __future__ import division -import numpy as np -from math import radians, cos, sin, asin, sqrt -import itertools -from copy import deepcopy -from data import DataTable - - -class Patch: - ''' - An object representing an empirical census. - - Parameters - ---------- - data_path : str - Path to csv file containing census data. - subset : dict or str - Dictionary of permanent subset to data, {'column_name': 'condition'}, - which will limit all analysis to records in which column_name meets the - condition, ie, {'year': ('==', 2005), 'x': [('>', 20), ('<', 40)]} - restricts analysis to year 2005 and x values between 20 and 40. 
These - conditions can also be passed to the individual methods, but subsetting - the data table up front may save analysis time. Subsetting on a string - would look something like {'name' : [('==', 'John'), ('==', 'Harry')]}. - In addition, subset can be a query string for a SQL database. - - Attributes - ---------- - data_table : object of class DataTable - Object containing patch data and metadata. - - ''' - - def __init__(self, datapath, subset = {}): - '''Initialize object of class Patch. See class documentation.''' - - # Handle csv - self.data_table = DataTable(datapath, subset=subset) - - # If datapath is sql or db the subsetting is already done. - if type(subset) == type({}): - self.data_table.table = self.data_table.get_subtable(subset) - - - def sad(self, criteria, clean=False): - ''' - Calculates an empirical species abundance distribution given criteria. - - Parameters - ---------- - criteria : dict - Dictionary of form {column_name: value}. Must contain a key with a - value of 'species' indicating the column with species identifiers - (this column must be type categorical in metadata). If a column - giving the counts of species found at a point is also in the data, - a key with the value 'count' should also be given. - - Value has a different meaning depending on column type: - - metric - number of divisions of data along this axis, int/float - - categorical - 'split' calculates each category separately, - 'whole' takes the entire column. - clean : bool - If True, all the zeros are removed from the sads. If False, sads - are left as is. - - Returns - ------- - result : list - List of tuples containing results, where the first element is a - dictionary of criteria for this calculation and second element is a - 1D ndarray of length species containing the abundance for each - species. The third element is 1D array listing identifiers for - species in the same order as they appear in the second element of - result. - ''' - - spp_list, spp_col, count_col, engy_col, mass, combinations = \ - self.parse_criteria(criteria) - - if spp_col == None: - raise TypeError('No species column specified in "criteria" ' + - 'parameter') - result = [] - for comb in combinations: - - subtable = self.data_table.get_subtable(comb) - - sad_list = [] - for species in spp_list: - spp_subtable = subtable[subtable[spp_col] == species] - if count_col: - count = np.sum(spp_subtable[count_col]) - else: - count = len(spp_subtable) - sad_list.append(count) - - sad_list = np.array(sad_list) - - if clean: - ind = np.where(sad_list != 0)[0] - sad_list = sad_list[ind] - temp_spp_list = spp_list[ind] - else: - temp_spp_list = spp_list - - - result.append((comb, sad_list, temp_spp_list)) - - return result - - def ssad(self, criteria): - ''' - Calculates empirical species-level spatial abundance distributions - given criteria. - - Parameters - ---------- - criteria : dict - See Patch.sad docstring - - Returns - ------- - : tuple - Returns a tuple with two objects. The first object is an array of - dicts that correspond to the criteria used to generate each cell. - The length of the first object in equal to the number of divisions - specified. The second object is a dictionary that has length - species and each keyword is a species. Each species keyword looks - up an array with the ssad for the given species. The array that - each keyword looks up is the same length as criteria. 
- - - ''' - sad_return = self.sad(criteria, clean=False) - spp_list = sad_return[0][2] - combs, array_res = flatten_sad(sad_return) - ssad = {} - - for i, spp in enumerate(spp_list): - ssad[spp] = array_res[i,:] - - return combs, ssad - - def parse_criteria(self, criteria): - ''' - Parses criteria list to get all possible column combinations. - - Parameters - ---------- - criteria : dict - (See docstring for Patch.sad) - energy : bool - If False, does not return an energy column, if True, returns an - energy column. - - Returns - ------- - spp_list : ndarray - 1D array listing identifiers for species in the same order as they - appear in arrays found in result. - spp_col : str - Name of column containing species identifiers. - count_col : str - Name of column containing counts, if any. - combinations : list of dicts - List of dictionaries giving all possible combinations of criteria. - Columns not mentioned in criteria are ignored and will be averaged - over in later analyses. - - ''' - - spp_list = None - spp_col = None - count_col = None - engy_col = None - mass_col = None - combinations = [] - - # Calculate all possible combinations of columns based on criteria - # TODO: Add error checking - for key, value in criteria.items(): - - # Look for two special values indicating species and count cols - if value == 'species': - spp_list = np.unique(self.data_table.table[key]) - spp_col = key - continue - if value == 'count': - count_col = key - continue - if value == 'energy': - engy_col = key - continue - if value == 'mass': - mass_col = key - continue - - # Get levels of categorial or metric data - if value == 'split': # Categorial - levels = np.unique(self.data_table.table[key]) - levels_str = [('==' , x.astype(levels.dtype)) for x in levels] - elif value == 'whole': - # Random string to minimize chance of overlap? - levels_str = [('==','whole')] - else: # Metric - - # TODO: Throw a warning if the data is not divisible by the - # divisions specified. - try: - dmin = self.data_table.meta[(key, 'minimum')] - dmax = self.data_table.meta[(key, 'maximum')] - dprec = self.data_table.meta[(key, 'precision')] - - # TODO: Error if step < prec - step = (dmax + dprec - dmin) / value - starts = np.arange(dmin, dmax + dprec, step) - ends = starts + step - except TypeError: - raise TypeError('Unable to proceed to with values ' + - 'obtained from metadata. Please check ' + - 'the metadata file and/or parameters file') - - - starts_str = [('>=', x) for x in starts] - ends_str = [('<', x) for x in ends] - levels_str = [list(lvl) for lvl in zip(starts_str, ends_str)] - - - # Add these levels to combinations dictionary - if len(combinations) == 0: # If first criteria - for i, level in enumerate(levels_str): - combinations.append({key: level}) - else: - temp_comb = [] - for i, level in enumerate(levels_str): - exist_recs = deepcopy(combinations) - for rec in exist_recs: - rec[key] = level - temp_comb += exist_recs - combinations = temp_comb - - if len(combinations) == 0: - combinations.append({}) - - return spp_list, spp_col, count_col, engy_col, mass_col, combinations - - - - def sar(self, div_cols, div_list, criteria, form='sar', output_N=False): - ''' - Calculate an empirical species-area relationship given criteria. - - Parameters - ---------- - div_cols : tuple - Column names to divide, eg, ('x', 'y'). Must be metric. - div_list : list of tuples - List of division pairs in same order as div_cols, eg, [(2,2), - (2,4), (4,4)]. Values are number of divisions of div_col. 
- criteria : dict - See docstring for EPatch.sad. Here, criteria SHOULD NOT include - items referring to div_cols (if there are any, they are ignored). - form : string - 'sar' or 'ear' for species or endemics area relationship. EAR is - relative to the subtable selected after criteria is applied. - output_N : bool - Adds the column N to the output rec array which contains the - average N for a given area. - - Returns - ------- - rec_sar: structured array - Returns a structured array with fields 'items' and 'area' that - contains the average items/species for each given area specified by - critieria. - full_result : list of ndarrays - List of same length as areas containing arrays with element for - count of species or endemics in each subpatch at corresponding - area. - ''' - - # If any element in div_cols in criteria, remove from criteria - criteria = {k: v for k, v in criteria.items() if k not in div_cols} - - # Loop through div combinations (ie, areas), calc sad, and summarize - areas = [] - mean_result = [] - full_result = [] - N_result = [] - - for div in div_list: - - # Add divs to criteria dict - this_criteria = deepcopy(criteria) - for i, col in enumerate(div_cols): - this_criteria[col] = div[i] - - # Get flattened sad for all criteria and this div - sad_return = self.sad(this_criteria) - - if output_N: - N_result.append(np.mean([sum(sad[1]) for sad in sad_return])) - - flat_sad = flatten_sad(sad_return)[1] - - # Store results - if form == 'sar': - this_full = np.sum((flat_sad > 0), axis=0) - this_mean = np.mean(this_full) - elif form == 'ear': - totcnt = np.sum(flat_sad, axis=1) - totcnt_arr = \ - np.array([list(totcnt),]*np.shape(flat_sad)[1]).transpose() - - this_full = np.sum(np.equal(flat_sad, totcnt_arr), axis=0) - this_mean = np.mean(this_full) - else: - raise NotImplementedError('No SAR of form %s available' % form) - - full_result.append(this_full) - mean_result.append(this_mean) - - # Store area - area = 1 - for i, col in enumerate(div_cols): - dmin = self.data_table.meta[(col, 'minimum')] - dmax = self.data_table.meta[(col, 'maximum')] - dprec = self.data_table.meta[(col, 'precision')] - length = (dmax + dprec - dmin) - - area *= length / div[i] - - areas.append(area) - - # Return - if not output_N: - rec_sar = np.array(zip(mean_result, areas), dtype=[('items', - np.float), ('area', np.float)]) - else: - rec_sar = np.array(zip(mean_result, N_result, areas), - dtype=[('items', np.float), ('N', np.float), ('area', np.float)]) - - return rec_sar, full_result - - - def universal_sar(self, div_cols, div_list, criteria, include_full=False): - ''' - Calculates the empirical universal sar given criteria. The universal - sar calculates the slope of the SAR and the ratio of N / S at all - the areas in div_cols (where N is the total number of species and S is - the total number of species). - - This function assumes that the div_list contains halvings. If they are not, - the function will still work but the results will be meaningless. An - example a of div_list with halvings is: - - [(1,1), (1,2), (2,2), (2,4), (4,4)] - - Parameters - ---------- - div_cols : tuple - Column names to divide, eg, ('x', 'y'). Must be metric. - div_list : list of tuples - List of division pairs in same order as div_cols, eg, [(2,2), - (2,4), (4,4)]. Values are number of divisions of div_col. - criteria : dict - See docstring for EPatch.sad. Here, criteria SHOULD NOT include - items referring to div_cols (if there are any, they are ignored). 
- include_full : bool - If include_full = True, the division (1,1) will be included if it - was now already included. Else it will not be included. (1,1) is - equivalent to the full plot - - - Returns - ------- - z_array : a structured array - Has the columns names: - 'z' : slope of the SAR at the given area - 'S' : Number of species at the given division - 'N' : Number of individuals at the given division - 'N/S' : The ratio of N/S at the given division - - - Notes - ----- - If you give it n divisions in div_list you will get a structured array - back that has length n - 2. Therefore, if you only have one - ''' - - # If (1,1) is not included, include it - if include_full: - try: - div_list.index((1,1)) - except ValueError: - div_list.insert(0, (1,1)) - - # Run sar with the div_cols - sar = self.sar(div_cols, div_list, criteria, output_N=True)[0] - - # sort by area - sar = np.sort(sar, order=['area'])[::-1] - - # Calculate z's - if len(sar) >= 3: # Check the length of sar - z_list = [z(sar['items'][i - 1], sar['items'][i + 1]) for i in - np.arange(1, len(sar)) if sar['items'][i] != sar['items'][-1]] - else: - return np.empty(0, dtype=[('z', np.float), ('S', np.float), ('N', - np.float), ('N/S', np.float)]) - - N_over_S = sar['N'][1:len(sar) - 1] / sar['items'][1:len(sar) - 1] - - z_array = np.array(zip(z_list, sar['items'][1:len(sar) - 1], - sar['N'][1:len(sar) - 1], N_over_S), dtype=[('z', np.float), ('S', - np.float), ('N', np.float), ('N/S', np.float)]) - - return z_array - - def comm_sep(self, plot_locs, criteria, loc_unit=None): - ''' - Calculates commonality (Sorensen and Jaccard) between pairs of plots. - - Parameters - ---------- - plot_locs : dict - Dictionary with keys equal to each plot name, which must be - represented by a column in the data table, and values equal to a - tuple of the x and y coordinate of each plot - criteria : dict - See docstring for Patch.sad. - loc_unit : str - Unit of plot locations. Special cases include 'decdeg' (decimal - degrees), returns result in km. Otherwise ignored. - - Returns - ------- - result: structured array - Returns a structured array with fields plot-a and plot-b (names of - two plots), dist (distance between plots), and sorensen and jaccard - (similarity indices). Has row for each unique pair of plots. 
- ''' - - # Set up sad_dict with key=plot and val=clean sad for that plot - sad_dict = {} - - # Loop through all plot cols, updating criteria, and getting spp_list - for plot in plot_locs.keys(): - - # Find current count col and remove it from criteria - for crit_key in criteria.keys(): - if criteria[crit_key] == 'count': - criteria.pop(crit_key, None) - - # Add this plot as col with counts - criteria[plot] = 'count' - - # Get SAD for existing criteria with this plot as count col - sad_return = self.sad(criteria, clean=True) - - # Check that sad_return only has one element, or throw error - if len(sad_return) > 1: - raise NotImplementedError('Too many criteria for comm_sep') - - # Get unique species list for this plot and store in sad_dict - sad_dict[plot] = sad_return[0][2] - - # Set up recarray to hold Sorensen index for all pairs of plots - n_pairs = np.sum(np.arange(len(plot_locs.keys()))) - result = np.recarray((n_pairs,), dtype=[('plot-a','S32'), - ('plot-b', 'S32'), - ('spp-a', int), - ('spp-b', int), - ('dist', float), - ('sorensen', float), - ('jaccard', float)]) - - # Loop through all combinations of plots and fill in result table - row = 0 - for pair in itertools.combinations(plot_locs.keys(), 2): - - # Names of plots - plota = pair[0] - plotb = pair[1] - - result[row]['plot-a'] = plota - result[row]['plot-b'] = plotb - - # Calculate inter-plot distance - if loc_unit == 'decdeg': - result[row]['dist'] = decdeg_distance(plot_locs[plota], - plot_locs[plotb]) - else: - result[row]['dist'] = distance(plot_locs[plota], - plot_locs[plotb]) - - # Get similarity indices - spp_a = len(sad_dict[plota]) - spp_b = len(sad_dict[plotb]) - - result[row]['spp-a'] = spp_a - result[row]['spp-b'] = spp_b - - intersect = set(sad_dict[plota]).intersection(sad_dict[plotb]) - union = set(sad_dict[plota]).union(sad_dict[plotb]) - - # Fill in zero if denom is zero - if spp_a + spp_b == 0: - result[row]['sorensen'] = 0 - else: - result[row]['sorensen'] = (2*len(intersect)) / (spp_a+spp_b) - - if len(union) == 0: - result[row]['jaccard'] = 0 - else: - result[row]['jaccard'] = len(intersect) / len(union) - - # Increment row counter - row += 1 - - return result - - - def ied(self, criteria, normalize=True, exponent=0.75): - ''' - Calculates the individual energy distribution for the entire community - given the criteria - - Parameters - ---------- - criteria : dict - Dictionary must have contain a key with the value 'energy'. See - sad method for further requirements. - normalize : bool - If True, this distribution is normalized by dividing by the lowest - energy value within each element of criteria. If False, returns raw - energy values. - exponent : float - The exponent of the allometric scaling relationship if energy is - calculated from mass. - - Returns - ------- - result : list - List of tuples containing results, where first element is - dictionary of criteria for this calculation and second element is a - 1D ndarray containing the energy measurement of each individual in - the subset. The third element is the full (not unique) species - list for the given criteria. - - Notes - ----- - If count_col is None or is all ones, the entire energy column for each - subtable is returned. Else, the average energy per individual, - repeated for each individual is returned. This is equivalent to the psi - distribution from Harte (2011). 
- - - ''' - - spp_list, spp_col, count_col, engy_col, mass_col, combinations = \ - self.parse_criteria(criteria) - - if engy_col == None and mass_col == None: - raise ValueError("No energy or mass column given") - elif engy_col == None and mass_col != None: - mass = True - this_engy = mass_col - else: - mass = False - this_engy = engy_col - - result = [] - for comb in combinations: - - subtable = self.data_table.get_subtable(comb) - - # If all counts are not 1 - if count_col and (not np.all(subtable[count_col] == 1)): - - # Remove any zero counts - subtable = subtable[subtable[count_col] != 0] - # Convert counts to ints - temp_counts = subtable[count_col].astype(int) - - energy = np.repeat((subtable[this_engy] / - subtable[count_col]), temp_counts) - species = np.repeat(subtable[spp_col], temp_counts) - else: - energy = subtable[this_engy] - species = subtable[spp_col] - - # Convert mass to energy if mass is True - if mass: - energy = (energy ** exponent) - - # Normalizing energy - if normalize: - energy = energy / np.min(energy) - result.append((comb, energy, species)) - - return result - - def sed(self, criteria, normalize=True, exponent=0.75, clean=False): - ''' - Calculates the species-level energy distribution for each given species - in the community. - - Parameters - ---------- - criteria : dict - Dictionary must have contain a key with the value 'energy' or - 'mass'. See sad method for further requirements. - normalize : bool - If True, this distribution is normalized by dividing by the lowest - energy value within each element of criteria. If False, returns raw - energy values. - exponent : float - The exponent of the allometric scaling relationship if energy is - calculated from mass - clean : bool - If False, sed dictionary contains all species. If True, species - with no individuals are removed. This is useful when subsetting. - - Returns - ------- - result : list of tuples - Each tuple contains two objects. The first object is a dict with - the division specifications that generated the given species energy - distributions. The second object is a dict with a keyword - corresponding to each species in the spp_list. Each species - keyword looks up a np.array that contains the given species - energy distribution. - - Note - ---- - The theta distribution from Harte (2011) is a an sed. - - ''' - spp_list, spp_col, count_col, engy_col, mass_col, combinations = \ - self.parse_criteria(criteria) - - ied = self.ied(criteria, normalize=normalize, exponent=exponent) - - result = [] - for this_ied in ied: - this_criteria_sed = {} - - for spp in spp_list: - spp_ind = (spp == this_ied[2]) - this_spp_sed = this_ied[1][spp_ind] - - if clean: # If True, don't add empty species lists - if len(this_spp_sed) > 0: - this_criteria_sed[spp] = this_spp_sed - else: - this_criteria_sed[spp] = this_spp_sed - - result.append((this_ied[0], this_criteria_sed)) - - return result - - def ased(self, criteria, normalize=True, exponent=0.75): - ''' - Calculates the average species energy distribution for each given - species in a subset. - - Parameters - ---------- - criteria : dict - Dictionary must have contain a key with the value 'energy' or - 'mass'. See sad method for further requirements. - - Returns - ------- - result : list - List of tuples containing results, where the first element is a - dictionary of criteria for this calculation and second element is a - 1D ndarray of length species containing the average energy for each - species. 
The third element is 1D array listing identifiers for - species in the same order as they appear in the second element of - result. - - Notes - ----- - This is equivalent to the nu distribution from Harte 2011 - - ''' - - sed = self.sed(criteria, normalize=normalize, exponent=exponent) - - result = [] - for this_sed in sed: - spp_list = list(this_sed[1].viewkeys()) - spp_list.sort() - - # Take the mean energy for each species - nu = [np.mean(this_sed[1][spp]) for spp in spp_list if - len(this_sed[1][spp]) != 0] - # Truncated spp_list if necessary - spp_list = [spp for spp in spp_list if len(this_sed[1][spp]) != 0] - - result.append((this_sed[0], np.array(nu), np.array(spp_list))) - - return result - -def flatten_sad(sad): - ''' - Takes a list of tuples, like sad output, ignores keys, and converts values - into a 2D array with each value as a column (ie, species in rows, samples - in columns. - ''' - - combs = [cmb[0] for cmb in sad] - result = np.zeros((len(sad[0][1]), len(sad))) - - for i, tup in enumerate(sad): - result[:,i] = tup[1] - - return combs, result - - -def distance(pt1, pt2): - ''' Calculate Euclidean distance between two points ''' - return np.sqrt((pt1[0] - pt2[0]) ** 2 + (pt1[1] - pt2[1]) ** 2) - - -def decdeg_distance(pt1, pt2): - ''' Calculate Earth surface distance (in km) between decimal latlong points - using Haversine approximation. - - http://stackoverflow.com/questions/15736995/how-can-i-quickly-estimate-the-distance-between-two-latitude-longitude-points - ''' - lat1, lon1 = pt1 - lat2, lon2 = pt2 - - # Convert decimal degrees to radians - lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) - - # haversine formula - dlon = lon2 - lon1 - dlat = lat2 - lat1 - a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 - c = 2 * asin(sqrt(a)) - km = 6367 * c - - return km - -def divisible(dividend, precision, divisor, tol = 1e-9): - ''' - Check if dividend (here width or height of patch) is evenly divisible by - divisor (here a number of patch divs) while accounting for floating point - rounding issues. - ''' - if divisor == 0: - return False - if divisor > round(dividend / precision): - return False - - quot_raw = (dividend / precision) / divisor - quot_round = round(quot_raw) - diff = abs(quot_raw - quot_round) - - if diff < tol: - return True - else: - return False - - -def rnd(num): - ''' - Round num to number of decimal places in precision. Used to avoid issues - with floating points in the patch and subpatch width and height that make - subpatches not lie exactly on even divisions of patch. - ''' - return round(num, 6) - -def z(doubleS, halfS): - '''Calculates the z for a double S value and a half S value''' - - return np.log(doubleS / halfS) / (2 * np.log(2)) diff --git a/icon.icns b/icon.icns new file mode 100644 index 0000000..575f823 Binary files /dev/null and b/icon.icns differ diff --git a/macroeco/__init__.py b/macroeco/__init__.py new file mode 100644 index 0000000..645c0b7 --- /dev/null +++ b/macroeco/__init__.py @@ -0,0 +1,43 @@ +""" +=============================================== +Macroeco: Ecological pattern analysis in Python +=============================================== + +Macroeco provides a comprehensive set of functions for analyzing empirical +patterns in ecological data, predicting patterns using theory and models, and +comparing empirical patterns to theory. 
Many major macroecological patterns +can be analyzed using this package, including the species abundance +distribution, the species and endemics area relationships, several measures of +beta diversity, and many others. + +Macroeco can be used either as a scientific python Package or through a high- +level interface called MacroecoDesktop. Users new to Macroeco should begin by +reviewing the tutorials found below. Experienced Python programmers who wish to +use the ``macroeco`` Python package can ``pip install macroeco`` and refer to +the :ref:`using-macroeco` tutorial and the :ref:`reference` guide. + +.. toctree:: + :maxdepth: 2 + + tutorials + reference + about + +""" + +import sys as _sys + +__version__ = '0.3' + +import empirical +import models +import compare +import main +import misc + +def mecodesktop(): + if len(_sys.argv) > 1: + param_path = _sys.argv[1] + main.main(param_path) + else: + print "Macroeco Desktop must be called with path to parameters file" \ No newline at end of file diff --git a/macroeco/compare/__init__.py b/macroeco/compare/__init__.py new file mode 100644 index 0000000..2b34519 --- /dev/null +++ b/macroeco/compare/__init__.py @@ -0,0 +1,25 @@ +""" +================================= +Compare (:mod:`macroeco.compare`) +================================= + +This module contains functions that compare the goodness of fit of a +distribution/curve to data or the fit of two distributions/curves to each +other. + +.. autosummary:: + :toctree: generated/ + + nll + lrt + AIC + AIC_compare + sum_of_squares + r_squared + preston_bin + +""" + +from ._compare import (nll, lrt, AIC, AIC_compare, + sum_of_squares, full_model_nll, deviance, r_squared, + preston_bin, pueyo_bins) diff --git a/macroeco/compare/_compare.py b/macroeco/compare/_compare.py new file mode 100644 index 0000000..f519a43 --- /dev/null +++ b/macroeco/compare/_compare.py @@ -0,0 +1,402 @@ +from __future__ import division + +import numpy as np +import scipy as sp +import scipy.stats as stats +import pandas as pd + +from ..misc import doc_sub + +_data_doc = \ + """data : iterable + Data for analysis""" + +_model_doc = \ + """model : obj + Scipy frozen distribution object. When freezing, keyword args ``loc`` + and ``scale`` should only be included if they represent a parameter. + """ + +_obs_pred_doc = \ + """obs, pred : array-like objects + Observed and predicted data + """ + + +@doc_sub(_data_doc, _model_doc) +def nll(data, model): + """ + Negative log likelihood given data and a model + + Parameters + ---------- + {0} + {1} + + Returns + ------- + float + Negative log likelihood + + """ + + try: + log_lik_vals = model.logpmf(data) + except: + log_lik_vals = model.logpdf(data) + return -np.sum(log_lik_vals) + + +@doc_sub(_data_doc) +def lrt(data, model_null, model_alt, df=None): + """ + Compare two nested models using a likelihood ratio test + + Parameters + ---------- + {0} + model_null : obj + A frozen scipy distribution object representing the null model. + model_alt : scipy distribution object + A frozen scipy distribution object representing the alternative model. + df : int + The degrees of freedom for the lrt (optional). If none, df is + calculated as the difference between the number of parameters in the + null and alternative models. + + Returns + ------- + tuple + G^2 statistic, p-value + + Notes + ----- + Parameters of distribution objects must be given as keyword arguments. Ex. + ``norm = stats.norm(loc=0, scale=1)`` + + A p-value < alpha suggests signficant evidence for the alternative model. 
+ + The LRT only applies to nested models. The G^2 statistic and G-test rely on + the assumption that -2log(Likelihood_null / Likelihood_alt) is + approximately chi-squared distributed. This assumption breaks down for + small sample sizes. + + """ + + # Calculate G^2 statistic + ll_null = nll(data, model_null) * -1 + ll_alt = nll(data, model_alt) * -1 + test_stat = -2 * (ll_null - ll_alt) + + # Set df if necessary + if not df: + df = ( len(model_alt.args) + len(model_alt.kwds) + - len(model_null.args) - len(model_null.kwds) ) + + return test_stat, stats.chisqprob(test_stat, df) + + +@doc_sub(_data_doc, _model_doc) +def AIC(data, model, params=None, corrected=True): + """ + Akaike Information Criteria given data and a model + + Parameters + ---------- + {0} + {1} + params : int + Number of parameters in the model. If None, calculated from model + object. + corrected : bool + If True, calculates the small-sample size corrected AICC. Default True. + + Returns + ------- + float + AIC(C) value + + Notes + ----- + AICC should be used when the number of observations is < 40. + + References + ---------- + .. [#] + Burnham, K and Anderson, D. (2002) Model Selection and Multimodel + Inference: A Practical and Information-Theoretic Approach (p. 66). New + York City, USA: Springer. + + """ + n = len(data) # Number of observations + L = nll(data, model) + + if not params: + k = len(model.kwds) + len(model.args) + else: + k = params + + if corrected: + aic_value = 2 * k + 2 * L + (2 * k * (k + 1)) / (n - k - 1) + else: + aic_value = 2 * k + 2 * L + + return aic_value + + +def AIC_compare(aic_list): + """ + Calculates delta AIC and AIC weights from a list of AIC values + + Parameters + ---------- + aic_list : iterable + AIC values from a set of candidate models + + Returns + ------- + tuple + First element contains the delta AIC values, second element contains + the relative AIC weights. + + Notes + ----- + AIC weights can be interpreted as the probability that a given model is the + best model in the set. + + """ + + aic_values = np.array(aic_list) + minimum = np.min(aic_values) + delta = aic_values - minimum + values = np.exp(-delta / 2) + weights = values / np.sum(values) + + return delta, weights + + +def deviance(red_model_nll, full_model_nll): + """ + Calculates the deviance given the negative log-likelihood for a reduced + model and the negative log-likelihood for the full model. + + Parameters + ---------- + red_model_nll : float + Reduced model negative log-likelihood + full_model_nll : float + Full model negative log-likelihood + + Returns + ------- + : float + Deviance + + Notes + ----- + Deviance is 2 * (red_model_nll - full_model_nll) + + + """ + return 2 * (red_model_nll - full_model_nll) + + +@doc_sub(_data_doc) +def full_model_nll(data, model, **kwargs): + """ + Fits a full model to the data.
Every data point has a parameter + + Parameters + ----------- + {0} + model : Scipy distribution object + The model to be fit to the data + kwargs : keyword args + Additional keyword arguments for model fitting procedure + + Returns + ------- + : float + Negative log likelihood of full model given data + + Notes + ----- + Full model log likelihoods are used when calculating deviance + + """ + data = np.sort(data) + unique_data = np.unique(data) + + try: + mle_params = [model.fit_mle(np.array([dp]), **kwargs) for dp in unique_data] + except AttributeError: + try: + mle_params = [model.fit(np.array([dp])) for dp in unique_data] + except AttributeError: + raise AttributeError("%s has no attribute fit_mle or fit" % + str(model)) + + data_df = pd.DataFrame(unique_data, columns=["unq_data"]) + data_df['mle_params'] = mle_params + data_df.set_index("unq_data", inplace=True) + fitted_data = pd.DataFrame(np.arange(len(data)), index=data).join(data_df) + full_mle = fitted_data.mle_params + + try: + ll = [model(*full_mle.iloc[i]).logpmf(data[i]) for i in + xrange(len(data))] + except: + ll = [model(*full_mle.iloc[i]).logpdf(data[i]) for i in + xrange(len(data))] + + return -np.sum(ll) + + +def sum_of_squares(obs, pred): + """ + Sum of squares between observed and predicted data + + Parameters + ---------- + obs : iterable + Observed data + pred : iterable + Predicted data + + Returns + ------- + float + Sum of squares + + Notes + ----- + The length of observed and predicted data must match. + + """ + + return np.sum((np.array(obs) - np.array(pred)) ** 2) + + +def r_squared(obs, pred, one_to_one=False, log_trans=False): + """ + R^2 value for a regression of observed and predicted data + + Parameters + ---------- + obs : iterable + Observed data + pred : iterable + Predicted data + one_to_one : bool + If True, calculates the R^2 based on the one-to-one line (see [#]_), + and if False, calculates the standard R^2 based on a linear regression. + Default False. + log_trans : bool + If True, log transforms obs and pred before R^2 calculation. + + Returns + ------- + float + R^2 value + + Notes + ----- + Using the traditional R^2 to compare the fit of observed and predicted + values may be misleading as the relationship may not be one-to-one but the + R^2 value may be quite high. The one-to-one option alleviates this problem. + + References + ---------- + .. [#] + White, E., Thibault, K., & Xiao, X. (2012). Characterizing the species + abundance distributions across taxa and ecosystems using a simple + maximum entropy model. Ecology, 93(8), 1772-8 + + """ + + if log_trans: + obs = np.log(obs) + pred = np.log(pred) + + if one_to_one: + r_sq = 1 - (sum_of_squares(obs, pred) / + sum_of_squares(obs, np.mean(obs))) + else: + b0, b1, r, p_value, se = stats.linregress(obs, pred) + r_sq = r ** 2 + + return r_sq + +def preston_bin(data, max_num): + """ + Bins data on base 2 using Preston's method + + Parameters + ---------- + data : array-like + Data to be binned + max_num : float + The maximum upper value of the data + + Returns + ------- + tuple + (binned_data, bin_edges) + + Notes + ----- + Uses Preston's method of binning, which has exclusive lower boundaries and + inclusive upper boundaries. Densities are not split between bins. + + References + ---------- + .. [#] + Preston, F. (1962). The canonical distribution of commonness and rarity. 
+ Ecology, 43, 185-215 + + """ + + log_ub = np.ceil(np.log2(max_num)) + + # Make an exclusive lower bound in keeping with Preston + if log_ub == 0: + boundaries = np.array([0, 1]) + elif log_ub == 1: + boundaries = np.arange(1, 4) + else: + boundaries = 2 ** np.arange(0, log_ub + 1) + boundaries = np.insert(boundaries, 2, 3) + boundaries[3:] = boundaries[3:] + 1 + + hist_data = np.histogram(data, bins=boundaries) + return hist_data + + +def pueyo_bins(data): + """ + Binning method based on Pueyo (2006) + + Parameters + ---------- + data : array-like data + Data to be binned + + Returns + ------- + : tuple of arrays + binned data, empirical probability density + + Notes + ----- + Bins the data in into bins of lenth 2**i, i=0, 1, 2 ... + The empirical probability densities will sum to 1 if multiplied by the + respective 2**i. + + """ + log_ub = np.ceil(np.log2(np.max(data))) + bins = 2**np.arange(log_ub + 1) + binned_data = np.histogram(data, bins=bins)[0] + epdf = (1 / bins[:-1]) * binned_data / len(data) + return binned_data, epdf + + diff --git a/macroeco/compare/test_compare.py b/macroeco/compare/test_compare.py new file mode 100644 index 0000000..3d7e023 --- /dev/null +++ b/macroeco/compare/test_compare.py @@ -0,0 +1,175 @@ +from __future__ import division + +from numpy.testing import (TestCase, assert_equal, assert_array_equal, + assert_almost_equal, assert_array_almost_equal, + assert_allclose, assert_, assert_raises) + +from macroeco.compare import * +import numpy as np +import scipy.stats as stats +import macroeco.models as mod + + +class TestNLL(TestCase): + + def test_nll(self): + # R: sum(dnorm(c(1,2,3,4,5), log=TRUE)) + R_res = 32.09469 + data = np.array([1, 2, 3, 4, 5]) + model = stats.norm(loc=0, scale=1) + lglk = nll(data, model) + assert_almost_equal(R_res, lglk, decimal=5) + + +# TODO: Test LRT + + +class TestAIC(TestCase): + + def test_aic_basic(self): + + model = stats.norm(loc=0, scale=1) + data = np.arange(1, 9) + aic1 = AIC(data, model, corrected=False) + expected = 222.703016531 # Calculated by hand + assert_almost_equal(aic1, expected, decimal=6) + + model = stats.gamma(a=2) + data = [1, 1, 1, 2, 4, 5, 7, 12] + aic1 = AIC(data, model, corrected=False) + expected = 51.760607494 # Calculated by hand + assert_almost_equal(aic1, expected, decimal=6) + + model = stats.gamma(a=2, loc=0) + aic1 = AIC(data, model, corrected=False) + expected = 53.760607494 # Calculated by hand + assert_almost_equal(aic1, expected, decimal=6) + + def test_aic_given_params(self): + + model = stats.norm() + data = np.arange(1, 9) + aic1 = AIC(data, model, corrected=False, params=2) + # statsmodel.tools.eval_measures.aic: aic(L, 8, 2) + expected = 222.703016531 + assert_almost_equal(aic1, expected) + + model = stats.gamma(2) + data = [1, 1, 1, 2, 4, 5, 7, 12] + aic1 = AIC(data, model, corrected=False, params=1) + # statsmodel.tools.eval_measures.aic: aic(L, 8, 1) + expected = 51.760607494 + assert_almost_equal(aic1, expected, decimal=6) + + model = stats.gamma(2, 0) + aic1 = AIC(data, model, corrected=False, params=2) + # statsmodel.tools.eval_measures.aic: aic(L, 8, 2) + expected = 53.760607494 + assert_almost_equal(aic1, expected, decimal=6) + + def test_aicc(self): + + model = stats.norm() + data = np.arange(1, 9) + aic1 = AIC(data, model, corrected=True, params=2) + expected = 225.10302 # Calculated by hand + assert_almost_equal(expected, aic1, decimal=5) + + +class TestFullModelNLL(TestCase): + + def test_correct_value_for_continuous_models(self): + + # Test that the full model returns what 
we expect + data = np.array([3, 4, 5]) + + models = [mod.lognorm] + for model in models: + + params = [model.fit_mle(np.array([td])) for td in data] + values = [model(*params[i]).logpdf(data[i]) for i in + xrange(len(data))] + pred_nll = -np.sum(values) + + test_nll = full_model_nll(data, model) + + assert_equal(pred_nll, test_nll) + + def test_correct_value_for_discrete_models(self): + + # Test that the full model returns what we expect + data = np.array([3, 4, 5]) + + models = [mod.nbinom] + for model in models: + + params = [model.fit_mle(np.array([td])) for td in data] + values = [model(*params[i]).logpmf(data[i]) for i in + xrange(len(data))] + pred_nll = -np.sum(values) + + test_nll = full_model_nll(data, model) + + assert_equal(pred_nll, test_nll) + + +class TestAICCompare(TestCase): + + def test_aic_delta_and_weights(self): + + data = [1, 1, 1, 2, 3, 4, 7, 23, 78] + models = [stats.norm(scale=100), stats.norm(scale=99)] + aic_vals = [AIC(data, tm) for tm in models] + daic, aicw = AIC_compare(aic_vals) + + pred = np.array([0.47909787, 0.52090213]) # Calculated by hand + assert_array_almost_equal(aicw, pred) + assert_array_almost_equal(daic, [daic[0]-daic[1], 0]) + + +class TestRsquared(TestCase): + + def test_r_squared_repeated_data(self): + + # Identical data should lead to an R^2 of 1 + test_data = np.random.randint(5, 100, 100) + rsq = r_squared(test_data, test_data) + assert_equal(rsq, 1) + + # TODO: Test known R2 for regression and one-to-one + + +class TestPrestonBin(TestCase): + + def test_bin_functionality(self): + + # Test against R's vegan prestonfit: prestonfit(data, tiesplit=FALSE) + # Note that vegan drops the bins with 0 values + + data = np.array([1, 1, 1, 1, 2, 2, 4, 4, 8, 16, 17.1, 89]) + vegan = np.array([4, 2, 2, 1, 1, 1, 0, 1], dtype=np.float) + test_res = preston_bin(data, max(data))[0] + assert_array_equal(test_res, vegan) + + data = np.array([1, 1, 1, 1, 4, 5, 6, 7, 12, 34, 56]) + vegan = np.array([4, 0, 1, 3, 1, 0, 2], dtype=np.float) + test_res = preston_bin(data, max(data))[0] + assert_array_equal(test_res, vegan) + + def test_bin_data_boundary(self): + # Test boundary condition + data = np.array([1, 2]) + vegan = np.array([1, 1], dtype=np.float) + test_res = preston_bin(data, max(data))[0] + assert_array_equal(test_res, vegan) + + data = np.array([1, 1, 1]) + vegan = np.array([3], dtype=np.float) + test_res = preston_bin(data, max(data))[0] + assert_array_equal(test_res, vegan) + + data = np.array([1, 2, 3]) + vegan = np.array([1, 1, 1], dtype=np.float) + test_res = preston_bin(data, max(data))[0] + assert_array_equal(test_res, vegan) + diff --git a/macroeco/empirical/__init__.py b/macroeco/empirical/__init__.py new file mode 100644 index 0000000..5444e8d --- /dev/null +++ b/macroeco/empirical/__init__.py @@ -0,0 +1,48 @@ +""" +===================================== +Empirical (:mod:`macroeco.empirical`) +===================================== + +This module contains functions used in the analysis of ecological patterns in +empirical data sets. + +Patch +===== + +Patch is the core class of the empirical module. It reads and validates +metadata and data table files, and patch objects are the first argument to all +of the empirical metric functions in this module. + +.. autosummary:: + :toctree: generated/ + + Patch + +Metrics +======= + +Each of these functions calculates an empirical ecological metric for a given +patch object. + +.. autosummary:: + :toctree: generated/ + + sad + ssad + sar + comm_grid + o_ring + +Other +===== + +.. 
autosummary:: + :toctree: generated/ + + empirical_cdf + +""" + +from ._empirical import (Patch, + sad, ssad, sar, comm_grid, o_ring, + empirical_cdf) diff --git a/macroeco/empirical/_empirical.py b/macroeco/empirical/_empirical.py new file mode 100644 index 0000000..b73d836 --- /dev/null +++ b/macroeco/empirical/_empirical.py @@ -0,0 +1,1183 @@ +from __future__ import division +import os +import re +import copy +from configparser import ConfigParser +import itertools +from copy import deepcopy +from twiggy import log +log = log.name('emp ') + +import numpy as np +import pandas as pd +import scipy.spatial.distance as dist +try: + import shapely.geometry as geo +except: + pass +# TODO: Make shapely import work with pyinstaller + +from ..misc import doc_sub, log_start_end + +metric_params = \ + """patch : Patch obj + Patch object containing data for analysis + cols : str + Indicates which column names in patch data table are associated with + species identifiers, counts, energy, and mass. See Notes. + splits : str + If multiple analyses for subsets of patch data table are desired, + specifies how columns should be split. See Notes.""" + +metric_return = \ + """list + List of tuples containing results, where the first element of each + tuple is a string indicating the split values used for that result and + second element is a dataframe giving the result.""" + +cols_note = \ + """The parameter ``cols`` is a string describing which columns in the data + table should be used for the "special columns" in analysis. The five + possible special columns are + + - spp_col - Unique species identifiers + - count_col - Number of individuals at a location + - x_col - x coordinate of location + - y_col - y coordinate of location + - energy_col - Energetic requirements of individual(s) at a location + + For example, setting ``cols`` to ``spp_col: spp; count_col: number`` will + use the column named "spp" in the data table to represent the unique + species identifiers, and the column "number" in the data table to represent + the count of individuals at a point. + + Different special columns are required for different analyses. count_col is + used when multiple individuals of a species may be found at a single + recorded location, as is the case in gridded censuses where all individuals + in a quadrat are "assigned" to a single point. If count_col is not + specified, each record in the data table will be presumed to represent a + single individual (i.e., a count of 1). + + Note that the value of spp_col may be set to a column in the data table + giving the genus, family, functional group, etc., which allows for analysis + of this metric by those groups. """ + +splits_note = \ + """The parameter ``splits`` is a semicolon-separated string in the form of + "column: value", where column is a name of a column in the patch data + table and value is either (a) an integer giving the number of + equally-spaced divisions of a column, or (b) the special keyword + 'split', which evaluates all unique levels of a column. + + For example, presume a data table has columns for x and y spatial + coordinates and a column for year, of which there are three. The string + "x:2; y:2; year:split" will perform the analysis separately for each of + four subplots of the patch (created by dividing the x and y coordinates + each into two equally sized divisions) within each of the three years, + for a total of 12 separate analyses.
Note that if you pass in the x + split you MUST also pass in a y split (even if it is just "y:1") or vice + versa. Otherwise, the computed areas will be incorrect.""" + +division_note = \ + """The parameter divisions describes how to successively divide the patch + along the x_col and y_col dimensions. For + example, the string '1,2; 2,2; 2,4' will produce an output table with three + rows, giving the result across two subplots when the patch is split + along y_col, across four subplots when the patch is split into a 2x2 grid, + and across eight subplots when the patch is split into 2 parts along x_col + and 4 parts along y_col.""" + + +class Patch(object): + """ + An object representing an empirical census + + Parameters + ---------- + metadata_path : str + Path to metadata file describing census data + subset : str + String describing subset of data to use for Patch analysis. See Notes. + + Attributes + ---------- + table : dataframe + Table of census data recorded in patch + meta : ConfigParser obj + Dict-like metadata, loaded from metadata_path and processed by subset + subset : str + Subset string passed as parameter + + Notes + ----- + The table file described by the metadata must contain column names + consisting only of letters and numbers, with no spaces or other special + characters. + + The parameter subset takes different forms depending on whether the data + file described by the metadata is a csv or a sql/db file. + + For csv data files, subset is a semicolon-separated string describing + subset operations. For example, the string "year==2005; x>20; x<40; + spp=='cabr'" loads a data table containing only records for which the year + is 2005, x values are between 20 and 40, and species is 'cabr'. Note that + for categorical columns, the value of the column must be enclosed in single + quotes. + + For sql/db files, subset is a SQL query string that selects the data from + the data file. + + The meta attribute of this object is processed to reflect the value of + subset. If columns with a min and a max are included in the subset string, + the min and max values for that column in meta will be updated to reflect + the specified limits. + + An empty Patch object can be created with a metadata_path of None. 
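+
+    Examples
+    --------
+    A hypothetical instantiation using the demo ANBO data, assuming the
+    metadata file is in the current working directory::
+
+        pat = Patch('ANBO.txt', subset="row>=2; column>=2")
+        pat.table.head()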
+ + """ + + def __init__(self, metadata_path, subset=''): + + if not metadata_path: # Allow for creation of empty patch + self.meta = None + self.subset = '' + self.table = None + else: + self.meta = ConfigParser() + self.meta.read(os.path.expanduser(metadata_path)) + self.subset = subset + self.table = self._load_table(metadata_path, + self.meta['Description']['datapath']) + + self.incremented = False + + def _load_table(self, metadata_path, data_path): + """ + Load data table, taking subset if needed + + Parameters + ---------- + metadata_path : str + Path to metadata file + data_path : str + Path to data file, absolute or relative to metadata file + + Returns + ------- + dataframe + Table for analysis + + """ + + metadata_dir = os.path.dirname(os.path.expanduser(metadata_path)) + data_path = os.path.normpath(os.path.join(metadata_dir, data_path)) + + extension = data_path.split('.')[-1] + + if extension == 'csv': + full_table = pd.read_csv(data_path, index_col=False) + table = _subset_table(full_table, self.subset) + self.meta, _ = _subset_meta(self.meta, self.subset) + elif extension in ['db', 'sql']: + + # TODO: deal with incrementing in DB table + table = self._get_db_table(data_path, extension) + else: + raise TypeError('Cannot process file of type %s' % extension) + + return table + + def _get_db_table(self, data_path, extension): + """ + Query a database and return query result as a recarray + + Parameters + ---------- + data_path : str + Path to the database file + extension : str + Type of database, either sql or db + + Returns + ------- + table : recarray + The database query as a recarray + + """ + # TODO: This is probably broken + raise NotImplementedError, "SQL and db file formats not yet supported" + + # Load table + if extension == 'sql': + con = lite.connect(':memory:') + con.row_factory = lite.Row + cur = con.cursor() + + with open(data_path, 'r') as f: + sql = f.read() + + cur.executescript(sql) + + else: + con = lite.connect(data_path) + con.row_factory = lite.Row + cur = con.cursor() + + cur.execute(self.subset) + + # Check that table is not empty + db_info = cur.fetchall() + try: + col_names = db_info[0].keys() + except IndexError: + raise lite.OperationalError("Query %s to database %s is empty" % + (query_str, data_path)) + + # Convert objects to tuples + converted_info = [tuple(x) for x in db_info] + + # NOTE: Using default value for Unicode: Seems better than checking + # lengths. Should we keep the type as unicode? + dtypes=[type(x) if type(x) != unicode else 'S150' for x in db_info[0]] + + table = np.array(converted_info, dtype=zip(col_names, dtypes)) + con.commit() + con.close() + + # Return a recarray for consistency + # TODO: This should now be a pd.dataframe + return table.view(np.recarray) + + +def _subset_table(full_table, subset): + """ + Return subtable matching all conditions in subset + + Parameters + ---------- + full_table : dataframe + Entire data table + subset : str + String describing subset of data to use for analysis + + Returns + ------- + dataframe + Subtable with records from table meeting requirements in subset + + """ + if not subset: + return full_table + + # TODO: Figure out syntax for logical or + conditions = subset.replace(' ','').split(';') + + valid = np.ones(len(full_table), dtype=bool) + for condition in conditions: + this_valid = eval('full_table.' 
+ condition) + valid = np.logical_and(valid, this_valid) + + return full_table[valid] + +def _subset_meta(full_meta, subset, incremented=False): + """ + Return metadata reflecting all conditions in subset + + Parameters + ---------- + full_meta : ConfigParser obj + Metadata object + subset : str + String describing subset of data to use for analysis + incremented : bool + If True, the metadata has already been incremented + + Returns + ------- + Configparser object or dict + Updated version of full_meta accounting for subset string + + """ + if not subset: + return full_meta, False + + meta = {} # Make deepcopy of entire meta (all section dicts in meta dict) + for key, val in full_meta.iteritems(): + meta[key] = copy.deepcopy(dict(val)) + + conditions = subset.replace(' ','').split(';') + + inc = False + for condition in conditions: + condition_list = re.split('[<>=]', condition) + col = condition_list[0] + val = condition_list[-1] + + try: + col_step = meta[col]['step'] + except: # If there's no metadata for this col, do nothing + continue + + operator = re.sub('[^<>=]', '', condition) + + if operator == '==': + meta[col]['min'] = val + meta[col]['max'] = val + elif operator == '>=': + meta[col]['min'] = val + elif operator == '>': + if incremented: + meta[col]['min'] = val + else: + meta[col]['min'] = str(eval(val) + eval(col_step)) + inc = True + elif operator == '<=': + meta[col]['max'] = val + elif operator == '<': + if incremented: + meta[col]['max'] = val + else: + meta[col]['max'] = str(eval(val) - eval(col_step)) + inc = True + else: + raise ValueError, "Subset %s not valid" % condition + + return meta, inc + + +@log_start_end +@doc_sub(metric_params, metric_return, cols_note, splits_note) +def sad(patch, cols, splits, clean=True): + """ + Calculates an empirical species abundance distribution + + Parameters + ---------- + {0} + clean : bool + If True, all species with zero abundance are removed from SAD results. + Default False. + + Returns + ------- + {1} Result has two columns: spp (species identifier) and y (individuals of + that species). + + Notes + ----- + {2} + + {3} + + """ + + (spp_col, count_col), patch = \ + _get_cols(['spp_col', 'count_col'], cols, patch) + + full_spp_list = np.unique(patch.table[spp_col]) + + # Loop through each split + result_list = [] + for substring, subpatch in _yield_subpatches(patch, splits): + + # Get abundance for each species + sad_list = [] + for spp in full_spp_list: + this_spp = (subpatch.table[spp_col] == spp) + count = np.sum(subpatch.table[count_col][this_spp]) + sad_list.append(count) + + # Create dataframe of spp names and abundances + subdf = pd.DataFrame({'spp': full_spp_list, 'y': sad_list}) + + # Remove zero abundance rows if requested + if clean: + subdf = subdf[subdf['y'] > 0] + + # Append subset result + result_list.append((substring, subdf)) + + # Return all results + return result_list + + +@log_start_end +@doc_sub(metric_params, metric_return, cols_note, splits_note) +def ssad(patch, cols, splits): + """ + Calculates an empirical intra-specific spatial abundance distribution + + Parameters + ---------- + {0} + + Returns + ------- + {1} Result has one column giving the individuals of species in each + subplot. 
+ + Notes + ----- + {2} + + {3} + + """ + + # Get and check SAD + sad_results = sad(patch, cols, splits, clean=False) + + # Create dataframe with col for spp name and numbered col for each split + for i, sad_result in enumerate(sad_results): + if i == 0: # For first result, create dataframe + fulldf = sad_result[1] + fulldf.columns = ['spp', '0'] # Renames y col to 0 + else: # For other results, append col to dataframe, named by num + fulldf[str(i)] = sad_result[1]['y'] + + # Get each spp SSAD (row of fulldf) and append as tuple in result_list + result_list = [] + for _, row in fulldf.iterrows(): + row_values_array = np.array(row[1:], dtype=float) + result_list.append((row[0], pd.DataFrame({'y': row_values_array}))) + + # Return all results + return result_list + + +@log_start_end +@doc_sub(metric_params, metric_return, cols_note, splits_note, division_note) +def sar(patch, cols, splits, divs, ear=False): + """ + Calculates an empirical species area or endemics area relationship + + Parameters + ---------- + {0} + divs : str + Description of how to divide x_col and y_col. See notes. + ear : bool + If True, calculates an endemics area relationship + + Returns + ------- + {1} Result has 5 columns; div, x, and y; that give the ID for the + division given as an argument, fractional area, and the mean species + richness at that division. + + Notes + ----- + {2} + + For the SAR and EAR, cols must also contain x_col and y_col, giving the x + and y dimensions along which to grid the patch. + + {3} + + {4} + + """ + + def sar_y_func(spatial_table, all_spp): + return np.mean(spatial_table['n_spp']) + + def ear_y_func(spatial_table, all_spp): + endemic_counter = 0 + for spp in all_spp: + spp_in_cell = [spp in x for x in spatial_table['spp_set']] + spp_n_cells = np.sum(spp_in_cell) + if spp_n_cells == 1: # If a spp is in only 1 cell, endemic + endemic_counter += 1 + n_cells = len(spatial_table) + return endemic_counter / n_cells # mean endemics / cell + + if ear: + y_func = ear_y_func + else: + y_func = sar_y_func + + return _sar_ear_inner(patch, cols, splits, divs, y_func) + + +def _sar_ear_inner(patch, cols, splits, divs, y_func): + """ + y_func is function calculating the mean number of species or endemics, + respectively, for the SAR or EAR + """ + + (spp_col, count_col, x_col, y_col), patch = \ + _get_cols(['spp_col', 'count_col', 'x_col', 'y_col'], cols, patch) + + # Loop through each split + result_list = [] + for substring, subpatch in _yield_subpatches(patch, splits): + + # Get A0 + A0 = _patch_area(subpatch, x_col, y_col) + + # Loop through all divisions within this split + all_spp = np.unique(subpatch.table[spp_col]) + subresultx = [] + subresulty = [] + subresultnspp = [] + subresultnindivids = [] + subdivlist = _split_divs(divs) + for subdiv in subdivlist: + spatial_table = _yield_spatial_table(subpatch, subdiv, spp_col, + count_col, x_col, y_col) + subresulty.append(y_func(spatial_table, all_spp)) + subresultx.append(A0 / eval(subdiv.replace(',', '*'))) + subresultnspp.append(np.mean(spatial_table['n_spp'])) + subresultnindivids.append(np.mean(spatial_table['n_individs'])) + + # Append subset result + subresult = pd.DataFrame({'div': subdivlist, 'x': subresultx, + 'y': subresulty, 'n_spp': subresultnspp, + 'n_individs': subresultnindivids}) + result_list.append((substring, subresult)) + + return result_list + + +def _split_divs(divs): + if type(divs) == type((1,1)): # Tuple (occurs when main evals single div) + subdivlist = [str(divs)[1:-1]] + else: # String + subdivlist = 
divs.split(';') + return subdivlist + + +@log_start_end +@doc_sub(metric_params, metric_return, cols_note, splits_note) +def comm_grid(patch, cols, splits, divs, metric='Sorensen'): + """ + Calculates commonality as a function of distance for a gridded patch + + Parameters + ---------- + {0} + divs : str + Description of how to divide x_col and y_col. Unlike SAR and EAR, only + one division can be given at a time. See notes. + metric : str + One of Sorensen or Jaccard, giving the metric to use for commonality + calculation + + Returns + ------- + {1} Result has three columns, pair, x, and y, that give the locations of + the pair of patches for which commonality is calculated, the distance + between those cells, and the Sorensen or Jaccard result. + + Notes + ----- + {2} + + For gridded commonality, cols must also contain x_col and y_col, giving the + x and y dimensions along which to grid the patch. + + {3} + + """ + + (spp_col, count_col, x_col, y_col), patch = \ + _get_cols(['spp_col', 'count_col', 'x_col', 'y_col'], cols, patch) + + # Loop through each split + result_list = [] + for substring, subpatch in _yield_subpatches(patch, splits): + + # Get spatial table and break out columns + spatial_table = _yield_spatial_table(subpatch, divs, spp_col, + count_col, x_col, y_col) + spp_set = spatial_table['spp_set'] + cell_loc = spatial_table['cell_loc'] + n_spp = spatial_table['n_spp'] + + # Get all possible pairwise combinations of cells + pair_list = [] + dist_list = [] + comm_list = [] + for i in range(len(spatial_table)): + for j in range(i+1, len(spatial_table)): + + iloc = np.round(cell_loc[i], 6) + jloc = np.round(cell_loc[j], 6) + pair_list.append('('+str(iloc[0])+' '+str(iloc[1])+') - '+ + '('+str(jloc[0])+' '+str(jloc[1])+')') + + dist_list.append(_distance(cell_loc[i], cell_loc[j])) + + ij_intersect = spp_set[i] & spp_set[j] + if metric.lower() == 'sorensen': + comm = 2*len(ij_intersect) / (n_spp[i] + n_spp[j]) + elif metric.lower() == 'jaccard': + comm = len(ij_intersect) / len(spp_set[i] | spp_set[j]) + else: + raise ValueError, ("Only Sorensen and Jaccard metrics are " + "available for gridded commonality") + comm_list.append(comm) + + # Append subset result + subresult = pd.DataFrame({'pair': pair_list, 'x': dist_list, + 'y': comm_list}) + result_list.append((substring, subresult)) + + # Return all results + return result_list + + +def _yield_spatial_table(patch, div, spp_col, count_col, x_col, y_col): + """ + Calculates an empirical spatial table + + Yields + ------- + DataFrame + Spatial table for each division. See Notes. + + Notes + ----- + The spatial table is the precursor to the SAR, EAR, and grid-based + commonality metrics. Each row in the table corresponds to a cell created by + a given division. Columns are cell_loc (within the grid defined by the + division), spp_set, n_spp, and n_individs. 
+ + """ + + div_split_list = div.replace(';','').split(',') + div_split = (x_col + ':' + div_split_list[0] + ';' + + y_col + ':' + div_split_list[1]) + + # Get cell_locs + # Requires _parse_splits and _product functions to go y inside of x + x_starts, x_ends = _col_starts_ends(patch, x_col, div_split_list[0]) + x_offset = (x_ends[0] - x_starts[0]) / 2 + x_locs = x_starts + x_offset + + y_starts, y_ends = _col_starts_ends(patch, y_col, div_split_list[1]) + y_offset = (y_ends[0] - y_starts[0]) / 2 + y_locs = y_starts + y_offset + + cell_locs = _product(x_locs, y_locs) + + # Get spp set and count for all cells + n_spp_list = [] # Number of species in cell + n_individs_list = [] + spp_set_list = [] # Set object giving unique species IDs in cell + for cellstring, cellpatch in _yield_subpatches(patch,div_split,name='div'): + spp_set = set(np.unique(cellpatch.table[spp_col])) + spp_set_list.append(spp_set) + n_spp_list.append(len(spp_set)) + n_individs_list.append(np.sum(cellpatch.table[count_col])) + + # Create and return dataframe + df = pd.DataFrame({'cell_loc': cell_locs, 'spp_set': spp_set_list, + 'n_spp': n_spp_list, 'n_individs': n_individs_list}) + + return df + + +@log_start_end +@doc_sub(metric_params, metric_return, cols_note, splits_note) +def o_ring(patch, cols, splits, spp, bin_edges, density=True, full=False): + """ + Calculates univariate O-ring for a species + + Parameters + ---------- + {0} + bin_edges : iterable + List of edges of distance classes to bin histogram of distances + spp : str + String corresponding to focal species code + density : bool + If True, return densities (counts divided by area of torus defined + by bin edges) instead of counts. Default True. + full : bool + If True, return a separate column giving density at distance x for + every individual, rather than mean density. Default False. + + Returns + ------- + {1} Result has two columns, x and y, that give the distance to the center + of a torus and the number or density of individuals found in that torus. + + Notes + ----- + If density is False, counts are raw counts, non-edge corrected, within + rings. + + Pairwise distances are directional, giving n(n-1) total distances for a + species with n individuals, as edge correction is inherently directional. + + Bins include the lower edge and exclude the upper edge, except for the + final bin which includes both the lower and upper edge. Floating point + arithmetic may cause points located "exactly" on edges to be allocated + contrary to this rule, however. + + If there are no records for a species, result table will be a dataframe + with no records. If there are records but a species has only one + individual, dataframe will have zero count at all torus areas. + + When using density, the maximum distance used for edge correction, given by + the mean of the last two bin_edge values, should ideally be set to no + greater than one half the diagonal distance across the plot. This ensures + that it is not possible for an entire edge correction buffer to be outside + of the plot. + + {2} + + For the 0-ring analysis, cols must also contain x_col and y_col, giving the + x and y dimensions along which to analyze spatial pattern. 
+ + {3} + + """ + + try: + geo.box(0, 0, 1, 1) + except: + raise ImportError, "O-ring analysis requires shapely package" + + (spp_col, count_col, x_col, y_col), patch = \ + _get_cols(['spp_col', 'count_col', 'x_col', 'y_col'], cols, patch) + + # Loop through each split + result_list = [] + for substring, subpatch in _yield_subpatches(patch, splits): + + # Get table for just this species + spp_table = subpatch.table[subpatch.table[spp_col] == spp] + + # If spp not present, continue + if (len(spp_table) == 0): + result_list.append((substring, pd.DataFrame(columns=['x','y']))) + continue + + # Set up plot geometry + plot_poly, radii, torus_areas = \ + _get_plot_geometry(subpatch, bin_edges, x_col, y_col) + + # Get lists of all points and counts in spp_table + x = spp_table[x_col] + y = spp_table[y_col] + points = zip(x,y) + counts = list(spp_table[count_col]) + + # Arrays to hold summed areas and distance histograms for all points + + if full: + hists = [] # Vectors of len(radii) appended for each point + areas = [] + else: + hists = np.zeros(len(radii)) + areas = np.zeros(len(radii)) + + # Go through each point and associated count + for i, (point, count) in enumerate(zip(points, counts)): + + # Create list of all other points and counts except this + other_points = points[0:i] + points[i+1:] + other_counts = counts[0:i] + counts[i+1:] + + # Get dist from this point to all other points + if other_points: + other_dists = dist.cdist(np.array([point]), + np.array(other_points)) + else: + other_dists = np.array(()) + + # Repeat other point distances to acccount for their counts + other_dists = np.repeat(other_dists, other_counts) + + # Repeat entire other_dist array to account for count here + other_dists = np.tile(other_dists, count) + + # Add 0's for count at this point to account for count here + if count > 1: + other_dists = np.concatenate((other_dists, + np.zeros(count*(count-1)))) + + # Calculate histogram of distances to other points + hist, _ = np.histogram(other_dists, bin_edges) + + # Convert histogram to density if desired + corr_factor = np.ones(len(radii)) # Frac length in plot + for j, r in enumerate(radii): + circ = geo.Point(*point).buffer(r, resolution=64) + outside_len = circ.boundary.difference(plot_poly).length + corr_factor[j] = ((circ.boundary.length - outside_len) / + circ.boundary.length) + + # Add hist and corrected area for this point to running totals + if full: + hists.append(hist) + areas.append(torus_areas * corr_factor * count) + else: + hists += hist + areas += torus_areas * corr_factor * count + + # If density, divide summed torus counts by summed areas + if density: + hists = np.array(hists) / np.array(areas) + + # Append subset result + subresult = pd.DataFrame({'x': radii}) + if full: + for i in range(len(hists)): + subresult[i] = hists[i] + else: + subresult['y'] = hists + result_list.append((substring, subresult)) + + # Return all results + return result_list + + +def _get_plot_geometry(subpatch, bin_edges, x_col, y_col): + + # Plot polygon + xmin = eval(subpatch.meta[x_col]['min']) + xmax = eval(subpatch.meta[x_col]['max']) + ymin = eval(subpatch.meta[y_col]['min']) + ymax = eval(subpatch.meta[y_col]['max']) + plot_poly = geo.box(xmin, ymin, xmax, ymax) + + # Radii of toruses + bin_edges = np.array(bin_edges) + radii = (bin_edges[:-1] + bin_edges[1:]) / 2 + + # Areas of all toruses + torus_areas = [] + for i in range(len(bin_edges) - 1): + torus_areas.append(np.pi * (bin_edges[i+1]**2 - bin_edges[i]**2)) + + return plot_poly, radii, np.array(torus_areas) + 
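The edge correction inside o_ring is easiest to see in isolation. The following sketch is illustrative only and not part of this commit; the 1 x 1 plot, the focal point, and the bin edges are invented, and it assumes the optional shapely dependency is installed. It reproduces the correction factor for a single point: the fraction of each ring's circumference that lies inside the plot polygon, which scales the torus areas used as the density denominator.

import numpy as np
import shapely.geometry as geo

# Hypothetical 1 x 1 plot with a focal point near its left edge
plot_poly = geo.box(0, 0, 1, 1)
point = (0.05, 0.5)

bin_edges = np.array([0, 0.1, 0.2])
radii = (bin_edges[:-1] + bin_edges[1:]) / 2                # ring mid-radii
torus_areas = np.pi * (bin_edges[1:]**2 - bin_edges[:-1]**2)

corr_factor = np.ones(len(radii))
for j, r in enumerate(radii):
    circ = geo.Point(*point).buffer(r, resolution=64)       # circle of radius r
    outside_len = circ.boundary.difference(plot_poly).length
    corr_factor[j] = (circ.boundary.length - outside_len) / circ.boundary.length

# Edge-corrected torus areas, analogous to the density denominator in o_ring
print(torus_areas * corr_factor)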
+ + +def comm_sep(self, plot_locs, criteria, loc_unit=None): + ''' + Calculates commonality (Sorensen and Jaccard) between pairs of plots. + + Parameters + ---------- + plot_locs : dict + Dictionary with keys equal to each plot name, which must be + represented by a column in the data table, and values equal to a + tuple of the x and y coordinate of each plot + criteria : dict + See docstring for Patch.sad. + loc_unit : str + Unit of plot locations. Special cases include 'decdeg' (decimal + degrees), returns result in km. Otherwise ignored. + + Returns + ------- + result: structured array + Returns a structured array with fields plot-a and plot-b (names of + two plots), dist (distance between plots), and sorensen and jaccard + (similarity indices). Has row for each unique pair of plots. + ''' + + # Set up sad_dict with key=plot and val=clean sad for that plot + sad_dict = {} + + # Loop through all plot cols, updating criteria, and getting spp_list + for plot in plot_locs.keys(): + + # Find current count col and remove it from criteria + for crit_key in criteria.keys(): + if criteria[crit_key] == 'count': + criteria.pop(crit_key, None) + + # Add this plot as col with counts + criteria[plot] = 'count' + + # Get SAD for existing criteria with this plot as count col + sad_return = self.sad(criteria, clean=True) + + # Check that sad_return only has one element, or throw error + if len(sad_return) > 1: + raise NotImplementedError('Too many criteria for comm_sep') + + # Get unique species list for this plot and store in sad_dict + sad_dict[plot] = sad_return[0][2] + + # Set up recarray to hold Sorensen index for all pairs of plots + n_pairs = np.sum(np.arange(len(plot_locs.keys()))) + result = np.recarray((n_pairs,), dtype=[('plot-a','S32'), + ('plot-b', 'S32'), + ('spp-a', int), + ('spp-b', int), + ('dist', float), + ('sorensen', float), + ('jaccard', float)]) + + # Loop through all combinations of plots and fill in result table + row = 0 + for pair in itertools.combinations(plot_locs.keys(), 2): + + # Names of plots + plota = pair[0] + plotb = pair[1] + + result[row]['plot-a'] = plota + result[row]['plot-b'] = plotb + + # Calculate inter-plot distance + if loc_unit == 'decdeg': + result[row]['dist'] = _decdeg_distance(plot_locs[plota], + plot_locs[plotb]) + else: + result[row]['dist'] = _distance(plot_locs[plota], + plot_locs[plotb]) + + # Get similarity indices + spp_a = len(sad_dict[plota]) + spp_b = len(sad_dict[plotb]) + + result[row]['spp-a'] = spp_a + result[row]['spp-b'] = spp_b + + intersect = set(sad_dict[plota]).intersection(sad_dict[plotb]) + union = set(sad_dict[plota]).union(sad_dict[plotb]) + + # Fill in zero if denom is zero + if spp_a + spp_b == 0: + result[row]['sorensen'] = 0 + else: + result[row]['sorensen'] = (2*len(intersect)) / (spp_a+spp_b) + + if len(union) == 0: + result[row]['jaccard'] = 0 + else: + result[row]['jaccard'] = len(intersect) / len(union) + + # Increment row counter + row += 1 + + return result + + +def _get_cols(special_col_names, cols, patch): + """ + Retrieve values of special_cols from cols string or patch metadata + """ + + # If cols not given, try to fall back on cols from metadata + if not cols: + if 'cols' in patch.meta['Description'].keys(): + cols = patch.meta['Description']['cols'] + else: + raise NameError, ("cols argument not given, spp_col at a minimum " + "must be specified") + + # Parse cols string into dict + cols = cols.replace(' ', '') + col_list = cols.split(';') + col_dict = {x.split(':')[0]: x.split(':')[1] for x in col_list} + + # 
Get special_col_names from dict + result = [] + for special_col_name in special_col_names: + col_name = col_dict.get(special_col_name, None) + + # Create a count col if its requested and doesn't exist + if special_col_name is 'count_col' and col_name is None: + col_name = 'count' + patch.table['count'] = np.ones(len(patch.table)) + + # All special cols must be specified (count must exist by now) + if col_name is None: + raise ValueError, ("Required column %s not specified" % + special_col_name) + + result.append(col_name) + + return tuple(result), patch + + +@doc_sub(splits_note) +def _yield_subpatches(patch, splits, name='split'): + """ + Iterator for subtables defined by a splits string + + Parameters + ---------- + patch : obj + Patch object containing data to subset + splits : str + Specifies how a column of a dataset should be split. See Notes. + + Yields + ------ + tuple + First element is subset string, second is subtable dataframe + + Notes + ----- + {0} + + """ + + if splits: + subset_list = _parse_splits(patch, splits) + for subset in subset_list: + log.info('Analyzing subset %s: %s' % (name, subset)) + subpatch = copy.copy(patch) + subpatch.table = _subset_table(patch.table, subset) + subpatch.meta, subpatch.incremented = _subset_meta(patch.meta, + subset, incremented=True) + + yield subset, subpatch + else: + yield '', patch + + +@doc_sub(splits_note) +def _parse_splits(patch, splits): + """ + Parse splits string to get list of all associated subset strings. + + Parameters + ---------- + patch : obj + Patch object containing data to subset + splits : str + Specifies how a column of a dataset should be split. See Notes. + + Returns + ------- + list + List of subset strings derived from splits string + + Notes + ----- + {0} + + """ + + split_list = splits.replace(' ','').split(';') + subset_list = [] # List of all subset strings + + for split in split_list: + col, val = split.split(':') + + if val == 'split': + uniques = [] + for level in patch.table[col]: + if level not in uniques: + uniques.append(level) + level_list = [col + '==' + str(x) + '; ' for x in uniques] + else: + starts, ends = _col_starts_ends(patch, col, val) + level_list = [col + '>=' + str(x) + '; ' + col + '<' + str(y)+'; ' + for x, y in zip(starts, ends)] + + subset_list.append(level_list) + + # Get product of all string levels as list, conv to string, drop final ; + return [''.join(x)[:-2] for x in _product(*subset_list)] + + +def _patch_area(patch, x_col, y_col): + + lengths = [] + for col in [x_col, y_col]: + col_step = eval(patch.meta[col]['step']) + col_min = eval(patch.meta[col]['min']) + col_max = eval(patch.meta[col]['max']) + + if patch.incremented: + lengths.append(col_max - col_min) + else: + lengths.append(col_max - col_min + col_step) + + return lengths[0] * lengths[1] + +def _col_starts_ends(patch, col, slices): + + col_step = eval(patch.meta[col]['step']) + col_min = eval(patch.meta[col]['min']) + col_max = eval(patch.meta[col]['max']) + + edges = np.linspace(col_min-col_step/2, col_max+col_step/2, eval(slices)+1) + + starts = edges[:-1] + ends = edges[1:] + + return starts, ends + + +def _product(*args, **kwds): + """ + Generates cartesian product of lists given as arguments + + From itertools.product documentation + """ + + pools = map(tuple, args) * kwds.get('repeat', 1) + result = [[]] + for pool in pools: + result = [x+[y] for x in result for y in pool] + return result + + +def _distance(pt1, pt2): + """Euclidean distance between two points""" + return np.sqrt((pt1[0] - pt2[0]) ** 2 + 
(pt1[1] - pt2[1]) ** 2) + + +def _decdeg_distance(pt1, pt2): + """ + Earth surface distance (in km) between decimal latlong points using + Haversine approximation. + + http://stackoverflow.com/questions/15736995/ + how-can-i-quickly-estimate-the-distance-between-two-latitude-longitude- + points + """ + + lat1, lon1 = pt1 + lat2, lon2 = pt2 + + # Convert decimal degrees to radians + lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2]) + + # haversine formula + dlon = lon2 - lon1 + dlat = lat2 - lat1 + a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2 + c = 2 * np.arcsin(np.sqrt(a)) + km = 6367 * c + + return km + + +def empirical_cdf(data): + """ + Generates an empirical cdf from data + + Parameters + ---------- + data : iterable + Empirical data + + Returns + -------- + DataFrame + Columns 'data' and 'ecdf'. 'data' contains ordered data and 'ecdf' + contains the corresponding ecdf values for the data. + + """ + + vals = pd.Series(data).value_counts() + ecdf = pd.DataFrame(data).set_index(keys=0) + probs = pd.DataFrame(vals.sort_index().cumsum() / np.float(len(data))) + ecdf = ecdf.join(probs, how="right") + ecdf = ecdf.reset_index() + ecdf.columns = ['data', 'ecdf'] + + return ecdf diff --git a/macroeco/empirical/test_empirical.py b/macroeco/empirical/test_empirical.py new file mode 100644 index 0000000..646d1c6 --- /dev/null +++ b/macroeco/empirical/test_empirical.py @@ -0,0 +1,307 @@ +from __future__ import division +import os +from configparser import ConfigParser + +from numpy.testing import (TestCase, assert_equal, assert_array_equal, + assert_almost_equal, assert_array_almost_equal, + assert_allclose, assert_, assert_raises) +from pandas.util.testing import (assert_frame_equal) + +import macroeco.empirical as emp +import macroeco.empirical._empirical as _emp +import numpy as np +import pandas as pd +import scipy.stats as stats + + +class Patches(TestCase): + + def setUp(self): + local_path = os.path.dirname(os.path.abspath(__file__)) + + self.meta1_path = os.path.join(local_path, 'test_meta1.txt') + self.table1_path = os.path.join(local_path, 'test_table1.csv') + self.table1 = pd.DataFrame.from_csv(self.table1_path, index_col=False) + self.meta1 = ConfigParser() + self.meta1.read(self.meta1_path) + self.pat1 = emp.Patch(self.meta1_path) # No subset + self.cols1 = 'spp_col:spp; count_col:count; x_col:x; y_col:y' + self.A1 = 0.2 * 0.3 + + +class TestPatch(Patches): + + def test_load_data_meta(self): + assert_array_equal(self.pat1.table, self.table1) + assert_equal(self.pat1.meta, self.meta1) + + def test_subset_numeric(self): + pat1 = emp.Patch(self.meta1_path, 'x>=0.2') + assert_array_equal(pat1.table, self.table1[self.table1.x >= 0.2]) + + self.meta1['x']['min'] = '0.2' + assert_equal(pat1.meta, self.meta1) + + def test_subset_categorical(self): + pat1 = emp.Patch(self.meta1_path, "spp=='b'") + assert_array_equal(pat1.table, self.table1[self.table1['spp']=='b']) + assert_equal(pat1.meta, self.meta1) # Meta should not change + + def test_multiple_subset(self): + # Only first element in table remains + pat1 = emp.Patch(self.meta1_path, "spp=='a' ; y < 0.2") + assert_array_equal(pat1.table.iloc[0], self.table1.iloc[0]) + assert_equal(len(pat1.table), 1) + + self.meta1['y']['max'] = '0.1' + assert_equal(pat1.meta, self.meta1) + + +class TestSAD(Patches): + + def test_simple(self): + # Falling back on spp_col in metadata, so count 1 for each row + sad = emp.sad(self.pat1, None, None) + assert_array_equal(sad[0][1]['y'], [3,2]) + + def 
test_simple_with_cols(self): + # Specify count and spp_col here + sad = emp.sad(self.pat1, self.cols1, None) + assert_array_equal(sad[0][1]['y'], [4,4]) + + def test_two_way_split(self): + # Complete split generates 6 results + sad = emp.sad(self.pat1, self.cols1, 'x:2; y:3') + assert_equal(len(sad), 6) + + # Goes through x then y + assert_equal(sad[0][1]['spp'].values, 'a') + assert_equal(sad[0][1]['y'].values, 2) + assert_equal(sad[1][1]['y'].values, [1,1]) + assert_equal(sad[5][1]['spp'].values, 'b') + assert_equal(sad[0][1]['y'].values, 2) + + def test_one_way_uneven_split(self): + # 0.2 should fall in second division of y + sad = emp.sad(self.pat1, self.cols1, 'y:2') + assert_equal(len(sad), 2) + assert_equal(sad[0][1]['spp'].values, ['a']) + assert_equal(sad[0][1]['y'].values, [2]) + assert_equal(sad[1][1]['spp'].values, ['a','b']) + assert_equal(sad[1][1]['y'].values, [2,4]) + + def test_split_categorical(self): + sad = emp.sad(self.pat1, self.cols1, 'year:split; x:2') + assert_equal(sad[0][1]['y'].values, 3) + assert_equal(sad[1][1]['y'].values, []) + assert_equal(sad[2][1]['y'].values, [1,1]) + assert_equal(sad[3][1]['y'].values, [3]) + + def test_clean(self): + # No a in second split on x + sad = emp.sad(self.pat1, self.cols1, 'x:2', clean=False) + assert_equal(len(sad[1][1]), 2) # Both spp when clean False + + sad = emp.sad(self.pat1, self.cols1, 'x:2', clean=True) + assert_equal(len(sad[1][1]), 1) # Only 'b' when clean True + + +class TestSSAD(Patches): + + def test_no_splits(self): + # Just total abundance by species + ssad = emp.ssad(self.pat1, self.cols1, None) + assert_array_equal(ssad[0][1]['y'], [4]) + assert_array_equal(ssad[1][1]['y'], [4]) + + def test_with_split(self): + ssad = emp.ssad(self.pat1, self.cols1, 'x:2') + assert_array_equal(ssad[0][1]['y'], [4,0]) # spp a + assert_array_equal(ssad[1][1]['y'], [1,3]) # spp b + + +class TestSAR(Patches): + + def test_no_splits(self): + sar = emp.sar(self.pat1, self.cols1, None, '1,1; 2,1; 2,3') + assert_array_almost_equal(sar[0][1]['x'], + [1*self.A1, 0.5*self.A1, 1/6*self.A1]) + assert_array_equal(sar[0][1]['y'], [2, 1.5, (1+2+1+0+0+1)/6.]) + + def test_with_split(self): + sar = emp.sar(self.pat1, self.cols1, 'year:split', '2,1; 1,3') + assert_array_almost_equal(sar[0][1]['x'], [0.5*self.A1, 1/3.*self.A1]) + assert_array_almost_equal(sar[1][1]['x'], [0.5*self.A1, 1/3.*self.A1]) + assert_array_equal(sar[0][1]['y'], [0.5, 2/3.]) + assert_array_equal(sar[1][1]['y'], [3/2., 1]) + + def test_single_division(self): + sar = emp.sar(self.pat1, self.cols1, None, '2,1') + assert_array_almost_equal(sar[0][1]['x'], [0.5*self.A1]) + assert_array_equal(sar[0][1]['y'], [1.5]) + + def test_empty_equals_split_subset(self): + sar_empty = emp.sar(self.pat1, self.cols1, "", '1,1') + sar_split = emp.sar(self.pat1, self.cols1, "x:1; y:1", '1,1') + print sar_empty + print sar_split + assert_frame_equal(sar_empty[0][1].sort(axis=1), + sar_split[0][1].sort(axis=1)) + + + +class TestEAR(Patches): + + def test_no_splits(self): + sar = emp.sar(self.pat1, self.cols1, None, '1,1; 2,1; 2,3', ear=True) + assert_array_equal(sar[0][1]['y'], [2, 0.5, 0]) + + def test_with_split(self): + sar = emp.sar(self.pat1, self.cols1, 'year:split', '2,1;1,3', ear=True) + assert_array_equal(sar[0][1]['y'], [0.5, 0]) + assert_array_equal(sar[1][1]['y'], [0.5, 1/3.]) + + +class TestCommGrid(Patches): + + def test_no_splits_Sorensen(self): + comm = emp.comm_grid(self.pat1, self.cols1, None, '2,1') + assert_almost_equal(comm[0][1]['x'], [0.1]) + 
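+        # One of the (2 + 1) species is shared between the two cells,
+        # so Sorensen = 2*1 / (2 + 1).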
assert_array_equal(comm[0][1]['y'], [2./(2+1)]) + + def test_no_splits_Jaccard(self): + comm = emp.comm_grid(self.pat1, self.cols1, None, '2,1', + metric='Jaccard') + assert_almost_equal(comm[0][1]['x'], [0.1]) + assert_array_equal(comm[0][1]['y'], [1/2.]) + + def test_with_split(self): + comm = emp.comm_grid(self.pat1, self.cols1, 'year:split', '2,1') + assert_array_equal(comm[0][1]['y'], [0]) + assert_array_equal(comm[1][1]['y'], [2/3.]) + + def test_y_division_even(self): + comm = emp.comm_grid(self.pat1, self.cols1, '', '1,3') + assert_array_equal(comm[0][1]['pair'], ['(0.15 0.1) - (0.15 0.2)', + '(0.15 0.1) - (0.15 0.3)', + '(0.15 0.2) - (0.15 0.3)']) + assert_array_almost_equal(comm[0][1]['x'], [0.1, 0.2, 0.1]) + assert_array_equal(comm[0][1]['y'], [2/3., 2/3., 1.]) + + def test_x_y_division_uneven_y(self): + comm = emp.comm_grid(self.pat1, self.cols1, '', '2,2') + print comm + assert_array_equal(comm[0][1]['pair'], ['(0.1 0.125) - (0.1 0.275)', + '(0.1 0.125) - (0.2 0.125)', + '(0.1 0.125) - (0.2 0.275)', + '(0.1 0.275) - (0.2 0.125)', + '(0.1 0.275) - (0.2 0.275)', + '(0.2 0.125) - (0.2 0.275)']) + assert_array_almost_equal(comm[0][1]['x'], [0.15, 0.1, 0.180278, 0.180278, + 0.1, 0.15], 6) + assert_array_equal(comm[0][1]['y'], [2/3., 0, 0, 0, 2/3., 0]) + + def test_x_y_division_uneven_y_jaccard(self): + comm = emp.comm_grid(self.pat1, self.cols1, '', '2,2',metric='Jaccard') + assert_array_equal(comm[0][1]['y'], [1/2., 0, 0, 0, 1/2., 0]) + +class TestORing(Patches): + # TODO: Main may fail with error if dataframe has no records when trying to + # fit or make plot. + + def test_spp_no_present_returns_empty_df(self): + o_ring = emp.o_ring(self.pat1, self.cols1, '', 'nothere', [0,.1,.2]) + assert_frame_equal(o_ring[0][1], pd.DataFrame(columns=['x','y'])) + + def test_one_individual_returns_zeros(self): + self.pat1.table = self.pat1.table[2:4] # Leave 1 'a' and 1 'b' + o_ring = emp.o_ring(self.pat1, self.cols1, '', 'a', [0,.1,.2]) + assert_equal(o_ring[0][1]['y'], [0, 0]) + + def test_no_density_a(self): + # Points on bin edge may be allocated ambiguously due to floating point + # issues - testing here with slightly offset edges + o_ring = emp.o_ring(self.pat1, self.cols1, '', 'a', [0,.101,.201,.301], + density=False) + assert_almost_equal(o_ring[0][1]['x'], [0.0505, 0.151, 0.251]) + assert_almost_equal(o_ring[0][1]['y'], [8, 4, 0]) + + def test_no_density_b(self): + o_ring = emp.o_ring(self.pat1, self.cols1, '', 'b', [0,.1,.2,.3], + density=False) + assert_almost_equal(o_ring[0][1]['x'], [0.05, 0.15,0.25]) + assert_almost_equal(o_ring[0][1]['y'], [6, 6, 0]) + + def test_with_split_a(self): + o_ring = emp.o_ring(self.pat1, self.cols1, 'y:2', 'a', [0,.1,.2], + density=False) + assert_equal(o_ring[0][1]['y'], [2, 0]) # Bottom + assert_equal(o_ring[1][1]['y'], [2, 0]) # Top + + def test_with_split_b(self): + o_ring = emp.o_ring(self.pat1, self.cols1, 'y:2', 'b', [0,.1,.2], + density=False) + assert_equal(o_ring[0][1]['y'], []) # Bottom + assert_equal(o_ring[1][1]['y'], [6, 6]) # Top + + def test_density_a(self): + # First radius is 0.05 + o_ring = emp.o_ring(self.pat1, self.cols1, '', 'a', [0,.10000001]) + assert_array_almost_equal(o_ring[0][1]['y'], + [8 / (1.25*np.pi*(0.1)**2)], + 3) + + def test_density_b(self): + # First radius is 0.05 + o_ring = emp.o_ring(self.pat1, self.cols1, '', 'b', [0,.10000001,.1828427]) + assert_array_almost_equal(o_ring[0][1]['y'], + [6 / (1.25*np.pi*(0.1)**2), + 6 / (3/8 * np.pi*(0.1828427**2 - 0.1**2))], + 3) + + +class TestProduct(): + + def 
test_product_with_order(self): + # Several places rely on product to sequentially loop first -> last + expected = [[1,5], [1,6], [1,7], [2,5], [2,6], [2,7]] + assert_equal(_emp._product([1,2],[5,6,7]), expected) + + +class TestDistance(): + + def test_cartesian_distance(self): + assert_equal(_emp._distance((0,0),(2,2)), np.sqrt(8)) + + +class TestDecDegDistance(): + + def test_ucberkeley_to_sf(self): + # Latlong: http://www.findlatitudeandlongitude.com + # Dist: http://www.movable-type.co.uk/scripts/latlong.html (17.37 km) + berkeley = (37.87133, -122.259293) + sf = (37.780213, -122.419968) + assert_almost_equal(_emp._decdeg_distance(berkeley, sf), 17.37, 1) + + +class TestEmpiricalCDF(): + + def test_sorted_data(self): + test_data = [1, 1, 1, 1, 2, 3, 4, 5, 6, 6] + ans = [.4, .4, .4, .4, .5, .6, .7, .8, 1, 1] + res = emp.empirical_cdf(test_data) + assert_array_equal(ans, res['ecdf']) + + def test_unsorted_data(self): + test_data = [6, 6, 1, 1, 5, 1, 1, 2, 3, 4] + ans = [.4, .4, .4, .4, .5, .6, .7, .8, 1, 1] + res = emp.empirical_cdf(test_data) + assert_array_equal(ans, res['ecdf']) # Result sorted + assert_array_equal(np.sort(test_data), res['data']) # Data sorted + + def test_all_data_same(self): + test_data = [3, 3, 3, 3] + ans = [1, 1, 1, 1] + res = emp.empirical_cdf(test_data) + assert_array_equal(ans, res['ecdf']) + diff --git a/macroeco/empirical/test_meta1.txt b/macroeco/empirical/test_meta1.txt new file mode 100644 index 0000000..a8e157e --- /dev/null +++ b/macroeco/empirical/test_meta1.txt @@ -0,0 +1,14 @@ +[Description] +name = Test Table 1 +datapath = test_table1.csv +cols = spp_col:spp + +[x] +min = 0.1 +max = 0.2 +step = 0.1 + +[y] +min = 0.1 +max = 0.3 +step = 0.1 \ No newline at end of file diff --git a/macroeco/empirical/test_table1.csv b/macroeco/empirical/test_table1.csv new file mode 100644 index 0000000..a019594 --- /dev/null +++ b/macroeco/empirical/test_table1.csv @@ -0,0 +1 @@ +spp,x,y,count,year a,0.1,0.1,2,2000 a,0.1,0.2,1,2000 a,0.1,0.3,1,2010 b,0.1,0.2,1,2010 b,0.2,0.3,3,2010 \ No newline at end of file diff --git a/macroeco/main/__init__.py b/macroeco/main/__init__.py new file mode 100644 index 0000000..d8ba388 --- /dev/null +++ b/macroeco/main/__init__.py @@ -0,0 +1,15 @@ +""" +=========================== +Main (:mod:`macroeco.main`) +=========================== + +This module contains the functions that make up MacroecoDesktop. + +.. autosummary:: + :toctree: generated/ + + main + +""" + +from .main import main diff --git a/macroeco/main/main.py b/macroeco/main/main.py new file mode 100644 index 0000000..7ce2843 --- /dev/null +++ b/macroeco/main/main.py @@ -0,0 +1,610 @@ +from __future__ import division +import sys +import os +import shutil +import warnings +import inspect +import configparser +import threading as thread +from twiggy import log +import copy +log = log.name('meco') + +import numpy as np +import pandas as pd + +import matplotlib as mpl +import matplotlib.pyplot as plt + +#from .. __init__ import __version__ +from .. import empirical as emp +from .. import models as mod +from .. import compare as comp +from .. import misc + + +def main(param_path='parameters.txt'): + """ + Entry point function for analysis based on parameter files. 
+ + Parameters + ---------- + param_path : str + Path to user-generated parameter file + + """ + + # Confirm parameters file is present + if not os.path.isfile(param_path): + raise IOError, "Parameter file not found at %s" % param_path + + # Get raw params and base options (non-run-dependent options) + params, base_options = _get_params_base_options(param_path) + + # Start logging + log = misc.setup_log(base_options['results_dir']) + log.info('Running macroeco') # v%s' % __version__) + log.info('Parameters file at %s' % os.path.abspath(param_path)) + log.info('Starting analysis') + + # Do analysis for each run + for run_name in base_options['run_names']: + log.info('Starting run %s' % run_name) + options = dict(params[run_name]) # All parameters from this run + options.update(base_options) # Add base parameters + options['run_dir'] = os.path.join(base_options['results_dir'],run_name) + if 'format' in options['analysis']: + _do_format(options) + else: + _do_analysis(options) + log.info('Finished run %s' % run_name) + log.info('Finished analysis successfully') + log.info('Results available at %s' % options['param_dir']) + + +def _get_params_base_options(param_path): + + # Read parameter file into params object + params = configparser.ConfigParser() + try: + params.read(param_path) + except: + raise ValueError, "Parameter file is invalid" + + # Setup param_dir and results_dir, get run_names + param_dir = os.path.abspath(os.path.dirname(param_path)) + results_dir = os.path.join(param_dir, 'results') + + if os.path.isdir(results_dir): + shutil.rmtree(results_dir) + os.makedirs(results_dir) + + run_names = params.sections() + + # Check there's at least one run + if not run_names: + raise NameError, "Parameters file must contain at least one run" + + # Create options dict + base_options = {} + base_options['param_dir'] = param_dir + base_options['results_dir'] = results_dir + base_options['run_names'] = run_names + + return params, base_options + + +def _do_format(options): + + datapath = os.path.normpath(os.path.join(options['param_dir'], + options['data'])) + out_path = os.path.splitext(datapath)[0] + "_formatted.csv" + + format_type = options['analysis'].split('_')[1] + misc.data_read_write(datapath, out_path, format_type, **options) + + +def _do_analysis(options): + """ + Do analysis for a single run, as specified by options. + + Parameters + ---------- + options : dict + Option names and values for analysis + + """ + + module = _function_location(options) + core_results = _call_analysis_function(options, module) + + if module == 'emp' and ('models' in options.keys()): + fit_results = _fit_models(options, core_results) + else: + fit_results = None + + _save_results(options, module, core_results, fit_results) + + +def _function_location(options): + # TODO: Add spec and misc modules + # This relies on the assumption that there are no duplicate member names + # in the different modules. 
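+    # For example, an 'analysis' value of 'sad' resolves to the emp module,
+    # while a value such as 'some_distribution.fit_mle' is looked up in mod;
+    # only the part before the first '.' is used for the lookup.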
+ func_name = options['analysis'].split('.')[0] # Ignore method if present + emp_members = [x[0] for x in inspect.getmembers(emp)] + mod_members = [x[0] for x in inspect.getmembers(mod)] + if func_name in emp_members: + module = 'emp' + elif func_name in mod_members: + module = 'mod' + else: + raise ValueError, ("No analysis of type '%s' is available" % + options['analysis']) + return module + + +def _call_analysis_function(options, module): + """ + Call function from module and get result, using inputs from options + + Parameters + ---------- + options : dict + Option names and values for analysis + module : str + Short name of module within macroeco containing analysis function + + Returns + ------- + dataframe, array, value, list of tuples + Functions from emp module return a list of tuples in which first + element of the tuple gives a string describing the result and the + second element giving the result of the analysis as a dataframe. + Functions in other modules return dataframe, array, or value. + + """ + + args, kwargs = _get_args_kwargs(options, module) + return eval("%s.%s(*args, **kwargs)" % (module, options['analysis'])) + + +def _get_args_kwargs(options, module): + """ + Given an options (including analysis), and module, extract args and kwargs + """ + + if module == 'emp': + options = _emp_extra_options(options) + arg_names, kw_names = _arg_kwarg_lists(module, options['analysis']) + + # Create list of values for arg_names + args = [] + for arg_name in arg_names: + + if arg_name == 'patch': # For patch arg, append actual patch obj + args.append(options['patch']) + continue + if arg_name == 'self': # Ignore self from class methods + continue + if arg_name == 'k': # scipy dists use k and x, we always use x + arg_name = 'x' + + try: + exec 'args.append(eval("%s"))' % options[arg_name] + except SyntaxError: # eval failing because option is a string + args.append(options[arg_name]) + except: + raise ValueError, ("Value for required argument %s not provided" + % arg_name) + + # Create dict with vals for kw_names + kwargs = {} + for kw_name in kw_names: + if kw_name in options.keys(): # If a value is given for this kwarg + try: + exec 'kwargs[kw_name] = eval("%s")' % options[kw_name] + except SyntaxError: # eval failing because value is a string + kwargs[kw_name] = options[kw_name] + except: + raise ValueError, ("Value for optional argument %s is invalid" + % kw_name) + + return args, kwargs + + +def _emp_extra_options(options): + """ + Get special options patch, cols, and splits if analysis in emp module + """ + + # Check that metadata is valid + metadata_path = os.path.normpath(os.path.join(options['param_dir'], + options['metadata'])) + if not os.path.isfile(metadata_path): + raise IOError, ("Path to metadata file %s is invalid." 
% + metadata_path) + options['metadata_path'] = metadata_path + + # Using subset if given, create and store patch + subset = options.get('subset', '') + options['patch'] = emp.Patch(metadata_path, subset) + + # If cols or splits not given in options, make empty strings + if 'cols' not in options.keys(): + options['cols'] = '' + if 'splits' not in options.keys(): + options['splits'] = '' + + return options + + +def _arg_kwarg_lists(module, analysis): + + # Get names of args and kwargs to method specified by analysis option + exec ("arg_and_kwd_names, _, _, kw_defaults = " + "inspect.getargspec(%s.%s)" % (module, analysis)) + if kw_defaults: # If there are kwargs + arg_names = arg_and_kwd_names[:-len(kw_defaults)] + kw_names = arg_and_kwd_names[-len(kw_defaults):] + else: # If no kwargs + arg_names = arg_and_kwd_names + kw_names = [] + + # Inspection for rv classes doesn't work since it uses args internally + # Unless method is translate_args or fit_mle, appends shapes to args + try: + obj_meth = analysis.split('.') + if obj_meth[1] not in ['fit_mle', 'translate_args']: + arg_names += eval(module + '.' + obj_meth[0] + '.' + + "shapes.replace(' ','').split(',')") + if obj_meth[1] == 'rvs': # Inspection for size not working + kw_names.append('size') + except: + pass + + return arg_names, kw_names + + +def _fit_models(options, core_results): + """ + Fit models to empirical result from a function in emp module + + Parameters + ---------- + options : dict + Option names and values for analysis + core_results : list of tuples + Output of function in emp + + Returns + ------- + list of dicts + Each element in list corresponds to a subset. The dict has a key for + each model given in options, and the value is a list of fitted + parameters (tuple), values (array), comparison statistic names (list), + and comparison statistic values (list). + + Notes + ----- + To determine if the empirical result refers to a curve or a distribution, + the result dataframe is inspected for a column 'x', which indicates a + curve. + + """ + + log.info("Fitting models") + models = options['models'].replace(' ', '').split(';') + + # TODO: Make work for 2D results, i.e., curves, comm_sep, o_ring + # TODO: Make work for curves in general (check if 'x' present in core_res) + fit_results = [] + for core_result in core_results: # Each subset + fit_result = {} + for model in models: + fits = _get_fits(core_result, model, options) + values = _get_values(core_result, model, fits) + stat_names, stats = _get_comparison_stat(core_result, values, + model, fits) + fit_result[model] = [fits, values, stat_names, stats] + fit_results.append(fit_result) + + return fit_results + + +def _get_fits(core_result, model, options): + + options_copy = {} + for key, val in options.iteritems(): + if key not in ['patch']: # Ignore patch since won't deepcopy + options_copy[key] = copy.deepcopy(val) + + model_obj = eval('mod.' + model) + if hasattr(model_obj, 'fit_mle'): + options_copy['analysis'] = model + '.' + 'fit_mle' + options_copy['data'] = core_result[1]['y'].values + else: + options_copy['analysis'] = model + '.' + 'fit_lsq' + options_copy['x'] = core_result[1]['x'].values + options_copy['y_obs'] = core_result[1]['y'].values + options_copy['df'] = core_result[1] # Entire result df, for mete_sar + + return _call_analysis_function(options_copy, 'mod') + + +def _get_values(core_result, model, fits): + + model_obj = eval('mod.' 
+ model) + if hasattr(model_obj, 'vals'): + x = core_result[1]['x'].values # Calc model at x values + values = eval("mod.%s.vals(x, *fits)" % model) + else: + n = len(core_result[1]) # Calc model at data values + values = eval("mod.%s.rank(n, *fits)" % model) + + return values + + +def _get_comparison_stat(core_result, values, model, fits): + # Uses AIC for distributions, R2 one-to-one for curves + + try: # Only curves have vals + eval("mod.%s" % model + ".vals.__doc__") + obs = core_result[1]['y'].values + pred = values + name = ['R2'] + stat = comp.r_squared(obs, pred, one_to_one=True) + except AttributeError: + obs = core_result[1]['y'].values + name = ['AIC'] + stat = comp.AIC(obs, eval("mod.%s" % model + "(*fits)")) + + return name, stat + + +def _save_results(options, module, core_results, fit_results): + """ + Save results of analysis as tables and figures + + Parameters + ---------- + options : dict + Option names and values for analysis + module : str + Module that contained function used to generate core_results + core_results : dataframe, array, value, list of tuples + Results of main analysis + fit_results : list or None + Results of comparing emp analysis to models, None if not applicable + + """ + + log.info("Saving all results") + + # Use custom plot format + mpl.rcParams.update(misc.rcparams.ggplot_rc) + + # Make run directory + os.makedirs(options['run_dir']) + + # Write core results + _write_core_tables(options, module, core_results) + + # Write additional results if analysis from emp + if module == 'emp': + _write_subset_index_file(options, core_results) + + # Write model/data comparison if models were given + if fit_results: + models = options['models'].replace(' ','').split(';') + for i, core_result in enumerate(core_results): + _write_fitted_params(i, models, options, fit_results) + _write_test_statistics(i, models, options, fit_results) + _write_comparison_plot_table(i, models, options, + core_results, fit_results) + +def _write_core_tables(options, module, core_results): + """ + Notes + ----- + Depending on function that was called for analysis, core_results may be a + list of tuples (empirical), a dataframe, an array, or a single value. + + For the list of tuples from empirical, the second element of each tuple is + the raw result, and we write them all with the appropriate prefix. For + dataframes, we write them. For arrays or single values, we convert to data + frames and write them. 
+ + """ + + table_name = 'core_result.csv' + single_file_path = os.path.join(options['run_dir'], table_name) + + if module == 'emp': # List of tuples + for i, core_result in enumerate(core_results): + file_path = _get_file_path(i, options, table_name) + core_result[1].to_csv(file_path, index=False, float_format='%.4f') + + elif type(core_results) == type(pd.DataFrame()): # DataFrame + core_results.to_csv(single_file_path, index=False, float_format='%.4f') + + else: # Array or single value (atleast_1d corrects for unsized array) + df = pd.DataFrame({'y': np.atleast_1d(core_results)}) + df.to_csv(single_file_path, index=False, float_format='%.4f') + + +def _get_file_path(spid, options, file_name): + return os.path.join(options['run_dir'], + '%i_%s' % (spid+1, file_name)) + + +def _write_subset_index_file(options, core_results): + """ + Write table giving index of subsets, giving number and subset string + """ + + f_path = os.path.join(options['run_dir'], '_subset_index.csv') + subset_strs = zip(*core_results)[0] + index = np.arange(len(subset_strs)) + 1 + df = pd.DataFrame({'subsets': subset_strs}, index=index) + df.to_csv(f_path) + + +def _write_fitted_params(spid, models, options, fit_results): + # TODO: Consider converting to pandas, need to deal with variable length + # TODO: Possibility - empty data frame max length, max width = nparams + f = open(_get_file_path(spid, options, 'fitted_params.csv'), 'w') + f.write("Model, Fit Parameters\n") + + for model in models: + fit_result = fit_results[spid][model] + mod_fits = str(fit_result[0])[1:-1] # Drop parens around tuple + f.write("%s,%s\n" % (model, mod_fits)) + f.close() + + +def _write_test_statistics(spid, models, options, fit_results): + # TODO: Add delta test statistics columns + # TODO: Make dataframe? + f = open(_get_file_path(spid, options, 'test_statistics.csv'), 'w') + + # Gets stat name list from any element of result dict - same for all + stat_names_list = next(fit_results[spid].itervalues())[2] + stat_names_str = str(stat_names_list)[1:-1].strip("'") + + f.write("Model, %s\n" % stat_names_str) + + for model in models: + fit_result = fit_results[spid][model] + fit_stats = str(fit_result[3])[1:-1] + f.write("%s,%s\n" % (model, fit_stats)) + f.close() + + +def _write_comparison_plot_table(spid, models, options, core_results, + fit_results): + """ + Notes + ----- + Only applies to analysis using functions from empirical in which models are + also given. + + """ + # TODO: Clean up sorting, may not work if SAR x out of order, e.g. 
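+    # Curve-type results (e.g., SAR) already contain an 'x' column; for
+    # distribution-type results, the empirical values are sorted and a rank
+    # column is inserted below so they line up with rank-ordered model values.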
+ + is_curve = 'x' in core_results[0][1] + df = core_results[spid][1] + df.rename(columns={'y': 'empirical'}, inplace=True) + + # If distribution, need to sort values so will match sorted rank in fits + if not is_curve: + x = np.arange(len(df)) + 1 + df = df.sort(columns='empirical') + df.insert(0, 'x', x[::-1]) + + # Add residual column for each model + for model in models: + fit_result = fit_results[spid][model] + df[model] = fit_result[1] + df[model + "_residual"] = df[model] - df['empirical'] + + # If curve, sort now for plotting purposes + if is_curve: + df = df.sort(columns='x') + + # Set up file paths + f_path = _get_file_path(spid, options, 'data_models.csv') + p_path = _get_file_path(spid, options, 'data_models.pdf') + + # Save table + df.to_csv(f_path, index=False, float_format='%.4f') # Table + + # Save plot + fig, (ax1, ax2) = plt.subplots(1, 2) + + ax1.scatter(df['x'], df['empirical'], color='k') + ax1.plot(df['x'], df[models]) + ax1.legend(models + ['empirical'], loc='best') + ax1.set_xlabel('x') + ax1.set_ylabel('value') + + ax2.hlines(0, np.min(df['x']), np.max(df['x'])) + ax2.plot(df['x'], df[[x + '_residual' for x in models]]) + ax2.legend(models + ['empirical'], loc='best') + ax2.set_xlabel('x') + ax2.set_ylabel('residual') + ax2.set_xlim(ax1.get_xlim()) + ax2.set_ylim(min(ax2.get_ylim()[0], -1), max(ax2.get_ylim()[1], 1)) + + if options.get('log_y', None): + ax1.set_yscale('log') + ax2.set_yscale('symlog', linthreshy=1) + if options.get('log_x', None): + ax1.set_xscale('log') + ax2.set_xscale('log') + + if not options.get('log_x', None) and not options.get('log_y', None): + ax1.set_ylim(bottom=0) + ax1.set_xlim(left=0) + ax1 = _pad_plot_frame(ax1) + ax2 = _pad_plot_frame(ax2) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + fig.tight_layout() + fig.savefig(p_path) + + plt.close('all') + + +def _pad_plot_frame(ax, pad=0.01): + """ + Provides padding on sides of frame equal to pad fraction of plot + """ + + xmin, xmax = ax.get_xlim() + ymin, ymax = ax.get_ylim() + xr = xmax - xmin + yr = ymax - ymin + + ax.set_xlim(xmin - xr*pad, xmax + xr*pad) + ax.set_ylim(ymin - yr*pad, ymax + yr*pad) + + return ax + + +def _output_cdf_plot(core_result, spid, models, options, fit_results): + """Function for plotting cdf""" + + # CDF + x = core_result['y'].values + df = emp.empirical_cdf(x) + df.columns = ['x', 'empirical'] + + def calc_func(model, df, shapes): + return eval("mod.%s.cdf(df['x'], *shapes)" % model) + + plot_exec_str = "ax.step(df['x'], emp, color='k', lw=3);ax.set_ylim(top=1)" + + _save_table_and_plot(spid, models, options, fit_results, 'data_pred_cdf', + df, calc_func, plot_exec_str) + + +def output_pdf_plot(core_result, spid, models, options, fit_results): + """ Function for plotting pdf/pmf """ + # PDF/PMF + hist_bins = 11 + emp_hist, edges = np.histogram(core_result['y'].values, hist_bins, + normed=True) + x = (np.array(edges[:-1]) + np.array(edges[1:])) / 2 + df = pd.DataFrame({'x': x, 'empirical': emp_hist}) + + def calc_func(model, df, shapes): + try: + return eval("mod.%s.pmf(np.floor(df['x']), *shapes)" % model) + except: + return eval("mod.%s.pdf(df['x'], *shapes)" % model) + + plot_exec_str = "ax.bar(df['x']-width/2, emp, width=width, color='gray')" + + _save_table_and_plot(spid, models, options, fit_results, 'data_pred_pdf', + df, calc_func, plot_exec_str) diff --git a/macroeco/misc/__init__.py b/macroeco/misc/__init__.py new file mode 100644 index 0000000..9bae656 --- /dev/null +++ b/macroeco/misc/__init__.py @@ -0,0 +1,39 @@ +""" 
+=============================== +Misc (:mod:`macroeco.misc`) +=============================== + +This module contains miscellaneous functions that support the functions of +other modules of macroeco. + +Support Functions +================= + +.. autosummary:: + :toctree: generated/ + + setup_log + log_start_end + inherit_docstring_from + doc_sub + +""" +""" + +Data Formatting Functions +========================= + +.. autosummary:: + :toctree: generated/ + + data_read_write + format_dense + +""" + +from .misc import (setup_log, _thread_excepthook, log_start_end, + inherit_docstring_from, doc_sub) +from .rcparams import ggplot_rc +from .format_data import (data_read_write, format_dense) + +_thread_excepthook() # Make desktop app catch and log sys except from thread diff --git a/macroeco/misc/format_data.py b/macroeco/misc/format_data.py new file mode 100644 index 0000000..e8f73ff --- /dev/null +++ b/macroeco/misc/format_data.py @@ -0,0 +1,194 @@ +import numpy as np +import pandas as pd + +def data_read_write(data_path_in, data_path_out, format_type, **kwargs): + """ + General function to read, format, and write data. + + Parameters + ---------- + data_path_in : str + Path to the file that will be read + data_path_out : str + Path of the file that will be output + format_type : str + Either 'dense', 'grid', 'columnar', or 'transect' + kwargs + Specific keyword args for given data types. See Notes + + Notes + ----- + + 'Dense Parameters' + + non_label_cols : str + Comma separated list of non label columns. ex. "lat, long, tree" + sep : str + The delimiter for the dense data. Default, "," + na_values : int, float, str + Value to be labeled as NA. Default, "" + + See misc.format_dense() for additional keyword parameters + """ + + if format_type == "dense": + + # Set dense defaults + kwargs = _set_dense_defaults_and_eval(kwargs) + + # Try to parse non label columns appropriately + try: + nlc = [nm.strip() for nm in kwargs['non_label_cols'].split(",")] + kwargs.pop('non_label_cols', None) + except KeyError: + raise KeyError("'non_label_cols' is a required keyword dense data") + + # Read data with dense specific keywords + arch_data = pd.read_csv(data_path_in, sep=kwargs['delimiter'], + na_values=kwargs['na_values']) + + form_data = format_dense(arch_data, nlc, **kwargs) + + elif format_type == "grid": + pass + elif format_type == "stacked": + pass + elif format_type == "transect": + pass + else: + raise NameError("%s is not a supported data format" % format_type) + + form_data.to_csv(data_path_out, index=False) + + +def format_dense(base_data, non_label_cols, **kwargs): + """ + Formats dense data type to stacked data type. + + Takes in a dense data type and converts into a stacked data type. + + Parameters + ---------- + data : DataFrame + The dense data + non_label_cols : list + A list of columns in the data that are not label columns + label_col : str + Name of the label column in the formatted data. Default, "label" + count_col : str + Name of the count column in the formatted data. Default, "count" + nan_to_zero : bool + Set all nans to zero. Default, False + drop_na : bool + Drop all columns with nan in the dataset. 
Default, False + + Returns + ------- + : DataFrame + A formatted DataFrame in the stacked format + + + Notes + ----- + Example of Dense Data conversion + + >>> import pandas as pd + >>> dense_data = pd.DataFrame({'row' : [1,2,1,2], 'column' : [1,1,2,2], + 'labelA': [1,0,3,4], 'labelB' : [3,2,1,4]}) + + >>> dense_data + column labelA labelB row + 0 1 1 3 1 + 1 1 0 2 2 + 2 2 3 1 1 + 3 2 4 4 2 + + [4 rows x 4 columns] + >>> stacked_data = format_dense(dense_data, ['row', 'column']) + >>> stacked_data + row column label count + 0 1 1 labelA 1 + 1 1 1 labelB 3 + 2 2 1 labelA 0 + 3 2 1 labelB 2 + 4 1 2 labelA 3 + 5 1 2 labelB 1 + 6 2 2 labelA 4 + 7 2 2 labelB 4 + + [8 rows x 4 columns] + """ + + kwargs = _set_dense_defaults_and_eval(kwargs) + + # Stack data in columnar form. + indexed_data = base_data.set_index(keys=non_label_cols) + columnar_data = indexed_data.stack(dropna=False) + columnar_data = columnar_data.reset_index() + + # Rename columns + num = len(non_label_cols) + columnar_data.rename(columns={0: kwargs['count_col'], 'level_%i' % num: + kwargs['label_col']}, inplace=True) + + # Set nans to zero? + if kwargs['nan_to_zero']: + ind = np.isnan(columnar_data[kwargs['count_col']]) + columnar_data[kwargs['count_col']][ind] = 0 + columnar_data.reset_index(inplace=True, drop=True) + + # Drop nans? + if kwargs['drop_na']: + columnar_data = columnar_data.dropna(how="any") + columnar_data.reset_index(inplace=True, drop=True) + + return columnar_data + + +def _set_dense_defaults_and_eval(kwargs): + """ + Sets default values in kwargs if kwargs are not already given. + + Evaluates all values using eval + + Parameters + ----------- + kwargs : dict + Dictionary of dense specific keyword args + + Returns + ------- + : dict + Default, evaluated dictionary + + """ + + kwargs['delimiter'] = kwargs.get('delimiter', ',') + kwargs['na_values'] = kwargs.get('na_values', '') + kwargs['nan_to_zero'] = kwargs.get('nan_to_zero', False) + kwargs['drop_na'] = kwargs.get('drop_na', False) + kwargs['label_col'] = kwargs.get('label_col', 'label') + kwargs['count_col'] = kwargs.get('count_col', 'count') + + for key, val in kwargs.iteritems(): + try: + kwargs[key] = eval(val) + except: + kwargs[key] = val + + return kwargs + +def format_stacked(): + """ + """ + pass + +def format_transect(): + """ + """ + pass + +def format_grid(): + """ + """ + pass diff --git a/macroeco/misc/misc.py b/macroeco/misc/misc.py new file mode 100644 index 0000000..bb77267 --- /dev/null +++ b/macroeco/misc/misc.py @@ -0,0 +1,152 @@ +""" +Set up logging +""" + +import sys +import os +import traceback +import threading as thread + +import twiggy +from twiggy import log +log = log.name('meco') +import decorator +import time + +def setup_log(log_dir, file_name='_log.txt', clear=False): + """ + Set up and return logger object + """ + + # Get path to log file and clear if requested + log_path = os.path.join(log_dir, file_name) + if clear and os.path.isfile(log_path): + os.remove(log_path) + + # Get outputs and add emitters + file_output, std_output = _logger_outputs(log_path) + twiggy.addEmitters(('file', twiggy.levels.DEBUG, None, file_output), + ('stdout', twiggy.levels.INFO, None, std_output)) + + # Get logger + log = twiggy.log.name('meco') + + # Log uncaught exceptions (must occur after log declared) + def log_uncaught(type1, value1, traceback1): + tb_list = traceback.format_exception(type1, value1, traceback1) + tb_str = ''.join(tb_list) + log.options(suppress_newlines=False).critical('\n'+tb_str) + sys.excepthook = log_uncaught + + 
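A brief usage sketch for the dense formatters documented above; the file names and the dense_df DataFrame are hypothetical, and non_label_cols is the required keyword for dense data:

>>> from macroeco.misc import data_read_write, format_dense
>>> data_read_write('census_dense.csv', 'census_stacked.csv', 'dense',
...                 non_label_cols='row, column')
>>> stacked = format_dense(dense_df, ['row', 'column'], nan_to_zero=True)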
return log + + +def _logger_outputs(log_path): + + # std_format - to ensure Macroeco Desktop shows logging, we just print + class stdLineFormat(twiggy.formats.LineFormat): + def __call__(self, msg): + text = self.format_text(msg) + print "{text}".format(**locals()) + return "" + std_format = stdLineFormat(traceback_prefix='') + + # file_format - customized to show local time, etc + conversion = twiggy.lib.converter.ConversionTable() + conversion.add("time", _logger_better_time, "[{1}]".format) + conversion.add("name", str, "{{{1}}}".format) + conversion.add("level", str, "{1}".format) + conversion.aggregate = ' '.join + conversion.genericValue = str + conversion.genericItem = "{0}={1}".format + + file_format = twiggy.formats.LineFormat(traceback_prefix='', separator=' ', + conversion=conversion) + + # Set up outputs for file and stdout and create emitters + file_output = twiggy.outputs.FileOutput(log_path, format=file_format) + std_output = twiggy.outputs.StreamOutput(format=std_format) + + return file_output, std_output + + +def _logger_better_time(gmtime=None): + return time.strftime("%Y/%m/%d %H:%M:%S %p", time.localtime()) + + +def _thread_excepthook(): + """ + Make threads use sys.excepthook from parent process + http://bugs.python.org/issue1230540 + """ + init_old = thread.Thread.__init__ + def init(self, *args, **kwargs): + init_old(self, *args, **kwargs) + run_old = self.run + def run_with_except_hook(*args, **kw): + try: + run_old(*args, **kw) + except (KeyboardInterrupt, SystemExit): + + raise + except: + sys.excepthook(*sys.exc_info()) + self.run = run_with_except_hook + thread.Thread.__init__ = init + + +def inherit_docstring_from(cls): + """ + This decorator modifies the decorated function's docstring by + replacing occurrences of '%(super)s' with the docstring of the + method of the same name from the class `cls`. + + If the decorated method has no docstring, it is simply given the + docstring of cls method. + + Extracted from scipy.misc.doccer. + + """ + def _doc(func): + cls_docstring = getattr(cls, func.__name__).__doc__ + func_docstring = func.__doc__ + if func_docstring is None: + func.__doc__ = cls_docstring + else: + new_docstring = func_docstring % dict(super=cls_docstring) + func.__doc__ = new_docstring + return func + return _doc + + +def doc_sub(*sub): + """ + Decorator for performing substitutions in docstrings. + + Using @doc_sub(some_note, other_note) on a function with {0} and {1} in the + docstring will substitute the contents of some_note and other_note for {0} + and {1}, respectively. + + Decorator appears to work properly both with IPython help (tab completion + and ?) and with Sphinx. + + """ + def dec(obj): + obj.__doc__ = obj.__doc__.format(*sub) + return obj + return dec + +def log_start_end(f): + """ + Decorator to log start and end of function + + Use of decorator module here ensures that argspec will inspect wrapped + function, not the decorator itself. 
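A small sketch of the two docstring decorators defined above; all names below are illustrative:

>>> note = "Data should be a 1D array of counts."
>>> @doc_sub(note)
... def fit_example(data):
...     """Fit a model to data. {0}"""
...     return data
>>> fit_example.__doc__
'Fit a model to data. Data should be a 1D array of counts.'

>>> class Base(object):
...     def fit(self, data):
...         """Fit the model."""
>>> class Child(Base):
...     @inherit_docstring_from(Base)
...     def fit(self, data):
...         """%(super)s Additional notes for the subclass."""
>>> Child.fit.__doc__
'Fit the model. Additional notes for the subclass.'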
+ http://micheles.googlecode.com/hg/decorator/documentation.html + """ + def inner(f, *args, **kwargs): + log.info('Starting %s' % f.__name__) + res = f(*args, **kwargs) + log.info('Finished %s' % f.__name__) + return res + return decorator.decorator(inner, f) diff --git a/macroeco/misc/rcparams.py b/macroeco/misc/rcparams.py new file mode 100644 index 0000000..e6f1eea --- /dev/null +++ b/macroeco/misc/rcparams.py @@ -0,0 +1,26 @@ +ggplot_rc = \ +{ +"patch.linewidth" : 0.5, +"patch.facecolor" : '#348ABD', +"patch.edgecolor" : '#EEEEEE', +"patch.antialiased" : True, +"font.size" : 10.0, +"axes.facecolor" : '#E5E5E5', +"axes.edgecolor" : 'white', +"axes.linewidth" : 2, +"axes.grid" : True, +"axes.titlesize" : 'x-large', +"axes.labelsize" : 'large', +"axes.labelcolor" : '#555555', +"axes.axisbelow" : True, +"axes.color_cycle" : ['#0072B2','#D55E00','#CC79A7','#009E73', '#E69F00', + '#F0E442', '#56B4E9'], +"xtick.color" : '#555555', +"xtick.direction" : 'out', +"ytick.color" : '#555555', +"ytick.direction" : 'out', +"grid.color" : 'white', +"grid.linestyle" : '-', +"figure.facecolor" : 'white', +"figure.edgecolor" : '0.50', +} diff --git a/macroeco/misc/test_format_data.py b/macroeco/misc/test_format_data.py new file mode 100644 index 0000000..9800055 --- /dev/null +++ b/macroeco/misc/test_format_data.py @@ -0,0 +1,84 @@ +from __future__ import division + +from numpy.testing import (TestCase, assert_equal, assert_array_equal, + assert_almost_equal, assert_array_almost_equal, + assert_allclose, assert_, assert_raises) + +import numpy as np +from macroeco.misc import * +import pandas as pd + +#TODO: Test data_read_write + + +class TestFormatData(TestCase): + + def test_simple_stack(self): + + # Test that stack gives the same answer as predicted by hand + test_data = pd.DataFrame({'row': [1, 2, 1, 2], + 'column': [1, 1, 2, 2], 'labelA': [1, 0, 3, 4], + 'labelB': [3, 2, 1, 4]}) + + expected = pd.DataFrame({'row': [1,1,2,2,1,1,2,2], 'column': + [1,1,1,1,2,2,2,2], 'label': np.tile(['labelA', 'labelB'], 4), + 'count': [1,3,0,2,3,1,4,4]}, columns=['row', 'column', 'label', + 'count']) + + stack = format_dense(test_data, ['row', 'column']) + assert_equal(np.all(stack == expected), True) + + def test_label_count_col(self): + # Test whether changing label count col work + test_data = pd.DataFrame({'year': ['02', '03'], 'spp1': [1, 2], + 'spp2': [3, 4]}) + + expected = pd.DataFrame({'year': np.repeat(['02', '03'], 2), 'spp': + np.tile(['spp1', 'spp2'], 2), 'ind': [1,3,2,4]}, columns=['year', + 'spp', 'ind']) + + stack = format_dense(test_data, ['year'], label_col="spp", + count_col="ind") + + print stack + print expected + + assert_equal(np.all(stack == expected), True) + + def test_drop_nan(self): + # Test whether dropping nan function works + + test_data = pd.DataFrame({'year': ['02', '03'], 'spp1': [1, np.nan], + 'spp2': [np.nan, 4]}) + + expected = pd.DataFrame({'year': ['02', '03'], 'label': + ['spp1', 'spp2'], 'count': [1,4]}, columns=['year', + 'label', 'count']) + + stack = format_dense(test_data, ['year'], drop_na=True) + + assert_equal(np.all(stack == expected), True) + + def test_nan_to_zero(self): + # Test whether setting nan to zero function works + + test_data = pd.DataFrame({'year': ['02', '03'], 'spp1': [1, np.nan], + 'spp2': [np.nan, 4]}) + + expected = pd.DataFrame({'year': np.repeat(['02', '03'], 2), 'label': + np.tile(['spp1', 'spp2'], 2), 'count': [1,0,0,4]}, columns=['year', + 'label', 'count']) + + stack = format_dense(test_data, ['year'], nan_to_zero=True) + + 
assert_equal(np.all(stack == expected), True) + + + + + + + + + + diff --git a/macroeco/models/__init__.py b/macroeco/models/__init__.py new file mode 100644 index 0000000..4eb9c59 --- /dev/null +++ b/macroeco/models/__init__.py @@ -0,0 +1,70 @@ +""" +=============================== +Models (:mod:`macroeco.models`) +=============================== + +This module contains distributions and curves (i.e., standard mathematical +functions) commonly used in analysis of ecological patterns. + +Distributions +============= + +All of the distributions here are subclasses of either +`~scipy.stats.rv_continuous` and `~scipy.stats.rv_discrete` found in +`scipy.stats`. Several of the distributions here are similar to or based on +existing distributions found in `scipy.stats` but are updated to allow the use +of common ecological parameterizations. + +In addition to all of the methods found in `scipy.stats`, methods for fitting +distributions and curves to data and for translating common distribution +arguments into formal parameters (i.e., deriving the ``p`` of the geometric +distribution from the distribution mean) are also provided in these classes. + +The following discrete distributions are available. + +.. autosummary:: + :toctree: generated/ + + geom + geom_uptrunc + nbinom + nbinom_ztrunc + cnbinom + logser + logser_uptrunc + plnorm + plnorm_ztrunc + dgamma + +The following continuous distributions are available. + +.. autosummary:: + :toctree: generated/ + + expon + expon_uptrunc + lognorm + +Curves +====== + +Several common curves used in ecologial analysis are included here. + +.. autosummary:: + :toctree: generated/ + + power_law + mete_sar + mete_sar_iterative + mete_ear + +""" + +from _distributions import (geom, geom_uptrunc, nbinom, nbinom_ztrunc, + cnbinom, logser, logser_uptrunc, plnorm, + plnorm_ztrunc, expon, expon_uptrunc, lognorm, + dgamma) + +from ._curves import (power_law, + mete_sar, mete_sar_iterative, mete_upscale_iterative_alt, + mete_ear) diff --git a/macroeco/models/_curves.py b/macroeco/models/_curves.py new file mode 100644 index 0000000..e109f51 --- /dev/null +++ b/macroeco/models/_curves.py @@ -0,0 +1,387 @@ +from __future__ import division + +import numpy as np +import pandas as pd +from scipy import optimize +from mpmath import lerchphi + +from ..misc import inherit_docstring_from +import _distributions as dist + +_doc_methods = \ +"""Methods + ------- + vals(x, parameters) + Dependent variable y given independent variable x and curve parameters + fit_lsq(x, y_obs, params_start=None) + Least squares fit of parameters given data""" + +_doc_parameters = \ +"""Parameters + ---------- + x : iterable + Independent variable + y_obs : iterable + Dependent variable (values observed at x) + params_start : iterable + Optional start values for all parameters. 
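The ggplot_rc dictionary in rcparams.py above is a plain matplotlib rc configuration. Presumably it is applied before plotting, along these lines (assuming a matplotlib version that accepts these keys):

>>> import matplotlib as mpl
>>> from macroeco.misc import ggplot_rc
>>> mpl.rcParams.update(ggplot_rc)   # ggplot-like styling for subsequent figures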
Default 1.""" + + +class curve(object): + """ + Generic function class meant for subclassing + """ + + def __init__(self, name=None, parameters=None): + """ + Distribution parameters may be given here or to individual methods + + """ + self.name = name + self.parameters = parameters + self.n_parameters = len(parameters.split(',')) + + def __call__(self, *args, **kwargs): + raise ValueError, "Choose either the vals or fit_lsq methods" + + def vals(self, x, *args, **kwargs): + """ + [Docstring] + + """ + x = np.array(x) + return self._vals(x, *args, **kwargs) + + def _vals(self, x, *args): + """ + Return y given x and parameters + """ + raise NotImplementedError, ("vals not implemented for %s" % self.name) + + def fit_lsq(self, x, y_obs, params_start=None): + """ + Fit curve by method of least squares. + + Parameters + ---------- + x : iterable + Independent variable + y_obs : iterable + Dependent variable (values observed at x) + params_start : iterable + Optional start values for all parameters. Default 1. + + Returns + ------- + array + Best fit values of parameters + + Notes + ----- + If least squares fit does not converge, ValueError is raised with + convergence message. + + """ + + # Set up variables + x = np.array(x) + y_obs = np.array(y_obs) + if not params_start: + params_start = np.ones(self.n_parameters) + + # Error checking + if len(x) != len(y_obs): + raise ValueError, "x and y_obs must be the same length" + if len(params_start) != self.n_parameters: + raise ValueError, "Incorrect number of values in params_start" + + # Calculate fit + def residuals(params, x, y_obs): + y_pred = self.vals(x, *params) + return y_obs - y_pred + + params_fit, _, _, msg, ier = optimize.leastsq(residuals, params_start, + args=(x, y_obs), full_output=True) + + # Check for convergence + if ier > 4: + raise ValueError, ("Least squares fit did not converge with " + "message %s" % msg) + + return tuple(params_fit) + + +class power_law_gen(curve): + """ + A power-law function + + .. math:: + + y = c x^z + + or equivalently + + .. math:: + + \log(y) = \log(c) + z \log(x) + + Stemming from the log form, ``c`` is often known as the intercept and ``z`` + as the slope of the power law. + + {0} + + {1} + c, z + Parameters: Log-log slope and intercept + + """ + + def _vals(self, x, c, z): + return c * x**z + +power_law = power_law_gen(name='power_law', parameters='c,z') +power_law.__doc__ = power_law.__doc__.format(_doc_methods, _doc_parameters) + + +class mete_sar_gen(curve): + """ + A SAR/EAR predicted by the Maximum Entropy Theory of Ecology + + The METE SAR and EAR may be used either for downscaling, when values of A + are less than A0, or upscaling, when values of A are greater than A0. + Downscaling creates the traditional SAR known to ecologists, while + upscaling is useful for estimating large-scale species richness from small- + scale plot data. + + A keyword argument iterative is available (default is False). If True, the + SAR is calculated at successive A values, with the result at each value of + A used as the base values of S and N for the subsequent calculation. The + iterative form was used in Harte et al [#]_, although note that the + implementation here uses a different internal equation. + + Methods + ------- + vals(x, S0, N0, iterative=False) + Calculate SAR given starting values and two models. See notes. 
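A short sketch of the generic curve interface using the power law defined above; values are chosen only for illustration:

>>> import numpy as np
>>> x = np.array([1., 2., 4., 8.])
>>> power_law.vals(x, 10, 0.5)      # c * x**z -> [10, ~14.1, 20, ~28.3]
>>> c, z = power_law.fit_lsq(x, [10., 14., 20., 28.])   # least-squares c and z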
+ + Parameters + ---------- + x : iterable + Areas at which to calculate SAR (first element is A0) + S0 : float + Species richness at A0 + N0 : float + Community abundance at A0 + iterative : bool (opt) + If true, SAR calculation for subplots are based on variables for next + larger area instead of initial plot variables. Default False. + array_size : int (opt) + Maximum size of array for SAD pmf's. If N0 is greater than this value, + calculation proceeds using array_size increments until N0 is reached. + approx : bool (opt) + Use non-truncated logseries and geometric distributions. Default False. + + References + ---------- + .. [#] + Harte, J., Smith, A. B., & Storch, D. (2009). Biodiversity scales from + plots to biomes with a universal species-area curve. Ecology Letters, + 12(8), 789-797. + + """ + + def __init__(self, name=None, parameters=None, iterative=False, ear=False): + """ + Provides extra iterative attribute. + """ + if iterative and ear: + raise ValueError, "Iterative EAR calculation is not possible" + + self.name = name + self.parameters = parameters + self.n_parameters = len(parameters.split(',')) + self.iterative = iterative + self.ear = ear + + def _vals(self, x, S0, N0, array_size=1e6, approx=False, alt_up=False): + # x is area, y is S + + A0 = x[0] + y = [S0] + + for A in x[1:]: + a = A/A0 + + if a == 1: + S1, N1 = S0, N0 + elif a < 1: + S1, N1 = self._downscale_step(a, S0, N0, array_size, approx) + else: + S1, N1 = self._upscale_step(a, S0, N0, array_size, approx) + + y.append(S1) + + if self.iterative: + S0, N0, A0 = S1, N1, A + + return np.array(y) + + def _downscale_step(self, a, S0, N0, array_size, approx): + lower = 1 + upper = array_size + 1 + S = 0 + print S0,N0 + + if S0 < 1 or np.isnan(S0): # Give up if S0 too small + return np.nan, N0*a + + while lower < N0: + print lower + + if upper > N0: + upper = N0 + 1 + + n0 = np.arange(lower, upper) + + if approx: + sad_p = dist.logser.translate_args(N0/S0) + sad = dist.logser.pmf(n0, sad_p) + else: + sad_p, _ = dist.logser_uptrunc.translate_args(N0/S0, N0) + sad = dist.logser_uptrunc.pmf(n0, sad_p, N0) + + if np.isclose(a, 0.5): + ssad_p = 1 / (n0 + 1) + else: + if approx: + ssad_p = dist.geom.translate_args(a*n0) + else: + ssad_p, _ = dist.geom_uptrunc.translate_args(a*n0, N0) + + if self.ear: + if approx: + ssad = dist.geom.pmf(n0, ssad_p) + else: + ssad = dist.geom_uptrunc.pmf(n0, ssad_p, N0) + S += S0 * np.sum(ssad * sad) + else: + if approx: + ssad = dist.geom.pmf(0, ssad_p) + else: + ssad = dist.geom_uptrunc.pmf(0, ssad_p, N0) + S += S0 * np.sum((1 - ssad) * sad) + + lower += array_size + upper += array_size + + return S, N0*a + + def _upscale_step(self, a, S0, N0, array_size, approx): + + N1 = N0*a + print a + + def eq(S1, N1, a, S0, array_size, approx): + return S0-self._downscale_step(1/a, S1, N1, array_size, approx)[0] + + return optimize.brentq(eq,S0,S0*a,args=(N1,a,S0,array_size,approx)), N1 + + def fit_lsq(self, df): + """ + Parameterize generic SAR curve from empirical data set + + Parameters + ---------- + df : DataFrame + Result data frame from empirical SAR analysis + + Notes + ----- + Simply returns S0 and N0 from empirical SAR output, which are two fixed + parameters of METE SAR and EAR. The first row of the empirical + dataframe corresponds to area A0. Name ``fit_lsq`` is retained for + consistency with other curves. 
+ + """ + # Just return S0 and N0 at largest scale, which is first row of df + return df['n_spp'].values[0], df['n_individs'].values[0] + + +mete_sar = mete_sar_gen(name='mete_sar', parameters='S0,N0') +mete_sar_iterative = mete_sar_gen(name='mete_iterative_sar', + parameters='S0,N0', iterative=True) + +mete_ear = mete_sar_gen(name='mete_ear', parameters='S0,N0', ear=True) + +def mete_upscale_iterative_alt(S, N, doublings): + """ + This function is used to upscale from the anchor area. + + Parameters + ---------- + S : int or float + Number of species at anchor scale + N : int or float + Number of individuals at anchor scale + doublings : int + Number of doublings of A. Result vector will be length doublings + 1. + + Returns + ------- + result : ndarray + 1D array of number of species at each doubling + + """ + + # Arrays to store N and S at all doublings + n_arr = np.empty(doublings+1) + s_arr = np.empty(doublings+1) + + # Loop through all scales + for i in xrange(doublings+1): + + # If this is first step (doubling 0), N and S are initial values + if i == 0: + n_arr[i] = N + s_arr[i] = S + + # If not first step + else: + + # Get previous S + SA = s_arr[i-1] + + # N is double previous N + n_arr[i] = 2 * n_arr[i-1] + N2A = n_arr[i] + + # Eq 8 from Harte 2009, setup to return S2A given input of x + # x is exp(-lam_phi, 2A) + def S2A_calc(x, SA, N2A): + return ((SA + + N2A * + (1-x)/(x-x**(N2A+1)) * + (1 - (x**N2A)/(N2A+1))) / + x**-1) + + # Eq 9 from Harte 2009, setup to equal to zero, used to solve x + # Note that two summations are replaced by known formulas for sum + # of geometric and logarithmic series. + # Note "offset" of 1e-23, which is needed because f(a) and f(b) do + # not have the same sign in solver otherwise. This introduces no + # more than a 1e-23 error in the calculation of x, which should not + # cause a significant problem. + def x_calc(x, SA, N2A): + return (S2A_calc(x,SA,N2A) / + N2A * + x*(x**N2A-1)/(x-1) - + (x**N2A * (-lerchphi(x,1,N2A+1))-np.log(1-x)) ) - 1e-23 + + # Solve for x + x = (optimize.brentq(x_calc, 1e-24, 1-1e-16, args=(SA,N2A), + xtol=1e-16, maxiter=1000, disp=True) + 1e-23) + + # Given x, calculate S2A + s_arr[i] = S2A_calc(x,SA,N2A) + + return s_arr diff --git a/macroeco/models/_distributions.py b/macroeco/models/_distributions.py new file mode 100644 index 0000000..531c329 --- /dev/null +++ b/macroeco/models/_distributions.py @@ -0,0 +1,1618 @@ +from __future__ import division +import sys + +from decimal import Decimal +import numpy as np +import numpy.random as nprand +from scipy.stats.distributions import (rv_discrete, rv_continuous) + +try: + from scipy.stats.distributions import (docdict, docdict_discrete) +except ImportError: + # Scipy version '0.14.0' support + from scipy.stats._distn_infrastructure import (docdict, docdict_discrete) + +import scipy.stats as stats +import scipy.optimize as optim +import scipy.special as special +import scipy.integrate as integrate + +from ..misc import doc_sub, inherit_docstring_from + + +# Remove header from all methods +_docdict_allmeth = docdict['allmethods'][16:] +_docdict_discrete_allmeth = docdict_discrete['allmethods'][17:] + +# **kwds in expect string followed by no space was throwing warning +_docdict_allmeth = _docdict_allmeth.replace(', **kwds','') + +# Additional docstrings for custom methods +_docdict_rank_method = \ +"""rank(n, %(shapes)s) + Predicted rank abundance distribution. 
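A usage sketch for the METE SAR objects above; S0, N0, and the areas are illustrative, and the first area in the list is treated as A0:

>>> areas = [64, 32, 16, 8]                            # A0 first, then subplots
>>> mete_sar.vals(areas, 50, 2000)                     # downscaled SAR
>>> mete_sar_iterative.vals(areas, 50, 2000)           # iterative form
>>> mete_ear.vals(areas, 50, 2000)                     # endemics-area relationship
>>> mete_upscale_iterative_alt(50, 2000, doublings=3)  # upscaled richness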
+""" + +_docdict_extra_params = \ +"""n : int + number of values +data : array_like + values used to fit distribution +""" + +# Create docstring helpers +docdict['before_notes'] = ''.join([_docdict_rank_method, + _docdict_allmeth, + docdict['callparams'], + _docdict_extra_params]) + +docdict_discrete['before_notes'] = ''.join([_docdict_rank_method, + _docdict_discrete_allmeth, + docdict['callparams'], + _docdict_extra_params]) + +_doc_translate_args = \ +""" +Translates user-friendly arguments into shape parameters + +See distribution docstring for description of user arguments and shape +parameters. + +Parameters +---------- +uargs : floats + User argument(s), usually easily measured and specified + +Returns +------- +tuple of floats + Shape parameter(s) of distribution + +Notes +----- +""" + +_doc_rvs_alt = \ +""" +Alternative random number generator for discrete distributions. Uses the +model's cdf function and a uniform random number generator. Can be faster than +native scipy rvs for some custom models. Will perform well if the the models +cdf function is also fast. + +Parameters +---------- +%(shapes)s : array_like + shape parameters +l : int + Lower bound of distribution (Either 0 or 1). Default is 1 +b : int + Upper bound of distribution for computational purposes, even if + distribution technically has infinite support. Default is 1e5. +size : int + Number of random variables to draw. Default is 1. + +Returns +------- +array + Random variables from model + +Notes +----- +""" + +_doc_fit_mle = \ +""" +Return MLEs for shape parameters from data + +Parameters +---------- +data : array_like + Data to use in calculating the MLEs. +args : floats + Starting value(s) for shape parameters. Some may be held constant + (see Notes). + +Returns +------- +tuple of floats + MLEs for shape parameters + +Notes +----- +""" + +_doc_rank = \ +""" +Return predicted rank abundance distribution + +Parameters +---------- +n : int + Number of values to return +%(shapes)s : array_like + shape parameters + +Returns +------- +array + Values of rank abundance distribution + +Notes +----- +Describe 0.5 offset. References. + +""" +# TODO: Finish doc_rank above + +_doc_make_rank = \ +""" +obj : discrete distribution object + Scipy discrete distribution object +crit : float + A value between 0 - 1. Below this value ppf is used, above a this + value a solver is used. +upper : int + Upper bound to the solver. Rank will not return values above + upper +xtol : float + Precision of the brentq solver. +""" + + +class rv_continuous_meco(rv_continuous): + """ + A modified generic continuous random variable class meant for subclassing. + + This class inherits from the `rv_continuous` class of `scipy.stats` and + contains all of its functionality. See the docstring of `rv_continuous` for + information on usage and subclassing. In addition, this class adds two new + methods. 
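To make the 0.5 offset concrete: the rank method defined below evaluates the ppf at midpoint quantiles, which avoids the degenerate quantiles 0 and 1. A quick sketch for n = 4:

>>> import numpy as np
>>> n = 4
>>> (np.arange(1, n + 1) - 0.5) / n   # quantiles passed to ppf by rank()
                                      # -> [0.125, 0.375, 0.625, 0.875]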
+ + Methods + ------- + translate_args + Shape parameters given user-friendly parameters (see notes) + fit_mle + Shape parameters given data and optional keyword arguments (see notes) + rank + Rank abundance distribution + + """ + + @doc_sub(_doc_translate_args) + def translate_args(self, *args): + """{0}""" + raise NotImplementedError, ("translate_args method not implemented " + "for this distribution") + + @doc_sub(_doc_fit_mle) + def fit_mle(self, *args): + """{0}""" + return self.fit(*args, floc=0, fscale=1)[:-2] + + @doc_sub(_doc_rank) + def rank(self, n, *args): + """{0}""" + return self.ppf((np.arange(1, n+1) - 0.5) / n, *args) + + +class rv_discrete_meco(rv_discrete): + """ + A modified generic discrete random variable class meant for subclassing. + + This class inherits from the `rv_discrete` class of `scipy.stats` and + contains all of its functionality. See the docstring of `rv_discrete` for + information on usage and subclassing. In addition, this class adds two new + methods. + + Methods + ------- + translate_args + Shape parameters given user-friendly parameters (see notes) + fit_mle + Shape parameters given data and optional keyword arguments (see notes) + rank + Rank abundance distribution + + """ + + + @doc_sub(_doc_translate_args) + def translate_args(self, *args): + """{0}""" + raise NotImplementedError, ("translate_args method not implemented " + "for this distribution") + + @doc_sub(_doc_fit_mle) + def fit_mle(self, *args): + """{0}""" + raise NotImplementedError, ("fit_mle method not implemented " + "for this distribution") + + @doc_sub(_doc_rank) + def rank(self, n, *args): + """{0}""" + return self.ppf((np.arange(1, n+1) - 0.5) / n, *args) + + @doc_sub(_doc_rvs_alt) + def rvs_alt(self, *args, **kwargs): + """{0}""" + l = kwargs.get('l', 1) + b = kwargs.get('b', 1e5) + size = kwargs.get('size', 1) + + model_cdf = self.cdf(np.arange(l, b + 1), *args) + + unif_rands = np.random.random(size) + model_rands = np.array([np.where(tx <= model_cdf)[0][0] + l + for tx in unif_rands]) + + return model_rands + + +# +# Discrete +# + +class geom_gen(rv_discrete_meco): + r""" + A geometric discrete random variable. + + This implementation of the geometric distribution differs from that in + `scipy.stats`, as the distribution here has support from 0 to inf. + + .. math:: + P(x) = (1-p)^{x} p + + for ``x >= 0``. The ``loc`` parameter is not used. + + Methods + ------- + translate_args(mu) + Shape parameter p given distribution mean. + fit_mle(data) + ML estimate of shape parameter p given data. + %(before_notes)s + mu : float + distribution mean + + """ + + @inherit_docstring_from(rv_discrete_meco) + def translate_args(self, mu): + return 1 / (np.array(mu) + 1) + + @inherit_docstring_from(rv_discrete_meco) + def fit_mle(self, data): + return self.translate_args(np.mean(data)), + + def _argcheck(self, p): + return (p <= 1) & (p >= 0) + + def _pmf(self, x, p): + return (1-p)**x * p + + def _logpmf(self, x, p): + return x*np.log(1-p) + np.log(p) + + def _cdf(self, x, p): + x = np.floor(x) + return (1.0-(1.0-p)**(x+1)) + + def _stats(self, p): + mu = (1.0 - p) / p + var = (1.0 - p) / p**2 + return mu, var, None, None + +geom = geom_gen(name='geom', shapes='p') + + +class geom_uptrunc_gen(rv_discrete_meco): + r""" + An upper-truncated geometric discrete random variable. + + .. math:: + + P(x) = \frac{(1-p)^{x} p}{1 - (1-p)^{b+1}} + + for ``x >= 0``. ``geom_uptrunc`` takes two shape parameters: ``p`` and + ``b``, the upper limit. The ``loc`` parameter is not used. 
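A quick sketch of the ecological parameterization of geom above; translate_args maps the mean to p and fit_mle is the corresponding ML estimate:

>>> p = geom.translate_args(2.0)            # p = 1 / (mu + 1) = 1/3
>>> geom.pmf(0, p)                          # (1 - p)**0 * p = 1/3
>>> p_hat, = geom.fit_mle([0, 1, 2, 5])     # sample mean 2, so p_hat is also 1/3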
+ + Methods + ------- + translate_args(mu, b) + Shape parameter p given distribution mean and upper limit. + fit_mle(data, b=sum(data)) + ML estimate of shape parameter p given data and upper limit. + %(before_notes)s + mu : float + distribution mean + b : float + distribution upper limit, defaults to sum of data + + Notes + ----- + The boundary ``p = 1`` is a special case in which the ratio between + successive terms of the distribution is 1 (i.e., the pmf is uniform). This + arises when the mean of the distribution is precisely one-half the upper + limit. + + This distribution is known as the Pi distribution in the MaxEnt Theory of + Ecology [#]_, where the ``p`` parameter is equivalent to ``1 - + exp(-lambda)``. The special case of a uniform pmf has been described as + HEAP [#]_. + + References + ---------- + .. [#] + Harte, J. (2011). Maximum Entropy and Ecology: A Theory of + Abundance, Distribution, and Energetics (p. 264). Oxford, United + Kingdom: Oxford University Press. + .. [#] + Harte, J., Conlisk, E., Ostling, A., Green, J. L., & Smith, A. B. + (2005). A theory of spatial structure in ecological communities at + multiple spatial scales. Ecological Monographs, 75(2), 179-197. + + """ + # TODO: Should add a warning for b < 5 or 10 or so (p solver gives erratic + # answers. (This may or may not still be true.) + + @inherit_docstring_from(rv_discrete_meco) + def translate_args(self, mu, b): + return _geom_solve_p_from_mu_vect(mu, b), b + + @inherit_docstring_from(rv_discrete_meco) + def fit_mle(self, data, b=None): + """%(super)s + In addition to data, requires ``b``, the upper limit of the + distribution. + """ + # Take mean of data as MLE of distribution mean, then calculate p + mu = np.mean(data) + if not b: + b = np.sum(data) + p = _geom_solve_p_from_mu_vect(mu, b) + + # Just return float, not len 1 array + if len(np.atleast_1d(p)) == 1: + return float(p), b + else: + return p, b + + def _argcheck(self, p, b): + # Unlike the traditional geometric, p can be < 0 + return (p <= 1) + + def _pmf(self, x, p, b): + pmf = (1.0-p)**x * p / (1.0-(1.0-p)**(b+1)) + if len(np.atleast_1d(x)) > 1: + pmf[x > b] = 0 + elif x > b: + pmf = 0 + return pmf + + def _cdf(self, x, p, b): + x = np.floor(x) + cdf = (1.0-(1.0-p)**(x+1)) / (1.0-(1.0-p)**(b+1)) + if len(np.atleast_1d(x)) > 1: + cdf[x > b] = 1 + elif x > b: + cdf = 1 + return cdf + + def _stats(self, p, b): + mu = (p / (1 - p)) - ((b + 1) / (p**-b - 1)) + return mu, None, None, None + +geom_uptrunc = geom_uptrunc_gen(name='geom_uptrunc', shapes='p, b') + +def _geom_solve_p_from_mu(mu, b): + """ + For the geom_uptrunc, given mu and b, return p. + Ref: Harte 2011, Oxford U Press. Eq. 7.50. + """ + + def p_eq(x, mu, b): + x, mu, b = Decimal(x), Decimal(mu), Decimal(b) + return ( (x / (1 - x)) - ((b + 1) / (x**-b - 1)) - mu ) + + # x here is the param raised to the k_agg power, or 1 - p + return 1 - optim.brentq(p_eq, 1e-16, 100, args=(mu, b), disp=True) + +_geom_solve_p_from_mu_vect = np.vectorize(_geom_solve_p_from_mu) + + +class dgamma_gen(rv_discrete_meco): + r""" + A discrete gamma random variable. + + .. math:: + + P(x) = k * x^{(\alpha - 1)} * e^{(-1 / \theta)*x} + + for ``x >= 1``, ``\theta > 0``. + ``k`` is the normalizing constant. + + Methods + ------- + translate_args(alpha, theta) + not used, returns alpha and theta. 
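Continuing the sketch for the upper-truncated geometric above; mu and b are illustrative, and p is solved numerically from the mean and the upper limit:

>>> p, b = geom_uptrunc.translate_args(10., 100.)    # solve p given mu=10, b=100
>>> geom_uptrunc.cdf(100, p, b)                      # -> 1.0, support ends at b
>>> p_hat, b_hat = geom_uptrunc.fit_mle([1, 3, 5, 11], b=20)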
+ fit_mle(data) + ml estimate of shape parameters alpha and theta given data + %(before_notes)s + alpha : float + distribution parameter + theta : float + distribution parameter + + Notes + ----- + This parameterization of the discrete gamma was taken from [#]_. + + References + ---------- + .. [#] + Frank, F. (2011). Measurement scale in maximum entropy models of species + abundance. Journal of Evolutionary Biology, 24(3), 485-496 + + """ + + @inherit_docstring_from(rv_discrete_meco) + def translate_args(self, alpha, theta): + return alpha, theta + + @inherit_docstring_from(rv_discrete_meco) + def fit_mle(self, data, init_vals=(80, 80)): + """%(super)s + In addition to data, can take init_vals which allows the user to + specify initial values for (alpha, theta) during the optimization. + + """ + + if len(data) > 1: + mu = np.mean(data) + var = np.var(data) + theta0 = var / mu + alpha0 = mu / theta0 + else: + alpha0 = init_vals[0] + theta0 = init_vals[1] + + def mle(params): + return -np.sum(np.log(self.pmf(data, params[0], params[1]))) + + # Bounded fmin? + alpha, theta = optim.fmin(mle, x0=[alpha0, theta0], disp=0) + + return alpha, theta + + def _pmf(self, x, alpha, theta): + + b = 1e5 + alpha = np.atleast_1d(alpha) + theta = np.atleast_1d(theta) + b = np.atleast_1d(b) + x = np.atleast_1d(x) + + eq = lambda val, talpha, ttheta: np.exp((talpha - 1) * np.log(val) - + (val / ttheta)) + + # eq = lambda val, talpha, ttheta: val**(talpha - 1) * \ + # np.exp((-1 / ttheta)*val) + + norm = np.sum(eq(np.arange(1, b[0] + 1), alpha[0], theta[0])) + + pmf = eq(x, alpha, theta) / norm + return pmf + + def _cdf(self, x, alpha, theta): + + alpha = np.atleast_1d(alpha) + theta = np.atleast_1d(theta) + x = np.atleast_1d(x) + + max_x = np.max(x) + pmf_list = self.pmf(np.arange(1, np.int(max_x) + 1), alpha[0], + theta[0]) + full_cdf = np.cumsum(pmf_list) + + cdf = np.array([full_cdf[tx - 1] if tx != 0 else 0 for tx in x]) + + return cdf + + def _argcheck(self, alpha, theta): + + # TODO: Can theta or alpha be 0 in the discrete version? + return (theta > 0) + +dgamma = dgamma_gen(name='dgamma', shapes='alpha, theta') + + +class nbinom_gen(rv_discrete_meco): + r""" + A negative binomial discrete random variable. + + This implementation of the negative binomial distribution differs from that + in `scipy.stats`, as the distribution here uses the more common ecological + parameterization. + + .. math:: + + p(x) = \frac{\gamma (k + x)}{\gamma(k) x!} + \left(\frac{k}{k+\mu}\right)^k \left(\frac{\mu}{k+\mu}\right)^x + + for ``x >= 0``. In the traditional parameterization, ``n = k_agg`` (the + size parameter) and ``p = k_agg / (k_agg + mu)``. the ``loc`` parameter is + not used. + + Methods + ------- + translate_args(mu, k_agg) + not used, returns mu and k_agg. + fit_mle(data, k_range=(0.1,100,0.1)) + ml estimate of shape parameters mu and k_agg given data + %(before_notes)s + mu : float + distribution mean + k_agg : float + clustering parameter + + """ + + @inherit_docstring_from(rv_discrete_meco) + def translate_args(self, mu, k_agg): + return mu, k_agg + + @inherit_docstring_from(rv_discrete_meco) + def fit_mle(self, data, k_array=np.arange(0.1, 100, 0.1)): + """%(super)s + + In addition to data, gives an optional keyword argument k_array + containing the values to search for k_agg. A brute force search is then + used to find the parameter k_agg. 
+ + """ + # todo: check and mention in docstring biases of mle for k_agg + data = np.array(data) + mu = np.mean(data) + return mu, _solve_k_from_mu(data, k_array, nbinom_nll, mu) + + def _get_p_from_mu(self, mu, k_agg): + return k_agg / (k_agg + mu) + + def _rvs(self, mu, k_agg): + p = self._get_p_from_mu(mu, k_agg) + return nprand.negative_binomial(k_agg, p, self._size) + + def _argcheck(self, mu, k_agg): + p = self._get_p_from_mu(mu, k_agg) + return (k_agg >= 0) & (p >= 0) & (p <= 1) + + def _pmf(self, x, mu, k_agg): + return np.exp(self._logpmf(x, mu, k_agg)) + + def _logpmf(self, x, mu, k_agg): + p = self._get_p_from_mu(mu, k_agg) + + coeff =\ + special.gammaln(k_agg+x)-special.gammaln(x+1)-special.gammaln(k_agg) + + return coeff + k_agg*np.log(p) + x*np.log(1-p) + + def _cdf(self, x, mu, k_agg): + p = self._get_p_from_mu(mu, k_agg) + x = np.floor(x) + return special.betainc(k_agg, x+1, p) + + def _ppf(self, q, mu, k_agg): + p = self._get_p_from_mu(mu, k_agg) + vals = np.ceil(special.nbdtrik(q, k_agg, p)) + vals1 = (vals-1).clip(0.0, np.inf) + temp = self._cdf(vals1, k_agg, p) + return np.where(temp >= q, vals1, vals) + + def _stats(self, mu, k_agg): + p = self._get_p_from_mu(mu, k_agg) + Q = 1.0 / p + p = Q - 1.0 + mu = k_agg*p + var = k_agg*p*Q + g1 = (Q+p)/np.sqrt(k_agg*p*Q) + g2 = (1.0 + 6*p*Q) / (k_agg*p*Q) + return mu, var, g1, g2 + +nbinom = nbinom_gen(name='nbinom', shapes='mu, k_agg') + + +def nbinom_nll(data, k_agg, mu): + return -np.sum(nbinom._logpmf(data, mu, k_agg)) + + +class nbinom_ztrunc_gen(rv_discrete_meco): + r""" + The zero-truncated negative binomial random variable. + + This distribution is described by Sampford (1955) [#]_. + + .. math:: + + p(x) = \frac{(k + x - 1)!}{(k - 1)!x!} \left(\frac{p} + {1 + p}\right)^{x} \frac{1}{(1 + p)^{k - 1}} + + for ``x >= 1``. ``p`` can be computed directly from the mean of the + distribution and is calculated internally so that the distribution is + parameterized by ``\mu`` and ``k_agg`` analogous to ``nbinom``. + + Methods + ------- + translate_args(mu, k_agg, return_p=False) + Returns mu and k_agg. Returns p parameter if return_p is True. + fit_mle(data, k_agg0=0.5) + ml estimate of shape parameters mu and k_agg given data + %(before_notes)s + mu : float + distribution mean + k_agg : float + clustering parameter + + Notes + ----- + + References + ---------- + .. [#] + Sampford, M. R. (1955). The truncated negative binomial distribution. + Biometrika, 42(1), 58-69 + + + """ + + @inherit_docstring_from(rv_discrete_meco) + def translate_args(self, mu, k_agg, return_p=False): + """%(super)s + + The keyword argument return_p computes the p values used to define the + the truncated negative binomial + """ + if return_p: + return nbinom_ztrunc_p(mu, k_agg), k_agg + else: + return mu, k_agg + + @inherit_docstring_from(rv_discrete_meco) + def fit_mle(self, data, k_agg0=0.5): + """%(super)s + + In addition to data, gives an optional keyword argument k_agg0 that + specifies the initial value of k_agg used in the optimization. 
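A sketch of the ecological nbinom parameterization above; the counts are illustrative, and fit_mle runs a brute-force search over k_array for k_agg:

>>> import numpy as np
>>> counts = np.array([0, 0, 1, 2, 3, 8, 12])
>>> mu, k_agg = nbinom.fit_mle(counts)      # sample mean plus brute-force k_agg
>>> nbinom.pmf(0, mu, k_agg)                # probability of an empty sample
>>> nbinom.stats(mu, k_agg)                 # mean and variance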
+ + """ + + mu = np.mean(data) + + def mle(k): + + return -np.sum(np.log(self.pmf(data, mu, k))) + + k = optim.fmin(mle, x0=k_agg0, disp=0) + + return mu, k[0] + + def _pmf(self, x, mu, k_agg): + + x = np.atleast_1d(x) + + norm = np.exp(special.gammaln(k_agg + x) - ((special.gammaln(k_agg) + + special.gammaln(x + 1)))) + p = nbinom_ztrunc_p(mu, k_agg) + kernel = (p / (1 + p))**x * (1 / ((1 + p)**k_agg - 1)) + pmf = norm * kernel + + pmf[x == 0] = 0 + + return pmf + + def _stats(self, mu, k_agg): + p = nbinom_ztrunc_p(mu, k_agg) + omega = 1 / (1 + p) + eta = 1 - omega + mu = mu + + # From Sampford 1955 + var = (k_agg * eta * (1 + k_agg * eta)) / \ + (omega**2 * (1 - omega**k_agg)) - mu**2 + return mu, var, None, None + +nbinom_ztrunc = nbinom_ztrunc_gen(name='nbinom_ztrunc', shapes='mu, k_agg') + + +def _nbinom_ztrunc_p(mu, k_agg): + """ Calculates p parameter for truncated negative binomial + + Function given in Sampford 1955, equation 4 + + Note that omega = 1 / 1 + p in Sampford + """ + + p_eq = lambda p, mu, k_agg: (k_agg * p) / (1 - (1 + p)**-k_agg) - mu + + # The upper bound needs to be large. p will increase with increasing mu + # and decreasing k_agg + p = optim.brentq(p_eq, 1e-10, 1e10, args=(mu, k_agg)) + return p + +nbinom_ztrunc_p = np.vectorize(_nbinom_ztrunc_p) + + +class cnbinom_gen(rv_discrete_meco): + r""" + The conditional negative binomial random variable. + + This distribution was described by Zillio and He (2010) [#]_ and Conlisk + et al. (2007) [#]_ + + .. math:: + + p(x) = \frac{\binom{x + k - 1}{x} \binom{b - x + k/a - k -1}{b + -x}}{\binom{b + k/a - 1}{b}} + + for ``x >= 0``. In this parameterization ``a = E[p(x)] / b`` where ``b`` is + the upper limit of the distribution. + + Methods + ------- + translate_args(mu, k_agg, b) + not used, returns mu, k_agg, and b. + fit_mle(data, k_array=np.arange(0.1,100,0.1)) + ml estimate of shape parameters mu and k_agg given data + %(before_notes)s + mu : float + distribution mean + k_agg : float + clustering parameter (refered to as ``k`` above) + b : float + Upper bound of distribution + + References + ---------- + .. [#] + Zillio, T. & He, F. (2010). Modeling spatial aggregation of finite + populations. Ecology, 91(12), 3698-3706 + .. [#] + Conlisk, E., Bloxham, M., Conlisk, J, Enquist, E., and Harte, J. + (2007). A new class of models of spatial distribution. 
Ecological + Monographs, 77(2), 269-284 + """ + + @inherit_docstring_from(rv_discrete_meco) + def translate_args(self, mu, k_agg, b): + return mu, k_agg, b + + @inherit_docstring_from(rv_discrete_meco) + def fit_mle(self, data, b=None, k_array=np.arange(0.1, 100, 0.1)): + + data = np.array(data) + mu = np.mean(data) + + if not b: + b = np.sum(data) + + return mu, _solve_k_from_mu(data, k_array, _cnbinom_nll, mu, b), b + + def _pmf(self, x, mu, k_agg, b): + return np.exp(self._logpmf(x, mu, k_agg, b)) + + def _logpmf(self, x, mu, k_agg, b): + a = mu / b + logpmf = _cnbinom_logpmf(x, b, a, k_agg) + logpmf[x > b] = -np.inf + return logpmf + + def _stats(self, mu, k_agg, b): + mu = mu + var = ((1 - mu / b) * mu * (k_agg + mu)) / (k_agg + (mu / b)) + return mu, var, None, None + +cnbinom = cnbinom_gen(name="cnbinom", shapes="mu, k_agg, b") + + +def _cnbinom_logpmf(n_i, n, a, k_agg): + # Logpmf for cnbinom + return _ln_choose(n_i + k_agg - 1, n_i) + \ + _ln_choose(n - n_i + (k_agg / a) - k_agg - 1, n - n_i) -\ + _ln_choose(n + (k_agg / a) - 1, n) + + +def _cnbinom_nll(data, k_agg, mu, b): + # Negative log likelihood for cnbinom + return -np.sum(cnbinom._logpmf(data, mu, k_agg, b)) + + +def _ln_choose(n, k_agg): + ''' + log binomial coefficient with extended gamma factorials. n and k_agg may be + int or array - if both array, must be the same length. + + ''' + gammaln = special.gammaln + return gammaln(n + 1) - (gammaln(k_agg + 1) + gammaln(n - k_agg + 1)) + + +def _solve_k_from_mu(data, k_array, nll, *args): + """ + For given args, return k_agg from searching some k_range. + + Parameters + ---------- + data : array + k_range : array + nll : function + + args : + + Returns + -------- + :float + Minimum k_agg + + """ + # TODO: See if a root finder like fminbound would work with Decimal used in + # logpmf method (will this work with arrays?) + + nll_array = np.zeros(len(k_array)) + + for i in range(len(k_array)): + nll_array[i] = nll(data, k_array[i], *args) + + min_nll_idx = np.argmin(nll_array) + + return k_array[min_nll_idx] + +class logser_gen(rv_discrete_meco): + """ + A Logarithmic (Log-Series, Series) discrete random variable. + + Notes + ----- + The probability mass function for `logser` is:: + + logser.pmf(k) = - p**k / (k*log(1-p)) + + for ``k >= 1``. + + `logser` takes ``p`` as shape parameter. 
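A short sketch of the finite (conditional) negative binomial above, whose support runs from 0 to the upper limit b; values are illustrative:

>>> import numpy as np
>>> mu, k_agg, b = cnbinom.fit_mle([1, 2, 3, 10], b=16)
>>> cnbinom.pmf(np.arange(0, b + 1), mu, k_agg, b).sum()   # -> ~1 over 0..b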
+ + """ + + @inherit_docstring_from(rv_continuous_meco) + def translate_args(self, mu): + eq = lambda p, mu: -p/np.log(1-p)/(1-p) - mu + return optim.brentq(eq, 1e-16, 1-1e-16, args=(mu), disp=True) + + @inherit_docstring_from(rv_continuous_meco) + def fit_mle(self, data): + # Use method of moments + return self.translate_args(np.mean(data)), + + def _rvs(self, p): + # looks wrong for p>0.5, too few k=1 + # trying to use generic is worse, no k=1 at all + return stats.logser.rvs(p, size=self._size) + #return np.random.mtrand.logseries(p, size=self._size) + + def _argcheck(self, p): + return (p > 0) & (p < 1) + + def _pmf(self, x, p): + return stats.logser.pmf(x, p) + # return -np.power(p, x) * 1.0 / x / np.log(1 - p) + + def _cdf(self, x, p): + return stats.logser.cdf(x, p) + + def _stats(self, p): + r = np.log(1 - p) + mu = p / (p - 1.0) / r + mu2p = -p / r / (p - 1.0)**2 + var = mu2p - mu*mu + mu3p = -p / r * (1.0+p) / (1.0 - p)**3 + mu3 = mu3p - 3*mu*mu2p + 2*mu**3 + g1 = mu3 / np.power(var, 1.5) + + mu4p = -p / r * ( + 1.0 / (p-1)**2 - 6*p / (p - 1)**3 + 6*p*p / (p-1)**4) + mu4 = mu4p - 4*mu3p*mu + 6*mu2p*mu*mu - 3*mu**4 + g2 = mu4 / var**2 - 3.0 + return mu, var, g1, g2 + +logser = logser_gen(name="logser", shapes="p") + + +class logser_uptrunc_gen(rv_discrete_meco): + r""" + Upper truncated logseries random variable. + + This distribution was described by Harte (2011) [#]_ + + .. math:: + + p(x) = \frac{1}{Z} \frac{p^n}{n} + + where ``Z`` is the normalizing factor + + Methods + ------- + translate_args(mu, b) + Translates the mean and the upper bound into p and b. + fit_mle(data) + ml estimate of shape parameter p + %(before_notes)s + p : float + p parameter of the logseries distribution + b : float + Upper bound of the distribution + + + Notes + ----- + Code adapted from Ethan White's macroecology_tools and version 0.1 of + macroeco + + References + ----------- + .. [#] + Harte, J. (2011). Maximum Entropy and Ecology: A Theory of + Abundance, Distribution, and Energetics. Oxford, United + Kingdom: Oxford University Press. + + """ + + @inherit_docstring_from(rv_discrete_meco) + def translate_args(self, mu, b): + return _trunc_logser_solver((1 / mu) * b, b), b + + @inherit_docstring_from(rv_discrete_meco) + def fit_mle(self, data, b=None): + """%(super)s +b : float + The upper bound of the distribution. 
If None, fixed at sum(data) + """ + + data = np.array(data) + length = len(data) + + if not b: + b = np.sum(data) + + return _trunc_logser_solver(length, b), b + + def _pmf(self, x, p, b): + + x = np.atleast_1d(x) + p = np.atleast_1d(p) + b = np.atleast_1d(b) + + if p[0] > 0: + pmf = stats.logser.pmf(x, p) / stats.logser.cdf(b, p) + else: + ivals = np.arange(1, b[0] + 1) + normalization = np.sum(p[0] ** ivals / ivals) + pmf = (p[0] ** x / x) / normalization + + return pmf + + def _cdf(self, x, p, b): + + x = np.atleast_1d(x) + p = np.atleast_1d(p) + b = np.atleast_1d(b) + + if p[0] < 1: + return stats.logser.cdf(x, p) / stats.logser.cdf(b, p) + else: + cdf_list = [sum(self.pmf(range(1, int(x_i) + 1), p[0], b[0])) for + x_i in x] + return np.array(cdf_list) + + def _rvs(self, p, b): + # Code from weecology/macroecotools + + if not self._size: + self._size = 1 + + out = [] + if p < 1: + for i in range(self._size): + rand_logser = stats.logser.rvs(p) + while rand_logser > b: + rand_logser = stats.logser.rvs(p) + out.append(rand_logser) + else: + rand_list = stats.uniform.rvs(size = self._size) + for rand_num in rand_list: + y = lambda x: self.cdf(x, p, b) - rand_num + if y(1) > 0: out.append(1) + else: out.append(int(round(bisect(y, 1, b)))) + return np.array(out) + + def _stats(self, p, b): + + vals = np.arange(1, b + 1) + full_pmf = self.pmf(vals, p, b) + mean, var = _mean_var(vals, full_pmf) + return mean, var, None, None + + +logser_uptrunc = logser_uptrunc_gen(name="logser_uptrunc", shapes="p, b") + + +def _trunc_logser_solver(bins, b): + """ + Given bins (S) and b (N) solve for MLE of truncated logseries + parameter p + + Parameters + ----------- + bins : float + Number of bins. Considered S in an ecological context + b : float + Upper truncation of distribution. Considered N in an ecological context + + Returns + ------- + : float + MLE estimate of p + + Notes + ------ + Adapted from Ethan White's macroecology_tools + """ + + if bins == b: + p = 0 + + else: + BOUNDS = [0, 1] + DIST_FROM_BOUND = 10 ** -15 + m = np.array(np.arange(1, np.int(b) + 1)) + y = lambda x: np.sum(x ** m / b * bins) - np.sum((x ** m) / m) + p = optim.bisect(y, BOUNDS[0] + DIST_FROM_BOUND, + min((sys.float_info[0] / bins) ** (1 / b), 2), + xtol=1.490116e-08, maxiter=1000) + return p + + +class plnorm_gen(rv_discrete_meco): + r""" + Poisson lognormal random variable. + + Methods + ------- + translate_args(mean, sigma) + not implemented + fit_mle(data) + ml estimate of shape parameters mu and sigma + %(before_notes)s + mu : float + mu parameter of the poisson lognormal + sigma : float + sigma parameter of the poisson lognormal + + Notes + ----- + The pmf method was adopted directly from the VGAM package in R. + The VGAM R package was adopted directly from Bulmer (1974) [#]_ + + The fit_mle function was adapted from Ethan White's pln_solver function in + macroeco_distributions (https://github.com/weecology/macroecotools) + + References + ---------- + .. [#] + Bulmer, M. G. (1974). On fitting the poisson lognormal distribution to + species bundance data. Biometrics, 30, 101-110. 
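A sketch tying together the log-series forms above; the abundance vector is illustrative:

>>> import numpy as np
>>> abund = [1, 1, 2, 4, 8, 16]                  # abundances of 6 species
>>> p = logser.translate_args(np.mean(abund))    # untruncated log-series
>>> p_ut, b = logser_uptrunc.fit_mle(abund)      # b defaults to sum(abund) = 32
>>> logser_uptrunc.pmf(1, p_ut, b)               # probability of a singleton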
+ + """ + + @inherit_docstring_from(rv_discrete_meco) + def translate_args(self, mean, sigma): + raise NotImplementedError("Translate args not implemented") + + @inherit_docstring_from(rv_discrete_meco) + def fit_mle(self, data): + + mu0 = np.mean(np.log(np.array(data) + 1)) + sig0 = np.std(np.log(np.array(data) + 1)) + + if sig0 == 0: + + sig0 = 1e-5 # can't be zero + + def mle(params): + return -np.sum(self.logpmf(data, params[0], params[1])) + + # Bounded fmin? + mu, sigma = optim.fmin_bfgs(mle, x0=[mu0, sig0], disp=0) + + return mu, sigma + + @inherit_docstring_from(rv_discrete_meco) + @doc_sub(_doc_make_rank) + def rank(self, n, mu, sigma, crit=.5, upper=10000, xtol=1): + """%(super)s + +Additional Parameters +---------------------- + {0} + + """ + + return _make_rank(self, n, mu, sigma, crit=crit, upper=upper, + xtol=xtol) + + def _argcheck(self, mu, sigma): + return True + + def _pmf(self, x, mu, sigma): + + # TODO: Add approx_cut as keyword. Strange parse_args error + approx_cut = 10 + x = np.array(x) + pmf = np.empty(len(x), dtype=np.float) + xbelow = x <= approx_cut + xabove = x > approx_cut + + # If below, use exact answer + if np.sum(xbelow) > 0: + + pmf[xbelow] = plognorm_intg_vec(x[xbelow], mu[xbelow], + sigma[xbelow]) + + # If above, use approximation + if np.sum(xabove) > 0: + + z = (np.log(x[xabove]) - mu[xabove]) / sigma[xabove] + + pmf_above = ((1 + (z**2 + np.log(x[xabove]) - mu[xabove] - 1) / + (2 * x[xabove] * sigma[xabove]**2)) * np.exp(-0.5 * z**2) / + (np.sqrt(2 * np.pi) * sigma[xabove] * x[xabove])) + + pmf[xabove] = pmf_above + + # If pmf is 0 the likelihood might break + # TODO: This should be fixed in likelihood function as it might apply + # to other distributions + pmf[pmf == 0] = 1e-120 + + return pmf + + def _cdf(self, x, mu, sigma): + + mu = np.atleast_1d(mu) + sigma = np.atleast_1d(sigma) + x = np.atleast_1d(x) + + max_x = np.max(x) + pmf_list = self.pmf(np.arange(np.int(max_x) + 1), mu[0], sigma[0]) + full_cdf = np.cumsum(pmf_list) + + cdf = np.array([full_cdf[tx] for tx in x]) + + return cdf + + +plnorm = plnorm_gen(name='plnorm', shapes='mu,sigma') + + +class plnorm_ztrunc_gen(rv_discrete_meco): + r""" + Zero-truncated poisson lognormal random variable. + + Methods + ------- + translate_args(mean, sigma) + not implemented + fit_mle(data) + ml estimate of shape parameters mu and sigma + %(before_notes)s + mu : float + mu parameter of the poisson lognormal + sigma : float + sigma parameter of the poisson lognormal + + Notes + ----- + The pmf method was adopted directly from the VGAM package in R. + The VGAM R package was adopted directly from Bulmer (1974) [#]_ + + The fit_mle function was adapted from Ethan White's pln_solver function in + macroeco_distributions (https://github.com/weecology/macroecotools) + + References + ---------- + .. [#] + Bulmer, M. G. (1974). On fitting the poisson lognormal distribution to + species bundance data. Biometrics, 30, 101-110. + + """ + + @inherit_docstring_from(rv_discrete_meco) + def translate_args(self, mean, sigma): + raise NotImplementedError("Translate args not implemented") + + @inherit_docstring_from(rv_discrete_meco) + def fit_mle(self, data): + + # Copying code...could we make this a generic function with an eval? + # Or would that slow it down too much? + mu0 = np.mean(np.log(data)) + sig0 = np.std(np.log(data)) + + if sig0 == 0: + + sig0 = 1e-5 # can't be zero + + def mle(params): + return -np.sum(np.log(self._pmf(data, params[0], params[1]))) + + # Bounded fmin? 
+ mu, sigma = optim.fmin_bfgs(mle, x0=[mu0, sig0], disp=0) + + return mu, sigma + + @inherit_docstring_from(rv_discrete_meco) + @doc_sub(_doc_make_rank) + def rank(self, n, mu, sigma, crit=0, upper=10000, xtol=1): + """%(super)s + +Additional Parameters +---------------------- + {0} + + """ + + return _make_rank(self, n, mu, sigma, crit=crit, upper=upper, + xtol=xtol) + def _argcheck(self, mu, sigma): + return True + + def _pmf(self, x, mu, sigma): + + x = np.array(x) + mu = np.atleast_1d(mu) + sigma = np.atleast_1d(sigma) + + norm = 1 - plognorm_intg_vec(0, mu[0], sigma[0]) + pmf_vals = plnorm.pmf(x, mu, sigma) / norm + pmf_vals[x < 1] = 0 + + return pmf_vals + + def _cdf(self, x, mu, sigma): + + # Format input + x = np.array(x) + mu = np.atleast_1d(mu) + sigma = np.atleast_1d(sigma) + + # Calculate cdf from plnorm_gen + norm = 1 - plognorm_intg_vec(0, mu[0], sigma[0]) + cdf_vals = (plnorm.cdf(x, mu, sigma) - + plnorm.cdf(0, mu[0], sigma[0])) / norm + + # Values less than one have zero probability + cdf_vals = np.atleast_1d(cdf_vals) + cdf_vals[x < 1] = 0 + + return cdf_vals + +plnorm_ztrunc = plnorm_ztrunc_gen(name="plnorm_ztrunc", + shapes='mu, sigma') + + +def plognorm_intg(x, mu, sigma): + # Integral for plognorm + eq = lambda t, x, mu, sigma: np.exp(t * x - np.exp(t) - 0.5 * + ((t - mu) / sigma) ** 2) + + intg = integrate.quad(eq, -np.inf, np.inf, args=(x, mu, sigma))[0] + + norm = np.exp(-0.5 * np.log(2 * np.pi * sigma ** 2) - + special.gammaln(x + 1)) + + return norm * intg + +plognorm_intg_vec = np.vectorize(plognorm_intg) + + + +# +# Continuous +# + + +class expon_gen(rv_continuous_meco): + r""" + An exponential continuous random variable. + + .. math:: + + f(x) = \lambda e^{-\lambda x} + + for ``x >= 0``. The ``loc`` and ``scale`` parameters are not used. + + Methods + ------- + translate_args(mu) + Shape parameter mu given distribution mean. + fit_mle(data) + ML estimate of shape parameter lam given data. + %(before_notes)s + mu : float + distribution mean + + """ + + @inherit_docstring_from(rv_continuous_meco) + def translate_args(self, mu): + return 1 / mu + + @inherit_docstring_from(rv_continuous_meco) + def fit_mle(self, data): + # MLE is method of moments for exponential + return 1 / (np.sum(data) / len(data)) + + def _rvs(self, lam): + return nprand.exponential(1/lam, self._size) + + def _pdf(self, x, lam): + return lam * np.exp(-lam*x) + + def _cdf(self, x, lam): + return 1 - np.exp(-lam*x) + + def _entropy(self, lam): + return 1 - np.ln(lam) + + def _stats(self, lam): + return lam**-1, lam**-2, 2, 6 + +expon = expon_gen(a=0.0, name='expon', shapes='lam') + + +class expon_uptrunc_gen(rv_continuous_meco): + r""" + An upper-truncated exponential continuous random variable. + + .. math:: + + f(x) = \frac{\lambda e^{-\lambda x}}{1 - e^{-\lambda b}} + + for ``b >= x >= 0``. The ``loc`` and ``scale`` parameters are not used. + + Methods + ------- + translate_args(mu, b) + Shape parameter lam given distribution mean and upper limit. + fit_mle(data, b=sum(data)) + ML estimate of shape parameter lam given data and upper limit. 
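The exponential forms above have essentially closed-form fits; a short sketch with illustrative data:

>>> lam = expon.fit_mle([1., 2., 3., 6.])                # 1 / mean = 1/3
>>> expon.cdf(3., lam)                                   # 1 - exp(-1), about 0.63
>>> lam_t, b = expon_uptrunc.fit_mle([1., 2., 3., 6.])   # b defaults to sum(data)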
+ %(before_notes)s + mu : float + distribution mean + b : float + distribution upper limit, defaults to sum of data + + """ + + @inherit_docstring_from(rv_continuous_meco) + def translate_args(self, mu, b): + return _expon_solve_lam_from_mu_vect(mu, b), b + + @inherit_docstring_from(rv_continuous_meco) + def fit_mle(self, data, b=None): + """%(super)s + + Additional Parameters + ---------------------- + b : float + The upper limit of the distribution + """ + # Take mean of data as MLE of distribution mean, then calculate p + mu = np.mean(data) + if not b: + b = np.sum(data) + lam = _expon_solve_lam_from_mu_vect(mu, b) + + # Just return float, not len 1 array + if len(np.atleast_1d(lam)) == 1: + return float(lam), b + else: + return lam, b + + def _argcheck(self, lam, b): + return True + + def _pdf(self, x, lam, b): + return (lam * np.exp(-lam*x)) / (1 - np.exp(-lam*b)) + + def _cdf(self, x, lam, b): + return (1 - np.exp(-lam*x)) / (1 - np.exp(-lam*b)) + +expon_uptrunc = expon_uptrunc_gen(a=0.0, name='expon_uptrunc', shapes='lam, b') + +def _expon_solve_lam_from_mu(mu, b): + """ + For the expon_uptrunc, given mu and b, return lam. + Similar to geom_uptrunc + """ + + def lam_eq(lam, mu, b): + # Small offset added to denominator to avoid 0/0 erors + lam, mu, b = Decimal(lam), Decimal(mu), Decimal(b) + return ( (1 - (lam*b + 1) * np.exp(-lam*b)) / + (lam - lam * np.exp(-lam*b) + Decimal(1e-32)) - mu ) + + return optim.brentq(lam_eq, -100, 100, args=(mu, b), disp=True) + +_expon_solve_lam_from_mu_vect = np.vectorize(_expon_solve_lam_from_mu) + + +class lognorm_gen(rv_continuous_meco): + r""" + A lognormal random variable. + + .. math:: + + f(x) = \frac{1}{\sigma x \sqrt{2 \pi}} e^{(\log{x} - \mu)^2 / 2 + \sigma^2} + + Methods + ------- + translate_args(mean, sigma) + Shape parameters mu and sigma given mean and sigma + fit_mle(data, b=sum(data)) + ML estimate of shape parameters mu and sigma + %(before_notes)s + mu : float + mu parameter of lognormal distribution. Mean log(x) + sigma : float + sigma parameter of lognormal distribution. sd of log(x) + + """ + + @inherit_docstring_from(rv_continuous_meco) + def translate_args(self, mean, sigma): + return np.log(mean) - (sigma ** 2 / 2), sigma + + @inherit_docstring_from(rv_continuous_meco) + def fit_mle(self, data, fix_mean=False): + """%(super)s + + Additional Parameters + ---------------------- + fix_mean : bool + Default False. 
If True, fixes mean before optimizing sigma + + """ + + if not fix_mean: + sigma, _, scale = stats.lognorm.fit(data, floc=0) + return np.log(scale), sigma + + else: + mean = np.mean(data) + + # MLE fxn to be optmimized + mle = lambda sigma, x, mean: -1 *\ + np.sum(self._pdf_w_mean(x, mean, sigma)) + + sigma = optim.fmin(mle, np.array([np.std(np.log(data), ddof=1)]), + args=(data, mean), disp=0)[0] + + return self.translate_args(mean, sigma) + + def _pdf_w_mean(self, x, mean, sigma): + """ + Calculates the pdf of a lognormal distribution with parameters mean + and sigma + + Parameters + ---------- + mean : float or ndarray + Mean of the lognormal distribution + sigma : float or ndarray + Sigma parameter of the lognormal distribution + + Returns + ------- + : float or ndarray + pdf of x + """ + + # Lognorm pmf with mean for optimization + mu, sigma = self.translate_args(mean, sigma) + return self.logpdf(x, mu, sigma) + + def _argcheck(self, mu, sigma): + return True + + def _rvs(self, mu, sigma): + return stats.lognorm.rvs(sigma, scale=np.exp(mu), size=self._size) + + def _pdf(self, x, mu, sigma): + return stats.lognorm.pdf(x, sigma, scale=np.exp(mu)) + + def _cdf(self, x, mu, sigma): + return stats.lognorm.cdf(x, sigma, scale=np.exp(mu)) + + def _stats(self, mu, sigma): + return stats.lognorm.stats(sigma, scale=np.exp(mu)) + +lognorm = lognorm_gen(name="lognorm", shapes="mu, sigma") + + +@doc_sub(_doc_make_rank) +def _make_rank(dist_obj, n, mu, sigma, crit=0.5, upper=10000, xtol=1): + """ + Make rank distribution using both ppf and brute force. + + Setting crit = 1 is equivalent to just using the ppf + + Parameters + ---------- + {0} + + """ + qs = (np.arange(1, n + 1) - 0.5) / n + rank = np.empty(len(qs)) + + brute_ppf = lambda val, prob: prob - dist_obj.cdf(val, mu, sigma) + + qs_less = qs <= crit + ind = np.sum(qs_less) + + # Use ppf if qs are below crit + rank[qs_less] = dist_obj.ppf(qs[qs_less], mu, sigma) + + # Use brute force if they are above + for i, tq in enumerate(qs[~qs_less]): + + j = ind + i + try: + # TODO: Use an adaptable lower bound to increase speed + rank[j] = np.abs(np.ceil(optim.brentq(brute_ppf, -1, upper, + args=(tq,), xtol=xtol))) + + except ValueError: + + # If it is above the upper bound set all remaining values + # to the previous value + rank[j:] = np.repeat(rank[j - 1], len(rank[j:])) + break + + return rank + + +def _mean_var(vals, pmf): + """ + Calculates the mean and variance from vals and pmf + + Parameters + ---------- + vals : ndarray + Value range for a distribution + pmf : ndarray + pmf values corresponding with vals + + Returns + ------- + : tuple + (mean, variance) + + """ + + mean = np.sum(vals * pmf) + var = np.sum(vals ** 2 * pmf) - mean ** 2 + return mean, var + + diff --git a/distributions.py b/macroeco/models/distributions_old.py similarity index 88% rename from distributions.py rename to macroeco/models/distributions_old.py index 7b61fec..005d428 100644 --- a/distributions.py +++ b/macroeco/models/distributions_old.py @@ -104,7 +104,7 @@ import numpy as np import scipy.stats as stats import scipy.optimize -import scipy.special +import scipy.special as spec from copy import deepcopy import math as m import scipy.integrate as integrate @@ -512,7 +512,6 @@ def rad(self): return rad - def fit(self, data): ''' Fit method. 
@@ -1320,6 +1319,93 @@ def pmf(self, n): # TODO: Write cdf method based on cdf of plognorm, similar to above +class canonical_lognorm(Distribution): + __doc__ = Distribution.__doc__ + \ + ''' + Description + ------------ + Lognormal distribution + + Parameters + ---------- + mu : float + The mu parameter of the log normal + sigma : float + The sigma parameter of the log normal + n_samp : int or iterable (optional) + Total number of species / samples + tot_obs: int or iterable (optional) + Total number of individuals / observations + + self.var keywords + ----------------- + mu : list of floats + The mu parameter of the lognormal calculated with + np.log(tot_obs / n_samp) - (sigma**2 / 2). + sigma : list of float + The sigma parameter of the log normal + + Notes + ----- + Currently, lognormal is implemented so that mu is calculated using tot_obs, + n_samp, and sigma. While, mu can be passed in as a keyword argument, this + mu will be ignored. + + ''' + + @doc_inherit + def __init__(self, **kwargs): + self.params = kwargs + self.min_supp = 1 + self.par_num = 2 + self.var = {} + + @doc_inherit + def pmf(self, n): + + # Get parameters + tot_obs, n_samp = self.get_params(['tot_obs','n_samp']) + n = expand_n(n, len(tot_obs)) + + # Calculate sigma + sigma = np.sqrt((2 * np.log(n_samp)) / np.log(2)**2) + + # Calculate mu + mu = np.log(tot_obs / n_samp) - (sigma**2 / 2) + self.var['mu'] = mu + self.var['sigma'] = sigma + + # Calculate pmf + pmf = [] + for tmu, tsigma, tn in zip(mu, sigma, n): + tpmf = stats.lognorm.pdf(tn, tsigma, scale=np.exp(tmu)) + pmf.append(tpmf) + + return pmf + + @doc_inherit + def cdf(self, n): + + # Get parameters + tot_obs, n_samp = self.get_params(['tot_obs','n_samp']) + n = expand_n(n, len(tot_obs)) + + # Calculate sigma + sigma = np.sqrt((2 * np.log(n_samp)) / np.log(2)**2) + + # Calculate mu + mu = np.log(tot_obs / n_samp) - (sigma**2 / 2) + self.var['mu'] = mu + self.var['sigma'] = sigma + + #Calculate cdf + cdf = [] + for tmu, tsigma, tn in zip(mu, sigma, n): + tcdf = stats.lognorm.cdf(tn, tsigma, scale=np.exp(tmu)) + cdf.append(tcdf) + + return cdf + class lognorm(Distribution): __doc__ = Distribution.__doc__ + \ @@ -1739,7 +1825,7 @@ def pmf(self, n): for tn_samp, ttot_obs, tn in zip(n_samp, tot_obs, n): ttot_obs = np.round(ttot_obs, decimals=0) #sumg = sum(eq(np.arange(1, np.floor(ttot_obs) + 1), tn_samp, ttot_obs)) - tpmf = eq(tn, tn_samp, ttot_obs)# / sumg # Normalizing + tpmf = eq(tn, tn_samp, ttot_obs) #/ sumg # Normalizing pmf.append(tpmf) return pmf @@ -2136,7 +2222,9 @@ def nll_nb(k): return self -class nbd_lt(nbd): + +class nbd_lt(Distribution): + __doc__ = Distribution.__doc__ + \ ''' Description ----------- @@ -2153,34 +2241,31 @@ class nbd_lt(nbd): self.var keywords ----------------- - Parameterization differs for different forms of the nbd. We use the - standard ecological form as described by Ben Bolker. Parameters 'a' (1 / - n_samp), 'tot_obs', and k are used to derive the nbd parameter p (see code - for details). Parameters k and p are used to generate distribution. k is - included in self.var if it is calculated in fit. - p : array of floats - p parameters of nbd + p parameters of nbd_lt k : array of floats Aggregation parameter + k is included in self.var if it is calculated in fit. Notes ----- The total species (S) is equivalent to n_samp and the total individuals (N) is equivalent to tot_obs. 
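+
+    As a sketch of what the pmf method below computes, the zero-truncated
+    form used here is, for n = 1, 2, ...
+
+        P(n) = [Gamma(k + n) / (Gamma(k) * Gamma(n + 1))] *
+               (p / (1 + p))**n / ((1 + p)**k - 1)
+
+    with p chosen so that k * p / (1 - (1 + p)**-k) = tot_obs / n_samp.
+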
+ Parameterization based on Sampford 1955 and He and Legendre 2002 + ''' + @doc_inherit def __init__(self, **kwargs): self.params = kwargs self.min_supp = 1 - self.par_num = 2 + self.par_num = 2 self.var = {} - - + def pmf(self, n): - ''' + """ Probability mass function method. Parameters @@ -2195,22 +2280,57 @@ def pmf(self, n): List of 1D arrays of probability of observing sample n. See class docstring for more specific information on this distribution. - ''' + + + """ # Get parameters - n_samp, tot_obs, k = self.get_params(['n_samp', 'tot_obs', 'k']) + n_samp, tot_obs, k =\ + self.get_params(['n_samp', 'tot_obs', 'k']) n = expand_n(n, len(n_samp)) - - # TODO: Additional checks? - reg_nbd = nbd(n_samp=n_samp, tot_obs=tot_obs, k=k) - reg_pmf = reg_nbd.pmf(n) - self.var = reg_nbd.var - reg_pmf0 = reg_nbd.pmf(0) + assert np.all(n_samp <= tot_obs), 'n_samp must be <= tot_obs' - trunc_pmf = [(pr / (1 - p0)) for pr, p0 in zip(reg_pmf, reg_pmf0)] + # Calculate pmf + def pmf_eq(n, p, k): - return trunc_pmf + norm = np.exp(spec.gammaln(k + n) - ((spec.gammaln(k) + + spec.gammaln(n + 1)))) + + kernel = (p / (1 + p))**n * (1 / ((1 + p)**k - 1)) + return norm * kernel + + self.var['p'] = [] + + pmf = [] + p_eq = lambda p, k, N, S : (k * p) / (1 - (1 + p)**-k) -\ + (float(N) / S) + + for tn_samp, ttot_obs, tk, tn in zip(n_samp, tot_obs, k, n): + # Find p + + do_it = True + count = 0 + while do_it and count < 20: + + stop = 10**(count + 1) + count += 1 + + try: + tp = scipy.optimize.brentq(p_eq, 1e-10, stop, args=(tk, + ttot_obs, tn_samp)) + do_it = False + except(ValueError): + if count >= 20: + tp = np.nan + + self.var['p'].append(tp) + tpmf = pmf_eq(tn, tp, tk) + + pmf.append(tpmf) + + self.var['p'] = np.array(self.var['p']) + return pmf def cdf(self, n): ''' @@ -2225,24 +2345,74 @@ def cdf(self, n): Returns ------- cdf : list of ndarrays - List of 1D arrays of probability of observing sample n. + List of 1D arrays of cumulative probability of observing sample n. See class docstring for more specific information on this distribution. ''' + + for kw in self.params.iterkeys(): + if not np.iterable(self.params[kw]): + self.params[kw] = make_array(self.params[kw]) - n_samp, tot_obs, k = self.get_params(['n_samp', 'tot_obs', 'k']) - n = expand_n(n, len(n_samp)) + # Expand n argument if needed, assumes all params same length + n = expand_n(n, len(self.params.values()[0])) + + # Calculate pmfs + max_n = [np.max(tn) for tn in n] + n_in = [np.arange(self.min_supp, i + 1) for i in max_n] + + pmf_list = self.pmf(n_in) + + # Calculate cdfs + cdf = [] + for tpmf, tn in zip(pmf_list, n): + full_cdf = np.cumsum(tpmf) + tcdf = np.array([full_cdf[x - self.min_supp] for x in tn]) + cdf.append(tcdf) + + return cdf + + def fit(self, data, guess_for_k=1): + ''' + Fit method. + + Uses input data to get best fit parameters for distribution, and stores + these parameters in params attribute. - # TODO: Additional checks? + Parameters + ---------- + data : list of ndarrays + Data to use to fit parameters of distribution. Even if only one + data array, must be in a list with one element. + guess_for_k : float + Initial guess for parameter k in solver + + See class docstring for more specific information on this distribution. 
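+
+        Examples
+        --------
+        # Illustrative sketch only; the abundance list below is made up.
+        # dist = nbd_lt().fit([[8, 6, 4, 2]])
+        # dist.params['k']   # ML estimate of k (also stored in dist.var['k'])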
+ ''' - reg_nbd = nbd(n_samp=n_samp, tot_obs=tot_obs, k=k) - p0 = reg_nbd.pmf(0) - self.var = reg_nbd.var - reg_cdf = reg_nbd.cdf(n) + super(nbd_lt, self).fit(data) + n_samp, tot_obs = self.get_params(['n_samp', 'tot_obs']) + + data = check_list_of_iterables(data) + tempk = [] - trun_cdf = [(tcdf - tp0) / (1 - tp0) for tcdf, tp0 in zip(reg_cdf, p0)] + for tdata, tn_samp, ttot_obs in zip(data, n_samp, tot_obs): - return trun_cdf + def nll_nb(k): + self.params['tot_obs'] = ttot_obs + self.params['n_samp'] = tn_samp + self.params['k'] = k + return -sum(np.log(self.pmf(tdata)[0])) + + mlek = scipy.optimize.fmin(nll_nb, np.array([guess_for_k]), + disp=0)[0] + tempk.append(mlek) + self.params['k'] = np.array(tempk) + self.params['n_samp'] = n_samp + self.params['tot_obs'] = tot_obs + self.var['k'] = np.array(tempk) + + return self class fnbd(Distribution): __doc__ = Distribution.__doc__ + \ @@ -2519,12 +2689,11 @@ def pmf(self, n): # Get parameters n_samp, tot_obs = self.get_params(['n_samp', 'tot_obs']) n = expand_n(n, len(n_samp)) - - # TODO: Additional checks? + + # Define normalizing constant and pmf functions + z_func = lambda x, ttot_obs: (1 - x ** (ttot_obs + 1)) / (1 - x) + pmf_func = lambda z, x, tn: (1 / z) * (x ** tn) - #NOTE: Overflow warning but not affecting results - eq = lambda x, N, a: ((x / (1 - x)) - (((N + 1) * x ** (N + 1)) / \ - (1 - x ** (N + 1)))) - (N * a) pmf = [] self.var['x'] = [] for tn_samp, ttot_obs, tn in zip(n_samp, tot_obs, n): @@ -2541,25 +2710,49 @@ def pmf(self, n): tpmf[np.where(tn == ttot_obs)[0]] = 1 x = 0 + elif ta < 0.5: + try: + stop = 1 - 1e-10 + # This is very brittle for some reason. Changing the stop + # value can make this fail for strange reasons + x = scipy.optimize.brentq(l_solver, 0, .999999, + args=(ttot_obs, ta), disp=False) + except: + try: + x = scipy.optimize.brentq(l_solver, 0, .95, + args=(ttot_obs, ta), disp=False) + except: + raise ValueError("No solution to " + + "%s.pmf when tot_obs = " % + (self.__class__.__name__) + + "%.2f, n_samp = %.10f and a = %.10f" % + (ttot_obs, tn_samp, ta)) + z = z_func(x, ttot_obs) + tpmf = pmf_func(z, x, tn) else: try: - x = scipy.optimize.brentq(eq, 0, min((sys.float_info[0] * - ta)**(1/float(ttot_obs)), 8), args=(ttot_obs, ta), - disp=False, xtol=1e-60) + x = scipy.optimize.brentq(l_solver, 0, + min((sys.float_info[0] * ta)**(1/float(ttot_obs)), + 8), args=(ttot_obs, ta), disp=False, + xtol=1e-60, max_iter=200) + except: + try: # Allows it to pass, but optimizer starts rounding. # Not Sure why it is doing this. - x = scipy.optimize.brentq(eq, 8.0, 50.0, \ - args=(ttot_obs, ta), disp=False, xtol=1e-60) + x = scipy.optimize.brentq(l_solver, 8.0, 50.0, \ + args=(ttot_obs, ta), disp=False, xtol=1e-60, + max_iter=200) except: - raise ValueError("No solution to %s.pmf when tot_obs = " % - (self.__class__.__name__) + - "%.2f, n_samp = %.10f and a = %.10f" % - (ttot_obs, tn_samp, ta)) - z = (1 - x ** (ttot_obs + 1)) / (1 - x) - tpmf = (1 / z) * (x ** tn) + raise ValueError("No solution to " + + "%s.pmf when tot_obs = " % + (self.__class__.__name__) + + "%.2f, n_samp = %.10f and a = %.10f" % + (ttot_obs, tn_samp, ta)) + z = z_func(x, ttot_obs) + tpmf = pmf_func(z, x, tn) pmf.append(tpmf) self.var['x'].append(x) @@ -2805,7 +2998,25 @@ class gen_sar(Curve): plognorm and plognorm_lt are not supported by gen_sar. If one would like them to be supported, the full pmf for the sad must be calculated in the fit method. 
- + + Examples + -------- + import distributions as dist + + # Make an SAR with a Logseries SAD and Truncated Geometric SSAD. The + # community has 500 individuals and 14 species + + sar1 = dist.gen_sar(dist.logser(), dist.tgeo(), tot_obs=500, n_samp=14) + + # Number of species in half the base area and double the base area + sar1.vals([.5, 2]) + + # Make an SAR with Logseries and Truncated NBD + sar2 = dist.gen_sar(dist.logser(), dist.tnbd(k=.2), tot_obs=500, n_samp=14) + + # Iterated the SAR 2 doublings from the base scale + sar2.iter_vals(upscale=2) + ''' @@ -3523,12 +3734,8 @@ class nu(Distribution): self.var keywords ----------------- - beta : list of floats - The beta lagrange multiplier lambda_2 : list of floats The lambda2 lagrange multiplier - sigma : list of floats - The sigma lagrange multiplier Notes ----- @@ -3559,36 +3766,23 @@ def pmf(self, e): n_samp, tot_obs, E = self.get_params(['n_samp', 'tot_obs', 'E']) e = expand_n(e, len(n_samp)) - start = 0.3 - stop = 2 - flmax = sys.float_info[0] pmf = [] - self.var['beta'] = [] self.var['lambda_2'] = [] + convert_e = lambda ep, l2: 1 / (l2 * (ep - 1)) + for tn_samp, ttot_obs, tE, te in zip(n_samp, tot_obs, E, e): - k = np.linspace(1, ttot_obs, num=ttot_obs) - try: - tx = scipy.optimize.brentq(beta_solver, start, - min((flmax/tn_samp)**(1/float(ttot_obs)), stop), - args = (k, ttot_obs, tn_samp), disp=True) - except(ValueError): - raise ValueError("No solution to %s.pmf for tot_obs = %.2f" - % (self.__class__.__name__, ttot_obs) + - " and n_samp = %.2f" % (tn_samp)) # Set lagrange multipliers - tbeta = -np.log(tx) tl2 = float(tn_samp) / (tE - ttot_obs) # Harte (2011) 7.26 e_max = 1 + (1 / tl2) e_min = 1 + (1 / (ttot_obs * tl2)) - norm = integrate.quad(nu_pmf_eq, e_min, e_max, (tbeta, tl2, - tn_samp))[0] tpmf = np.empty(len(te), dtype=float) + tns = np.ceil(convert_e(te, tl2)) - # Parse values that aren't in range as set to zero + # Parse values that aren't in range and set to zero ind_tot = np.arange(len(tpmf)) ind_less = np.where(te >= e_min)[0] ind_more = np.where(te <= e_max)[0] @@ -3598,11 +3792,10 @@ def pmf(self, e): tpmf[ind_exclude] = 0 if len(ind_include) != 0: - tpmf[ind_include] =\ - nu_pmf_eq(te[ind_include], tbeta, tl2, tn_samp) / norm + tpmf[ind_include] = logser_ut(tot_obs=ttot_obs, + n_samp=tn_samp).pmf(tns[ind_include])[0] pmf.append(tpmf) - self.var['beta'].append(tbeta) self.var['lambda_2'].append(tl2) return pmf @@ -3613,67 +3806,236 @@ def cdf(self, e): n_samp, tot_obs, E = self.get_params(['n_samp', 'tot_obs', 'E']) e = expand_n(e, len(n_samp)) - start = 0.3 - stop = 2 - flmax = sys.float_info[0] cdf = [] self.var['beta'] = [] self.var['lambda_2'] = [] + convert_n = lambda n, l2: 1 + (1 / (n * l2)) + for tn_samp, ttot_obs, tE, te in zip(n_samp, tot_obs, E, e): - k = np.linspace(1, ttot_obs, num=ttot_obs) - try: - tx = scipy.optimize.brentq(beta_solver, start, - min((flmax/tn_samp)**(1/float(ttot_obs)), stop), - args = (k, ttot_obs, tn_samp), disp=True) - except(ValueError): - raise ValueError("No solution to %s.pmf for tot_obs = %.2f" - % (self.__class__.__name__, ttot_obs) + - " and n_samp = %.2f" % (tn_samp)) - # Set lagrange multipliers - tbeta = -np.log(tx) tl2 = float(tn_samp) / (tE - ttot_obs) # Harte (2011) 7.26 - e_max = 1 + (1 / tl2) - e_min = 1 + (1 / (ttot_obs * tl2)) - tcdf = np.empty(len(te), dtype=float) + # Set all e so you can sum + all_e = convert_n(np.arange(1, ttot_obs + 1), tl2)[::-1] + + pmf_for_all_e = nu(tot_obs=ttot_obs, n_samp=tn_samp, + E=tE).pmf(all_e)[0] + cum_sum = 
np.cumsum(pmf_for_all_e) + + tcdf = np.array([cum_sum[np.sum(e_val >= all_e) - 1] if sum(e_val + >= all_e) - 1 != -1 else 0 for e_val in te]) + + cdf.append(tcdf) + self.var['lambda_2'].append(tl2) + + return cdf + + def rad(self): + ''' + This rad uses the observed cdf for a given nu distribution and the + predicted cdf to calculate the rank energy distribution. + + Returns + ------- + : list + A list of rank energy distributions + + ''' + + n_samp, tot_obs, E = self.get_params(['n_samp', 'tot_obs', 'E']) + rad = [] + + convert_n = lambda n, l2: 1 + (1 / (n * l2)) + + for tn_samp, ttot_obs, tE in zip(n_samp, tot_obs, E): + + # Set temp params + self.params['n_samp'] = tn_samp + self.params['ttot_obs'] = ttot_obs + self.params['E'] = tE + + tl2 = float(tn_samp) / (tE - ttot_obs) # Harte (2011) 7.26 + all_e = convert_n(np.arange(1, ttot_obs + 1), tl2)[::-1] + tpmf = self.pmf(all_e)[0] + tcdf = np.cumsum(tpmf) + + # Observed cdf. Not quite true if some energies overlap + obs_cdf = np.arange(1 / (2 * (tn_samp)), 1, 1/tn_samp) + + trad = [all_e[sum(oc >= tcdf) - 1] if sum(oc >= tcdf) - 1 != -1 + else all_e[0] for oc in obs_cdf] + + rad.append(trad) + + self.params['n_samp'] = n_samp + self.params['ttot_obs'] = tot_obs + self.params['E'] = E + + return rad - # Parse values that aren't in range as set to 0 or 1 - ind_tot = np.arange(len(tcdf)) - ind_less = np.where(te < e_min)[0] - ind_more = np.where(te > e_max)[0] - ind_combo = np.concatenate((ind_more, ind_less)) - ind_include = np.array(list(set(ind_tot) - set(ind_combo))) + + def fit(self, data): + ''' + Fit the average species energy distribution to data + + Parameters + ---------- + data : list of tuples - if len(ind_less) != 0: - tcdf[ind_less] = 0 - if len(ind_more) != 0: - tcdf[ind_more] = 1 + A list containing tuples of length two or a list containing tuples + of length three. If the tuples are of length two, the first object + in a tuple is an iterable containing the community individual energy + distribution. The second object in a tuple is an iterable + containing the empirical species abundance distribution. If the + tuples are of length three, the first object in the tuple is an + iterable containing the average energy distribution. The second object + in a tuple an iterable containing the community individual energy + distribution. The third object in a tuple is an iterable + containing the empirical species abundance distribution. + + ''' + + # Unpack the list of tuples + # Can either take + if len(data[0]) == 2: + ied, sad = unpack(data) + elif len(data[0]) == 3: + ased, ied, sad = unpack(data) + + # Use base class fit + super(nu, self).fit(sad) - norm = integrate.quad(nu_pmf_eq, e_min, e_max, (tbeta, tl2, - tn_samp))[0] + # Format and check energy data + data_eng = check_list_of_iterables(ied) + + # Store energy data in self.params + E = [np.sum(np.array(edata)) for edata in data_eng] + self.params['E'] = E + + return self + +class omega(Distribution): + """ + This distribution is the distribution of total energy within a species + across all species. The means of this distribution is E / S. 
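+    It is a discrete distribution supported on the values
+    e = n + (1 / lambda_2) for n = 1, 2, ..., tot_obs; see the Notes in the
+    pmf method below.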
+ + Parameters + ---------- + n_samp : int or iterable + Total number of species / samples + tot_obs: int or iterable + Total number of individuals / observations + E : int or iterable + Total energy output of community + + self.var keywords + ----------------- + lambda_2 : list of floats + The lambda2 lagrange multiplier + emaxmin : list fo tuples + Each tuple contains the max total energy and min total energy for the + given state variables. + + Notes + ----- + This is a discrete distribution. + + + """ + + def pmf(self, e): + ''' + Notes + ----- + The omega distribution is only defined at e values given by + e = n + (1 / lambda2). While this function will return a pmf + value for all e greater than or equal to one, note that the pmf will + only sum to one when provided with the proper support. lambda2 can be + calculated by the equation: n_samp / (E - tot_obs) or S / (E - N) + + + ''' + + n_samp, tot_obs, E = self.get_params(['n_samp', 'tot_obs', 'E']) + e = expand_n(e, len(n_samp)) + + pmf = [] + self.var['lambda_2'] = [] + self.var['emaxmin'] = [] + + convert_e = lambda ep, l2: ep - (1 / l2) + + for tn_samp, ttot_obs, tE, te in zip(n_samp, tot_obs, E, e): + + # Set lagrange multipliers + tl2 = float(tn_samp) / (tE - ttot_obs) # Harte (2011) 7.26 + e_max = ttot_obs + (1 / tl2) + e_min = 1 + (1 / tl2) + + tpmf = np.empty(len(te), dtype=float) + tns = convert_e(te, tl2) + + # Parse values that aren't in range and set to zero + ind_tot = np.arange(len(tpmf)) + ind_less = np.where(te >= e_min)[0] + ind_more = np.where(te <= e_max)[0] + ind_include = np.intersect1d(ind_more, ind_less) + ind_exclude = np.array(list(set(ind_tot) - set(ind_include))) + if len(ind_exclude) != 0: + tpmf[ind_exclude] = 0 + if len(ind_include) != 0: - tcdf[ind_include] = np.array([integrate.quad(nu_pmf_eq, e_min, se, - (tbeta, tl2, tn_samp))[0] / norm for se in - te[ind_include]]) + tpmf[ind_include] = logser_ut(tot_obs=ttot_obs, + n_samp=tn_samp).pmf(tns[ind_include])[0] + + pmf.append(tpmf) + self.var['lambda_2'].append(tl2) + self.var['emaxmin'].append((e_max, e_min)) + + return pmf + + @doc_inherit + def cdf(self, e): + + n_samp, tot_obs, E = self.get_params(['n_samp', 'tot_obs', 'E']) + e = expand_n(e, len(n_samp)) + + + cdf = [] + self.var['lambda_2'] = [] + + convert_n = lambda n, l2: n + (1 / l2) + + for tn_samp, ttot_obs, tE, te in zip(n_samp, tot_obs, E, e): + + tl2 = float(tn_samp) / (tE - ttot_obs) # Harte (2011) 7.26 + + # Set all e so you can sum + all_e = convert_n(np.arange(1, ttot_obs + 1), tl2) + + pmf_for_all_e = omega(tot_obs=ttot_obs, n_samp=tn_samp, + E=tE).pmf(all_e)[0] + cum_sum = np.cumsum(pmf_for_all_e) + + tcdf = np.array([cum_sum[np.sum(e_val >= all_e) - 1] if sum(e_val + >= all_e) - 1 != -1 else 0 for e_val in te]) cdf.append(tcdf) - self.var['beta'].append(tbeta) self.var['lambda_2'].append(tl2) return cdf - def rad(self, tol=.1): + def rad(self): ''' - This rad uses the observed cdf for a given nu distribution and the + This rad uses the observed cdf for a given omega distribution and the predicted cdf to calculate the rank energy distribution. Parameter ---------- tol : float - Precision interval. The integral of nu is approximated at the + Precision interval. The integral of omega is approximated at the interval tol. Smaller intervals can be more precise, but a tol between 0.1 and 0.5 is more effecient and the results are changed only marginally. 
@@ -3688,6 +4050,8 @@ def rad(self, tol=.1): n_samp, tot_obs, E = self.get_params(['n_samp', 'tot_obs', 'E']) rad = [] + convert_n = lambda n, l2: n + (1 / l2) + for tn_samp, ttot_obs, tE in zip(n_samp, tot_obs, E): # Set temp params @@ -3696,19 +4060,15 @@ def rad(self, tol=.1): self.params['E'] = tE tl2 = float(tn_samp) / (tE - ttot_obs) # Harte (2011) 7.26 - e_max = 1 + (1 / tl2) - e_min = 1 + (1 / (ttot_obs * tl2)) - - num = np.round((e_max - e_min) / tol, decimals=0) - eng = np.linspace(e_min, e_max + tol, num=num) - diff = eng[1] - eng[0] - - tcdf = np.cumsum(diff * self.pmf(eng)[0]) + all_e = convert_n(np.arange(1, ttot_obs + 1), tl2) + tpmf = self.pmf(all_e)[0] + tcdf = np.cumsum(tpmf) # Observed cdf. Not quite true if some energies overlap obs_cdf = np.arange(1 / (2 * (tn_samp)), 1, 1/tn_samp) - trad = [eng[sum(oc >= tcdf) - 1] for oc in obs_cdf] + trad = [all_e[sum(oc >= tcdf) - 1] if sum(oc >= tcdf) - 1 != -1 + else all_e[0] for oc in obs_cdf] rad.append(trad) @@ -3729,14 +4089,14 @@ def fit(self, data): A list containing tuples of length two or a list containing tuples of length three. If the tuples are of length two, the first object - in a tuple is an iterable containing the community individual energy - distribution. The second object in a tuple is an iterable + in a tuple is an iterable containing the community individual + energy distribution. The second object in a tuple is an iterable containing the empirical species abundance distribution. If the tuples are of length three, the first object in the tuple is an - iterable containing the average energy distribution. The second object - in a tuple an iterable containing the community individual energy - distribution. The third object in a tuple is an iterable - containing the empirical species abundance distribution. + iterable containing the total species energy distribution. The + second object in a tuple an iterable containing the community + individual energy distribution. The third object in a tuple is an + iterable containing the empirical species abundance distribution. ''' @@ -3745,7 +4105,7 @@ def fit(self, data): if len(data[0]) == 2: ied, sad = unpack(data) elif len(data[0]) == 3: - ased, ied, sad = unpack(data) + tsed, ied, sad = unpack(data) # Use base class fit super(nu, self).fit(sad) @@ -3780,6 +4140,28 @@ def nu_pmf_eq(es, beta, l2, s): return (1 / np.log(s / beta)) * (np.exp(-beta / (l2 * (es - 1)))) / \ (es - 1) +def l_solver(x, N, a): + """ + Used with a solver to get the langrange multiplier for a pi distribution + + Parameters + ---------- + x : float + Lagrange multiplier x = e**-lambda + N : float + total balls (individuals) in urn (species) + a : float + area fraction. 1 / n_samp or 1 / urn_number + + Returns + ------- + : float + + + """ + return ((x / (1 - x)) - (((N + 1) * x ** (N + 1)) / \ + (1 - x ** (N + 1)))) - (N * a) + def beta_solver(x, k, tot_obs, n_samp): """ Used with a solver to get the beta lagrange multiplier in the METE distributions. With a solver, this function @@ -3805,12 +4187,20 @@ def beta_solver(x, k, tot_obs, n_samp): return sum(x ** k / float(tot_obs) * n_samp) - sum((x ** k) / k) -def make_array(n): - '''Cast n as iterable array.''' +def make_array(n, dtype=None): + '''Cast n as iterable array. If dtype not none this will be the dtype of + the array. Otherwise it lets python choose. 
Must be a valid dtype or an + error will be thrown''' if np.iterable(n): - return np.array(n) + if dtype==None: + return np.array(n) + else: + return np.array(n, dtype=dtype) else: - return np.array([n]) + if dtype==None: + return np.array([n]) + else: + return np.array([n], dtype=dtype) def expand_n(n, size): @@ -3952,7 +4342,7 @@ def _ln_choose(n, k): Log binomial coefficient with extended gamma factorials. n and k may be int or array - if both array, must be the same length. ''' - gammaln = scipy.special.gammaln + gammaln = spec.gammaln return gammaln(n + 1) - (gammaln(k + 1) + gammaln(n - k + 1)) def set_up_and_down(anch, a_list, base=2): diff --git a/macroeco/models/test_curves.py b/macroeco/models/test_curves.py new file mode 100644 index 0000000..01e2c0c --- /dev/null +++ b/macroeco/models/test_curves.py @@ -0,0 +1,59 @@ +from __future__ import division + +from numpy.testing import (TestCase, assert_equal, assert_array_equal, + assert_almost_equal, assert_array_almost_equal, + assert_allclose, assert_, assert_raises) + +import numpy as np +from decimal import Decimal +from macroeco.models import * +import scipy as sp +import scipy.stats as stats + + +class METE_SAR(TestCase): + + def test_reversible(self): + S0, N0 = 100, 1e6 + As = np.array([100,50,10]) + Ns = N0 * As / As[0] + + Ss = mete_sar.vals(As, 100, 1e6, approx=True) + + # Start with each smaller base and go directly up to A0 + for A, S, N in zip(As[1:], Ss[1:], Ns[1:]): + assert_almost_equal(S0, + mete_sar.vals([A, As[0]], S, N, approx=True)[1]) + + def test_vals_down(self): + pass + + def test_vals_up(self): + pass + +class METE_iterative_SAR(TestCase): + + def test_reversible(self): + S0, N0 = 100, 1e6 + As = np.array([100,50,10]) + Ns = N0 * As / As[0] + + Ss = mete_sar_iterative.vals(As, 100, 1e6, approx=True) + + assert_array_almost_equal(Ss[::-1], + mete_sar_iterative.vals(As[::-1], Ss[-1], Ns[-1], approx=True)) + + def test_vals_down(self): + pass + + def test_vals_up(self): + # ACARI results from Bassett upscaling paper, see SI + # Note that different approximations are used here and in that analysis + S0, N0 = 86.6, 2015 + As = [0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56] + + Ss = mete_sar_iterative.vals(As, S0, N0, approx=True) + + assert_array_almost_equal(Ss, + [86.6, 106.0327113, 127.1223631, 149.7292838, + 173.7360065, 199.0452844, 225.5766732]) diff --git a/macroeco/models/test_distributions.py b/macroeco/models/test_distributions.py new file mode 100644 index 0000000..f518bdd --- /dev/null +++ b/macroeco/models/test_distributions.py @@ -0,0 +1,749 @@ +from __future__ import division + +from numpy.testing import (TestCase, assert_equal, assert_array_equal, + assert_almost_equal, assert_array_almost_equal, + assert_allclose, assert_, assert_raises) + +import numpy as np +from decimal import Decimal +from macroeco.models import * +from macroeco.models._distributions import _trunc_logser_solver +import matplotlib.pyplot as plt +import scipy as sp +import scipy.stats as stats + + +class TestGeom(TestCase): + + def test_pmf(self): + vals = geom.pmf([0,1,2], 0.25) + assert_array_almost_equal(vals, np.array([0.25, 0.1875, 0.140625])) + + def test_mean(self): + mu1 = geom.mean(0.5) + assert_almost_equal(mu1, 1) + + mu2 = geom.mean(0.25) + assert_almost_equal(mu2, 3) + + def test_cdf(self): + vals = geom.cdf([0,1,2], 0.5) + assert_array_almost_equal(vals, [0.5,0.75,0.875]) + + def test_translate_args(self): + ps = geom.translate_args([10, 20]) + assert_array_almost_equal(ps, [1/11, 1/21]) + + def test_fit_mle(self): + 
p = geom.fit_mle([1,2,4,5]) + assert_almost_equal(p, 0.25) + + +class TestGeomUptrunc(TestCase): + + def test_pmf(self): + # Expected values are regular geo cdf divided by cdf at b + vals = geom_uptrunc.pmf([0,1,2], 0.25, 2) + assert_array_almost_equal(vals, + np.array([0.25,0.1875,0.140625]) / 0.578125) + + def test_cdf(self): + # Expected values are regular geom cdf divided by cdf at b + vals = geom_uptrunc.cdf([0,1,2], 0.5, 2) + assert_array_almost_equal(vals, np.array([0.5,0.75,0.875]) / 0.875) + + def test_cdf_x_len_1(self): + # cdf should be not throw error even if x is len 1 + vals = geom_uptrunc.cdf(0, 0.5, 2) + assert_almost_equal(vals, 0.5 / 0.875) + + def test_mean(self): + mu1 = geom_uptrunc.mean(0.801, 32) + assert_almost_equal(mu1, 4, decimal=2) + + def test_translate_args_harte_16(self): + # TODO: The Harte figures appear to be inaccurate, generate better + # canonical test case for next two tests and for test_fit_mle and + # test_mean + + # From Harte 2011, Oxford U Press, Tab 7.4, n0=16 row, Eq 7.50 + b = 16 + mu = np.array([2, 1]) # A0/8, A0/16 + expected = np.array([1-0.669, 1-0.500]) + ps, _ = geom_uptrunc.translate_args(mu, b) + assert_almost_equal(ps, expected, decimal=3) + + def test_translate_args_harte_32(self): + # From Harte 2011, Oxford U Press, Tab 7.4, n0=32 row, Eq 7.50 + b = 32 + mu = np.array([4, 2]) # A0/8, A0/16 + expected = np.array([1-0.801, 1-0.667]) + ps, _ = geom_uptrunc.translate_args(mu, b) + assert_almost_equal(ps, expected, decimal=3) + + def test_translate_args_mqwilber_hand_calc(self): + # TODO: Confirm last 4 of tests, which more accurate + b = np.array([60, 340, 34]) + mu = np.array([60*.1, 340*.6, 34*.9]) + expected = np.array([1-.8572, 1-1.0036, 1-1.2937]) + ps, _ = geom_uptrunc.translate_args(mu, b) + assert_almost_equal(ps, expected, decimal=3) + + def test_translate_args_with_sum_of_pmf(self): + p1, b1 = geom_uptrunc.translate_args(341/4, 341) # Issue 33 + assert_array_almost_equal(1,np.sum(geom_uptrunc.pmf(range(342),p1,b1))) + + p2, b2 = geom_uptrunc.translate_args(120, 200) # Arbitrary + assert_array_almost_equal(1,np.sum(geom_uptrunc.pmf(range(201),p2,b2))) + + def test_fit_mle(self): + p1, _ = geom_uptrunc.fit_mle([0,10], 10) + assert_almost_equal(p1, 0) + + p2, _ = geom_uptrunc.fit_mle([1,3], 16) + assert_almost_equal(p2, 1-0.669, decimal=2) + + +class TestNbinom(TestCase): + + def test_pmf(self): + #> dnbinom(c(0,1,2), 3, mu=5) + #[1] 0.05273438 0.09887695 0.12359619 + vals = nbinom.pmf([0,1,2], 5, 3) + assert_array_almost_equal(vals, [0.05273438, 0.09887695, 0.12359619]) + + def test_cdf(self): + #> pnbinom(c(0,1,2),2,mu=30) + #[1] 0.00390625 0.01123047 0.02153015 + vals = nbinom.cdf([0,1,2], 30, 2) + assert_array_almost_equal(vals, [0.00390625, 0.01123047, 0.02153015]) + + def test_mean_var(self): + mu1, var1 = nbinom.stats(20, 2, moments='mv') + assert_array_almost_equal([mu1, var1], [20, 20+(20**2)/2]) + + def test_get_p_from_mu(self): + assert_almost_equal(nbinom._get_p_from_mu(10, 2), 2/12) + + def test_fit_mle_with_rvs(self): + np.random.seed(8) + x = nbinom.rvs(20, 10, size=100) + mu, k = nbinom.fit_mle(x) + assert_array_almost_equal([mu, k], [20, 10], decimal=0) + + def test_fit_mle_with_R(self): + #> library(MASS) + #> fitdistr(seq(49), "negative binomial") + x = np.array(range(1,50)) + mu, k = nbinom.fit_mle(x) + assert_array_almost_equal([mu, k], [25, 2.4337345], decimal=1) + + def test_fit_mle_with_manual_calc(self): + x = np.array([6,17,14,12,8,10,4,9,3,12,4,2,12,8,14,16,9,10,8,5,6]) + mu, k = nbinom.fit_mle(x, 
k_array=np.arange(0.01,10,0.01)) + assert_array_almost_equal([mu, k], [9, 8.54], decimal=2) + + def test_alternative_rvs(self): + rand_alt = nbinom.rvs_alt(5, 1, l=0, size=10000) + rand = nbinom.rvs(5, 1, size=10000) + + alt_k = nbinom.fit_mle(rand_alt, k_array=np.arange(0.5, 1.5, 0.01)) + k = nbinom.fit_mle(rand, k_array=np.arange(0.5, 1.5, 0.01)) + + assert_almost_equal(alt_k, k, decimal=1) + + +class TestNbinom_ztrunc(TestCase): + + def test_pmf(self): + # Test pmf gives back expected mean + tpmf = nbinom_ztrunc.pmf(np.arange(1, 500), 4, 1) + tmean = np.sum(np.arange(1, 500) * tpmf) + assert_almost_equal(tmean, 4) + + # Test pmf of 0 is 0 + tpmf = nbinom_ztrunc.pmf(0, 1, 1) + assert_equal(tpmf, 0) + + def test_cdf(self): + + # Test cdf and pmf agree! + tpmf = np.sum(nbinom_ztrunc.pmf(np.arange(1, 20), 20, 10)) + tcdf = nbinom_ztrunc.cdf(19, 20, 10) + assert_equal(tpmf, tcdf) + + def test_get_p_from_mu(self): + + # Test the fit p values are equal to those given in He and Legendre + # 2002 + test_values = [205.9878, 410.9853, 794.7613, 1210.0497, + 1945.9970, 3193.8362] + test_ks = [2, 1, 0.5, 0.3, 0.1363, 0.01] + + ps = np.array([nbinom_ztrunc.translate_args(335356 / 814., tk, + return_p=True)[0] for tk in test_ks]) + + assert_array_almost_equal(ps, test_values, decimal=0) + + def test_fit_mle(self): + + # Test fit returns something close the input + rvs_data = nbinom_ztrunc(10, 1).rvs(size=1000) + ml_mean, ml_k = nbinom_ztrunc.fit_mle(rvs_data) + assert_almost_equal(ml_mean, np.mean(rvs_data)) + assert_almost_equal(ml_k, 1, decimal=0) + + rvs_data = nbinom_ztrunc(20, 10).rvs(size=1000) + ml_mean, ml_k = nbinom_ztrunc.fit_mle(rvs_data) + assert_almost_equal(ml_mean, np.mean(rvs_data)) + assert_almost_equal(ml_k, 10, decimal=0) + + +class TestCnbinom(TestCase): + + def test_pmf(self): + # Test pmf sums to one + pmf = cnbinom.pmf(np.arange(0, 101), 20, 1, 100) + assert_almost_equal(np.sum(pmf), 1) + + def test_cdf(self): + # Test cdf is one at appropriate value + cdf = cnbinom.cdf(100, 20, 1, 100) + assert_almost_equal(cdf, 1) + + def test_fit_of_vector(self): + # Test fit of vector from Issue #3 (github.com/jkitzes/macroeco) + data = np.array([3,2,1,0,0,0,0,0,0,0,0,0,0,0,0]) + k_fit = cnbinom.fit_mle(data)[0] + assert_equal(False, k_fit == -0.26) + + def test_zillio_plots(self): + """ Test the cnbinom function replicated the Zillio and He plots + + References + ---------- + Zillio, T and He, F. 2010. Modeling spatial aggregation of finite + populations. 
Ecology, 91, 3698-3706 + + """ + + # Define Preliminary a and k to test + a = np.array([0.1, .3, .8]) + k = np.array([.1, 1, 10]) + fnbd_vec = [] + nbd_vec = [] + binm_vec = [] + descrip = [] + + # Get data + for ta in a: + for tk in k: + + fnbd_vec.append(cnbinom.pmf(np.arange(1, 101), + ta * 100, tk, 100)) + nbd_vec.append(nbinom.pmf(np.arange(1, 101), ta * 100, tk)) + binm_vec.append(stats.binom.pmf(np.arange(1, 101), 100, ta)) + + descrip.append("a=%s, k=%s" % (ta, tk)) + + # Loop through the data and plot it + fig, axes = plt.subplots(3, 3, sharex=True) + axes = axes.flatten() + + for i, ax in enumerate(axes): + ax.plot(np.arange(1, 101), fnbd_vec[i]) + ax.plot(np.arange(1, 101), nbd_vec[i], '--') + ax.plot(np.arange(1, 101), binm_vec[i], '.-') + ax.legend(('fnbd', 'nbd', 'binm'), loc='best') + ax.set_xlabel('abundance') + ax.set_ylabel('P(x)') + ax.text(0.6, 0.3, descrip[i], transform=ax.transAxes) + + # plt.tight_layout() + # Uncomment to see save figure + # fig.savefig("test_cbinom") + + +class TestDgamma(TestCase): + + def test_pmf(self): + # import macroeco_distribution as mac + # mac.dis_gamma_ll([1,1,2,5,6,7], 5, .3) + test_val = -32.3085384957 + pred_val = np.sum(dgamma.logpmf([1, 1, 2, 5, 6, 7], 5, .3)) + assert_almost_equal(test_val, pred_val) + + # ab = [1, 1, 1, 1, 2, 4, 4, 4, 4, 4, 45, 267] + # mac.dis_gamma_ll(ab, 0.1, 200) + test_val = -39.889246913391531 + ab = [1, 1, 1, 1, 2, 4, 4, 4, 4, 4, 45, 267] + pred_val = np.sum(dgamma.logpmf(ab, 0.1, 200)) + assert_almost_equal(test_val, pred_val) + + def test_cdf(self): + # Test that cdf gets close to one + assert_almost_equal(dgamma.cdf(1000, 4, .9), 1) + + def test_fit_mle(self): + # mac.dis_gamma_solver([1,1,2,5,6,7]) + fit_alpha = 1.1324749 + fit_theta = 2.86753 + alpha, theta = dgamma.fit_mle([1, 1, 2, 5, 6, 7]) + assert_almost_equal(fit_alpha, alpha, decimal=3) + assert_almost_equal(fit_theta, theta, decimal=3) + + def test_rank(self): + # When alpha is almost zero should be similar to logseries with p = + # e^(-1 / theta) + logseries_rank = logser_uptrunc.rank(10, np.exp(-1 / 3), 1000) + dgamma_rank = dgamma.rank(10, 0.0001, 3) + + assert_array_equal(logseries_rank, dgamma_rank) + +class TestLogser(TestCase): + + def test_pmf(self): + + # Testing against values in Williams 1944, + # Some applications of the logarithmic series and the index of + # diversity to ecological problems, pg. 18. 
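+        # Expected counts below are S * pmf(n); the logseries pmf used here
+        # is p**n / (-n * log(1 - p)), e.g. 826 * pmf(1) ~= 289.3 at p = 0.92964.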
+ + # Acridiidae: S = 826, p = 0.92964 (There seems to be an error in + # their data at 3 -> should be 83.3 not 88.3) + test_vals = np.array([289.3, 134.5, 83.3, 58.1, 43.2, 33.5, 26.7, 21.7, + 17.9, 15., 12.7, 10.8, 9.3, 8., 6.9, 6.1, 5.3, 4.6, 4.1, 3.6]) + + pred_pmf = logser.pmf(np.arange(1, 21), 0.92964) + pred_vals = np.round(pred_pmf * 826, decimals=1) + assert_array_equal(test_vals, pred_vals) + + # Mantidae: S = 209, p = 0.89781 + test_vals = np.array([82.3, 36.9, 22.1, 14.9, 10.7, 8., 6.2, 4.8, 3.9, + 3.1, 2.5, 2.1, 1.7, 1.4, 1.2, 1., 0.9, 0.7, 0.6, 0.5]) + + pred_pmf = logser.pmf(np.arange(1, 21), 0.89781) + pred_vals = np.round(pred_pmf * 209, decimals=1) + assert_array_equal(test_vals, pred_vals) + + # Blattidae: S = 197, p = 0.96476 + test_vals = np.array([56.8, 27.4, 17.6, 12.8, 9.8, 7.9, 6.5, 5.5, 4.7, + 4.1, 3.6, 3.2, 2.8, 2.5, 2.3, 2.1, 1.9, 1.7, + 1.6, 1.4, 1.3, 1.2, 1.1, 1., 1., 0.9, 0.8, + 0.8, 0.7, 0.7]) + + pred_pmf = logser.pmf(np.arange(1, 31), 0.96476) + pred_vals = np.round(pred_pmf * 197, decimals=1) + assert_array_equal(test_vals, pred_vals) + + def test_translate_args(self): + + # Using values from Williams 1994 + test_vals = [0.92964, 0.89781, 0.96476, 0.97003] + data = [4112 / 826., 805. / 209, 1612. / 197, 480. / 52] + + pred_vals = [logser.translate_args(td) for td in data] + + assert_array_almost_equal(test_vals, pred_vals, decimal=5) + + def test_fit_mle(self): + + test_val = .97003 # Value from Williams 1944 + x = np.arange(1, 53.) + norm_x = x / sum(x) + data = norm_x * (480) + pred_val = logser.fit_mle(data) + assert_almost_equal(test_val, pred_val, decimal=5) + + +class TestLogserUptrunc(TestCase): + + def test_pmf(self): + # import macroeco_distributions as mac + # mac.trunc_logser(.8, 100).pmf(4) + test_val = logser_uptrunc(.8, 100).pmf(4) + assert_almost_equal(test_val, 0.063624697299) + + # import macroeco_distributions as mac + # mac.trunc_logser(.45, 3).pmf(3) + test_val = logser_uptrunc(.45, 3).pmf(3) + assert_almost_equal(test_val, 0.052224371373307543) + + def test_cdf(self): + # import macroeco_distributions as mac + # mac.trunc_logser(.8, 100).cdf(4) + test_val = logser_uptrunc(.8, 100).cdf(4) + assert_almost_equal(test_val, 0.86556098617469057) + + # import macroeco_distributions as mac + # mac.trunc_logser(.45, 3).cdf(2) + test_val = logser_uptrunc(.45, 3).cdf(2) + assert_array_almost_equal(test_val, 0.9477756286266924) + + def test_mean(self): + # Expected mean is N / S + + N = 500 + S = 30. 
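+        # translate_args maps the target mean N / S and upper bound N to the
+        # shape parameter p; stats(p, N)[0] should then recover a mean of N / S.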
+ p = logser_uptrunc.translate_args(N / S, N)[0] + mean = logser_uptrunc.stats(p, N)[0] + assert_almost_equal(mean, N / S, decimal=5) + + def test_fit_mle(self): + # Should return same result as translate args + data = np.arange(1, 40) + N = np.sum(data) + S = len(data) + + fits = logser_uptrunc.fit_mle(data) + assert_array_almost_equal(fits, + logser_uptrunc.translate_args(N / S, N), + decimal=5) + + def test_translate_args(self): + # Test that values equal values from John's book (Harte 2011) + + lg = logser_uptrunc.translate_args(4 * 4 / 4, 4 * 4)[0] + assert_almost_equal(-np.log(lg), 0.0459, decimal=4) + + lg = logser_uptrunc.translate_args(2 ** 4 * 4 / 4, 2 ** 4 * 4)[0] + assert_almost_equal(-np.log(lg), -0.00884, decimal=5) + + lg = logser_uptrunc.translate_args(2 ** 8 * 4 / 4, 2 ** 8 * 4)[0] + assert_almost_equal(-np.log(lg), -0.00161, decimal=5) + + lg = logser_uptrunc.translate_args(2 ** 8 * 16 / 16, 2 ** 8 * 16)[0] + assert_almost_equal(-np.log(lg), 0.000413, decimal=6) + + lg = logser_uptrunc.translate_args(2 ** 12 * 64 / 64, 2 ** 12 * 64)[0] + assert_almost_equal(-np.log(lg), 0.0000228, decimal=7) + + lg = logser_uptrunc.translate_args(20 / 20, 20)[0] + assert_equal(0, 0) + + def test_n_close_to_s(self): + # Test the solver doesn't fail when N is very close to S + + _trunc_logser_solver(2, 3) + _trunc_logser_solver(3, 4) + _trunc_logser_solver(100, 101) + + def test_rank(self): + # Test rank against values generated by hand + exp_vals = np.array([1., 1., 2., 3., 4., 7., 11., 18., 31., 62.]) + + # Test values generated + test_vals = logser_uptrunc.rank(10, .99, 100) + + assert_array_equal(exp_vals, test_vals) + + def test_rvs(self): + + # Make sure random number generator is returning what is expected + res1 = logser_uptrunc.rvs(.9, 100) + assert_equal(1, len(np.atleast_1d(res1))) + + res2 = lognorm.rvs(.9, 100, size=5) # Should be length 5 + assert_equal(5, len(res2)) + + +class TestLognorm(TestCase): + + def test_pmf(self): + # R pmf: dlnorm(c(1:10), 2, 2) + r_output = np.array([0.1210, .0806, .0601, 0.0476, 0.0391, .0331, + 0.0285, 0.0249, 0.0221, 0.0197]) + + test1 = lognorm.pdf(np.arange(1, 11), 2, 2) + assert_array_almost_equal(test1, r_output, decimal=4) + + # R pmf: dlnorm(5, -3, 5) + r_ans = 0.0104333 + test2 = lognorm.pdf(5, -3, 5) + assert_almost_equal(test2, r_ans) + + def test_cdf(self): + # R cdf: plnorm(c(1,1,4,5,12), 1.2, 3.45) + r_output = np.array([0.3639854, 0.3639854, 0.5215318, 0.5472346, + 0.6452161]) + + test = lognorm.cdf([1, 1, 4, 5, 12], 1.2, 3.45) + assert_array_almost_equal(test, r_output, decimal=7) + + def test_translate_args(self): + + mean = 67; sigma = 2 + mu, sigma = lognorm.translate_args(mean, sigma) + + # Expected mu: np.log(mean) - (sigma**2 / 2) + exp_mu = 2.2046926 + assert_almost_equal(mu, exp_mu) + + def test_fit_mle(self): + ''' + # R code + pmf <- function(x, N, S, sigma){ + mu = log(N / S) - (sigma^2 / 2) + dlnorm(x, meanlog=mu, sdlog=sigma) + } + + mle <- function(sdlog, x, N, S){ + -sum(log(pmf(x, N, S, sdlog))) + } + + params <- function(x){ + N = sum(x); + S = length(x); + optimize(mle, interval=c(0,5), x, N, S) + } + + data = # some data + params(data)''' + + data1 = [1, 1, 1, 1, 1, 2, 2, 3, 3, 4, 5, 6, 123, 456] + data2 = [2, 2, 2, 4, 67, 34, 152, 9] + + r_fits = [2.07598, 1.59213] # data1, data2 + + testfit1 = lognorm.fit_mle(data1, fix_mean=True)[1] + testfit2 = lognorm.fit_mle(data2, fix_mean=True)[1] + + assert_almost_equal(r_fits[0], testfit1, decimal=5) + assert_almost_equal(r_fits[1], testfit2, decimal=5) + + # Scipy 
code: stats.lognorm.fit(data1, floc=0) + scipy_ans = 1.79518287 + test1 = lognorm.fit_mle(data1)[1] + assert_almost_equal(scipy_ans, test1) + + def test_rvs(self): + + # Test that multiple random numbers can be returned without error + res1 = lognorm.rvs(5, 5) # Should be length 1 + assert_equal(1, len(np.atleast_1d(res1))) + + res2 = lognorm.rvs(5, 5, size=5) # Should be length 5 + assert_equal(5, len(res2)) + + +class TestPlnorm(TestCase): + + def test_pmf(self): + + # Test against R VGAM fxn: dpolono(c(1:10), -1, 3) + r_res = [0.121392844, 0.057692006, 0.035586652, 0.024863530, + 0.018681089, 0.014721035, 0.011998072, 0.010027588, 0.008545518, + 0.007396607] + + test = plnorm.pmf(np.arange(1, 11), -1, 3) + assert_array_almost_equal(r_res, test) + + # Test against macroeco_distributions.pln: + # pln.pmf([0, 50, 1000], 2.34, 5, 0) + + md_res = np.array([2.86468926e-01, 1.51922299e-03, 5.25717609e-05]) + test = plnorm.pmf([0, 50, 1000], 2.34, 5) + assert_array_almost_equal(md_res, test) + + # Unit test from test_macroeco_distributions + + # Test values for Poisson lognomal are chosen from Table 1 and Table 2 + # in Grundy Biometrika 38:427-434. + # In Table 1 the values are deducted from 1 which give p(0). + pln_table1 = [[-2.0, 2, '0.9749'], + [-2.0, 8, '0.9022'], + [-2.0, 16, '0.8317'], + [0.5, 2, '0.1792'], + [0.5, 8, '0.2908'], + [0.5, 16, '0.3416'], + [3, 2, '0.0000'], + [3, 8, '0.0069'], + [3, 16, '0.0365']] + + pln_table2 = [[-2.0, 2, '0.0234'], + [-2.0, 8, '0.0538'], + [-2.0, 16, '0.0593'], + [0.5, 2, '0.1512'], + [0.5, 8, '0.1123'], + [0.5, 16, '0.0879'], + [3, 2, '0.0000'], + [3, 8, '0.0065'], + [3, 16, '0.0193']] + + for vals in pln_table1: + test = plnorm.pmf(0, np.log(10 ** vals[0]), vals[1] ** .5) + assert_almost_equal(test, float(vals[2]), decimal=4) + + for vals in pln_table2: + test = plnorm.pmf(1, np.log(10 ** vals[0]), vals[1] ** .5) + assert_almost_equal(test, float(vals[2]), decimal=4) + + + def test_cdf(self): + + # Test against R VGAM fxn: ppolono(c(0, 15, 10000), .1, 2) + r_res = [0.3954088, 0.9048902, 0.9999973] + test = plnorm.cdf([0, 15, 10000], .1, 2) + assert_array_almost_equal(r_res, test, decimal=5) + + # Test against macroeco_distributions: + # pln.cdf([1,2,3], 20, 4, 0) + + md_res = np.array([7.34761277e-07, 1.18860746e-06, 1.67083480e-06]) + test = plnorm.cdf([1, 2, 3], 20, 4) + assert_array_almost_equal(md_res, test, decimal=5) + + def test_fit_mle(self): + + # Test against R poilog: poilogMLE(data, zTrune=FALSE) + data = np.array([1,1,1,1,1,2,2,2,3,3,4,4,5,5,6,6,12,45,67]) + Rfits = (1.31928, 1.18775) + fits = plnorm.fit_mle(data) + assert_array_almost_equal(Rfits, fits, decimal=3) + + # Test against macroeco_distributions + # pln_solver(data, lower_trunc=False) + md_res = (1.3195580310886075, 1.1876019842774048) + assert_array_almost_equal(md_res, fits, decimal=4) + + def test_rank(self): + # This should be a slow test! + + # Test against ppf. 
+ # >>> n = 50 + # >>> vals = (np.arange(1, n+1) - 0.5) / n + # >>> plnorm.ppf(vals, 1, 1) + test_case = np.array([ 0., 0., 0., 0., 0., 0., 0., 0., + 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., + 2., 2., 2., 2., 3., 3., 3., 3., 3., 3., 4., 4., + 4., 4., 5., 5., 5., 6., 6., 6., 7., 7., 8., 9., + 10., 11., 13., 15., 19., 29.]) + + pred_res = plnorm.rank(50, 1, 1, crit=0.5, upper=40) + + # Test the values are within one + diff = np.abs(pred_res - test_case) + zeros = np.sum(diff == 0) + ones = np.sum(diff == 1) + assert_equal(zeros + ones, len(diff)) + + +class TestPlnormZtrunc(TestCase): + + def test_pmf(self): + + # Test against macroeco_distributions: + # pln.pmf([0, 50, 1000], 2.34, 5, 1) + md_res = np.array([0, 2.12916164e-03, 7.36783061e-05]) + test = plnorm_ztrunc.pmf([0, 50, 1000], 2.34, 5) + + assert_array_almost_equal(md_res, test) + + def test_cdf(self): + + # Test against dpolonorm + # ppolono(c(1,2,3), 4.3, 100) / (1 - ppolono(0, 4.3, 100)) + r_res = [0.007670365, 0.011507417, 0.014065948] + + test = plnorm_ztrunc.cdf(np.arange(1, 4), 4.3, 100) + assert_array_almost_equal(r_res, test) + + def test_fit_mle(self): + + data = np.array([1,1,1,4,4,4,4,5,5,5,12,44,55,112]) + + # macroeco_distributions fit: pln_solver(data) + md_fits = (1.068510556981163, 1.8800439687956865) + test = plnorm_ztrunc.fit_mle(data) + assert_array_almost_equal(test, md_fits, decimal=4) + + # R poilog: poilogMLE(data) + r_fits = (1.067620, 1.880646) + assert_array_almost_equal(test, r_fits, decimal=3) + + def test_rank(self): + + # TODO: Can't test this against ppf because ppf is too slow + pass + + +class TestExpon(TestCase): + + def test_pdf(self): + vals = expon.pdf([0,1,2], 2.5) + assert_almost_equal(vals, [2.5, 0.205212497, 0.016844867]) + + def test_mean(self): + mu1 = expon.mean(0.5) + assert_almost_equal(mu1, 2) + + mu2 = expon.mean(0.25) + assert_almost_equal(mu2, 4) + + def test_cdf(self): + vals = expon.cdf([0,1,2], 0.5) + assert_array_almost_equal(vals, [0, 0.39346934, 0.632120559]) + + def test_translate_args(self): + assert_almost_equal(1/13, expon.translate_args(13)) + + def test_fit_mle(self): + assert_almost_equal(1/8, expon.fit_mle([6,7,9,10])) + + +class TestExponUptrunc(TestCase): + + def test_pdf(self): + vals = expon_uptrunc.pdf([0,1,2], 0.2, 10) + assert_almost_equal(vals, [0.231303529, 0.189375312, 0.155047392]) + + def test_pdf_lambda_equal_zero_is_uniform(self): + vals = expon_uptrunc.pdf([0,1,2], 0.0000001, 10) + assert_almost_equal(vals, [0.1, 0.1, 0.1]) + + def test_pdf_integrates_to_one(self): + val1 = sp.integrate.quad(expon_uptrunc.pdf, 0, 10, (0.2, 10)) + assert_almost_equal(val1[0], 1) + + val2 = sp.integrate.quad(expon_uptrunc.pdf, 0, 100, (.000000001, 100)) + assert_almost_equal(val2[0], 1) + + val3 = sp.integrate.quad(expon_uptrunc.pdf, 0, 100, (-5, 100)) + assert_almost_equal(val3[0], 1) + + def test_mean_lambda_equal_zero(self): + # If lam zero (uniform distribution), mean should be 1/2 b + assert_almost_equal(expon_uptrunc.mean(0.0000001, 10), 5, 5) + + def test_mean(self): + def integrand(x, lam, b): + return x * expon_uptrunc.pdf(x, lam, b) + + for lam in [2, 4.5]: + val = sp.integrate.quad(integrand, 0, 5, args=(lam, 10))[0] + assert_almost_equal(expon_uptrunc.mean(lam, 5), val, 4) + + def test_cdf(self): + vals = expon_uptrunc.cdf([0,1,2], 0.2, 10) + assert_array_almost_equal(vals, [0, 0.209641082, 0.381280683]) + + def test_translate_args_uniform_case(self): + lam = expon_uptrunc.translate_args(5, 10) + assert_almost_equal(lam[0], 0) + + def 
test_translate_args(self): + # mean -> lambda -> mean comparison + lam = expon_uptrunc.translate_args(3, 10) + assert_almost_equal(expon_uptrunc.mean(lam, 10), 3) + + def test_fit_mle_uniform_case(self): + data = [5,5,5] + mean = np.mean(data) + lam = expon_uptrunc.fit_mle(data, 10)[0] + assert_almost_equal(expon_uptrunc.mean(lam, 10), 5, 4) + + def test_fit_mle(self): + data = [4,5,7,8] + mean = np.mean(data) + lam = expon_uptrunc.fit_mle(data, 10)[0] + assert_almost_equal(expon_uptrunc.mean(lam, 10), 6) + diff --git a/test_distributions.py b/macroeco/models/xest_distributions_old.py similarity index 92% rename from test_distributions.py rename to macroeco/models/xest_distributions_old.py index 99ed837..b677262 100644 --- a/test_distributions.py +++ b/macroeco/models/xest_distributions_old.py @@ -24,7 +24,7 @@ import scipy.stats as stats import matplotlib.pyplot as plt -# TODO: Need to add fit functions to tests with new fit functions. +# TODO: Need to add fit functions to tests with new fit functions. # TODO: Do we need to test rad's? Against what? @@ -43,10 +43,10 @@ def setUp(self): 15.87, 24.32, 101.25, 155]) self.sad = np.arange(1, 156) - + def test_logser(self): # Test error raising - self.assertRaises(AssertionError, logser(n_samp=234, tot_obs=67).pmf, + self.assertRaises(AssertionError, logser(n_samp=234, tot_obs=67).pmf, 1) self.assertRaises(AssertionError, logser(n_samp=34, tot_obs=0).pmf, 1) @@ -59,7 +59,7 @@ def test_logser(self): self.assertTrue(np.round(lgser.var['p'][0], decimals=4) == 0.9974) # Test cdf reaches 1 - cdf = np.round(logser(n_samp=45, tot_obs=1200).cdf(1200)[0][0], + cdf = np.round(logser(n_samp=45, tot_obs=1200).cdf(1200)[0][0], decimals=1) self.assertTrue(cdf == 1) @@ -92,13 +92,13 @@ def test_logser_ut(self): pmf = lg.pmf(1) self.assertTrue(np.round(-np.log(lg.var['x'][0]), decimals=6) == 0.000413) lg = logser_ut(n_samp=64, tot_obs=2**12 * 64) - pmf = lg.pmf(1) + pmf = lg.pmf(1) self.assertTrue(np.round(-np.log(lg.var['x'][0]), decimals=7) == 0.0000228) - + # Check that they don't fail logser_ut(n_samp=64, tot_obs=1000).rad() logser_ut(n_samp=64, tot_obs=1000).cdf((1,1,2,4,5,7,12)) - + # Test correct answer when n_samp == tot_obs lg = logser_ut(n_samp=31, tot_obs=31) pmf = lg.pmf([1,2,3,4,5]) @@ -108,9 +108,9 @@ def test_logser_ut(self): def test_logser_ut_appx(self): # Test error raising - self.assertRaises(AssertionError, logser_ut_appx(n_samp=234, + self.assertRaises(AssertionError, logser_ut_appx(n_samp=234, tot_obs=67).pmf, 1) - self.assertRaises(AssertionError, logser_ut_appx(n_samp=34, + self.assertRaises(AssertionError, logser_ut_appx(n_samp=34, tot_obs=0).pmf, 1) # Test that values equal values from John's book (Harte 2011) @@ -140,8 +140,8 @@ def test_logser_ut_appx(self): # Test that they don't fail logser_ut_appx(n_samp=64, tot_obs=1000).rad() logser_ut_appx(n_samp=64, tot_obs=1000).cdf((1,1,2,4,5,7,12)) - - + + def test_plognorm(self): # TODO: Should test against Ethans psolver @@ -160,7 +160,7 @@ def test_plognorm(self): # Test pmf is zero when mu or sigma negative self.assertTrue(sum(np.round(plognorm(mu=-3,sigma=3).\ - pmf([1,2,3,4,5])[0], decimals=3)) == 0) + pmf([1,2,3,4,5])[0], decimals=3)) == 0) self.assertTrue(sum(np.round(plognorm(mu=3,sigma=-3).\ pmf([1,2,3,4,5])[0], decimals=3)) == 0) @@ -178,13 +178,13 @@ def test_plognorm(self): plognorm().fit([self.abund_list[0]]) plognorm(mu=2, sigma=2).cdf(5) - + def test_plognorm_lt(self): #Test our pmf against R's poilog R_zero_trun = [0.11620, 0.07216, 0.05201, 0.04049, 0.02783, 0.02398, 
0.00686] - pred_plog = plognorm_lt(mu=2, sigma=3).pmf([1,2,3,4,6,7,23])[0] + pred_plog = plognorm_lt(mu=2, sigma=3).pmf([1,2,3,4,6,7,23])[0] self.assertTrue(np.array_equal(R_zero_trun, np.round(pred_plog, decimals=5))) @@ -211,8 +211,8 @@ def test_plognorm_lt(self): plognorm_lt(mu=2, sigma=2).pmf([2,3,4,5,23]) plognorm_lt().fit([self.abund_list[0]]) plognorm_lt(mu=10, sigma=1).cdf(45) - - + + def test_lognorm(self): # Test pmf against R output @@ -227,15 +227,15 @@ def test_lognorm(self): diff = r_output - lnorm self.assertTrue(np.all(diff == 0)) - lnorm = np.round(lognorm(tot_obs = np.exp(1.5 + (1.2**2 / 2)) * 50, + lnorm = np.round(lognorm(tot_obs = np.exp(1.5 + (1.2**2 / 2)) * 50, n_samp=50,sigma=1.2).pmf([1,2,3,4,5,6,7,12,45])[0], decimals=4) diff = r_output2 - lnorm self.assertTrue(np.all(diff == 0)) # Test cdf against R cdf - rcdf = np.array([0.3319, 0.3319, 0.4869, 0.5127, 0.6124]) - pycdf = np.round(lognorm(tot_obs=np.exp(1.5 + (3.45**2 / 2)), n_samp=1, + rcdf = np.array([0.3319, 0.3319, 0.4869, 0.5127, 0.6124]) + pycdf = np.round(lognorm(tot_obs=np.exp(1.5 + (3.45**2 / 2)), n_samp=1, sigma=3.45).cdf([1,1,4,5,12])[0], decimals=4) diff = rcdf - pycdf self.assertTrue(np.all(diff == 0)) @@ -263,7 +263,7 @@ def test_lognorm(self): pyfit2 = lognorm().fit([fit_array2]).params['sigma'][0] diff = r_lognorm_fits - np.round([pyfit1, pyfit2], decimals=5) self.assertTrue(np.all(diff == 0)) - + # Test that these don't fail lognorm().fit([self.abund_list[0]]) tot_obs=sum(self.abund_list[0]) @@ -274,9 +274,9 @@ def test_lognorm(self): dist = lognorm().fit(self.abund_list) dist.pmf(3) dist.pmf([[3],[4],[5],[6]]) - self.assertTrue(len(dist.params['tot_obs']) == 4) + self.assertTrue(len(dist.params['tot_obs']) == 4) + - def test_geo_ser(self): # TODO: Test pmf. # Visually, the CDF should be a straight line on a log(abundance) vs. @@ -305,7 +305,7 @@ def test_geo_ser(self): dist = geo_ser().fit(self.abund_list) self.assertTrue(len(dist.params['k']) == 4) - + def test_broken_stick(self): # Test that n_except throws approriate error if length n_samp and tot_obs are not # the same as length pmf @@ -326,7 +326,7 @@ def test_broken_stick(self): diff = np.array(expt) - bs self.assertTrue(np.all(diff == 0)) - # Test that these don't fail + # Test that these don't fail broken_stick(n_samp=23, tot_obs=500).cdf([1,2,500]) broken_stick(n_samp=23, tot_obs=500).rad() @@ -338,14 +338,14 @@ def test_broken_stick(self): ab in self.abund_list]))) def test_dgamma(self): - + # Don't have any good published graphs to test it against. Test # everything is working obs_sad = [103,115,13,2,67,36,51,8,6,61,10,21,7,65,4,49,92,37,16,6,23,\ 9,2,6,5,4,1,3,1,9,2] dg = dgamma().fit([obs_sad]) - + # Check that the parameters are in vars self.assertTrue('alpha' in dg.var) self.assertTrue('theta' in dg.var) @@ -372,10 +372,10 @@ def test_sugihara(self): self.assertRaises(NotImplementedError, sugihara().cdf, 34) self.assertRaises(NotImplementedError, sugihara().pdf, 23) - + def test_binm(self): # Using scipy.binom which is already unit tested. 
- + # Check that pdf and cdf give correct answers dist = binm(tot_obs=8123, n_samp=10) self.assertTrue(dist.cdf(8123)[0][0] == 1) @@ -388,7 +388,7 @@ # Check that fit works dist = binm().fit(self.abund_list) - + def test_pois(self): # Using scipy.poisson which is already unit tested @@ -421,20 +421,14 @@ def test_nbd(self): geo_data = np.random.geometric(p, size=10000) dist = nbd().fit([geo_data]) self.assertTrue(np.round(dist.params['k'][0], decimals=1) == 1) - + def test_nbd_lt(self): # TODO: test pmf # Test that cdf is about one dist = nbd_lt(tot_obs=2300, n_samp=45, k=3) - self.assertTrue(np.round(dist.cdf(2300)[0][0], decimals=1) == 1.0) - - # Check that k of length one is extended to length 2 based on p - # parameter - dist = nbd_lt(tot_obs=[400, 600], n_samp=[30, 23], k=[3]) - pmf = dist.pmf(1) - self.assertTrue(np.array_equal(np.round(dist.var['p'], decimals=4), - np.array([.1837,.1031]))) + d = dist.cdf(2300)[0][0] + self.assertTrue(np.round(d, decimals=1) == 1.0) # Multiple entries both yield cdf with 1 dist = nbd_lt(tot_obs=[400, 600], n_samp=[30, 23], k=[3,2]) @@ -443,45 +437,34 @@ b = np.round(cdf[0][0], decimals=1) self.assertTrue(a == b) - # Test pmf against scipy - mu = 500 * (1. / 20); k = 2; p = 1. / (mu / k + 1) - scipy_0 = stats.nbinom.pmf(0, k, p) - vals = np.array([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17]) - test_vals = stats.nbinom.pmf(vals, k, p) / (1 - scipy_0) - pred_vals = nbd_lt(tot_obs=500, n_samp=20, k=2).pmf(vals)[0] - self.assertTrue(np.array_equal(test_vals, pred_vals)) - - # Test pmf against Published Truncated NBD. Sampford 1955, The - # Truncated Negative Binomial Distribution. - def test_pmf(n, p, k): - om = (1 / (1 + (mu/k))); eta = 1 - om - - norm = np.math.gamma(k + n) / (np.math.gamma(k) * - np.math.gamma(n + 1)) - - kernel = (om**k / (1 - om**k)) * (eta**n) - return norm * kernel - - test_vals = np.array([test_pmf(x, p, k) for x in vals]) - test_vals = np.round(test_vals, decimals=7) - pred_vals = np.round(pred_vals, decimals=7) - self.assertTrue(np.array_equal(test_vals, pred_vals)) - - # Test cdf against Published TNBD: - pred_cdf = nbd_lt(tot_obs=500, n_samp=20, k=2).cdf(vals)[0] - pred_cdf = np.round(pred_cdf, decimals=7) - test_vals = np.array([test_pmf(x, p, k) for x in vals]) - test_cdf = np.round(np.cumsum(test_vals), decimals=7) - self.assertTrue(np.array_equal(pred_cdf, test_cdf)) - - - + # Test the fit p values are equal to those given in He and Legendre + # 2002 + # I am rounding to the nearest whole number, though I have confirmed + # that the decimals are very close as well + he_values = np.round([205.9878, 410.9853, 794.7613, 1210.0497, 1945.9970, + 3193.8362], decimals=0) + he_ks = [2, 1, 0.5, 0.3, 0.1363, 0.01] + tnbd = nbd_lt(tot_obs=335356, n_samp=814, k=he_ks) + tnbd.pmf(1) + pred = np.round(tnbd.var['p'], decimals=0) + print pred + print he_values + self.assertTrue(np.array_equal(he_values, pred)) + + # Test that fixing the bias leads to the proper mean + ks = np.linspace(.01, 5, num=100) + vals = np.arange(1,1000) + for k in ks: + ob = nbd_lt(tot_obs=500, n_samp=20, k=k) + pred_vals = ob.pmf(vals)[0] + bmean = sum(vals * pred_vals) + self.assertTrue(np.round(bmean, decimals=0) == 500 / 20.)
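The bias the last assertions above guard against comes from naive zero truncation: dividing a negative binomial pmf by 1 - P(0) inflates the mean from mu to mu / (1 - P(0)). The standalone sketch below is an illustration only, not code from this changeset; it uses scipy.stats.nbinom directly rather than macroeco's nbd_lt (whose internal correction is assumed, not shown) to quantify that inflation for the tot_obs=500, n_samp=20, k=2 case used in the test.

import numpy as np
from scipy import stats

# Naively truncate a negative binomial with mean mu and size k at zero.
mu, k = 500 / 20., 2.                 # same target mean (25) and k as the test above
p = k / (k + mu)                      # scipy parameterization: nbinom(k, p) has mean mu
p0 = stats.nbinom.pmf(0, k, p)
vals = np.arange(1, 2000)
pmf_trunc = stats.nbinom.pmf(vals, k, p) / (1 - p0)
print(np.sum(vals * pmf_trunc))       # ~ mu / (1 - p0) = 25.14, not 25
# A lower-truncated NBD that corrects for this bias should recover a mean of exactly mu.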
def test_fnbd(self): # Test that no error is thrown if a zero is passed fnbd().fit([[0,1,2,3,4,5,6]]) - + # TypeError if k is not given dist = fnbd(tot_obs=2300, n_samp=20) self.assertRaises(TypeError, dist.pmf, 45) @@ -538,7 +521,7 @@ def test_fnbd(self): plt.clf() # Based on Zillio and He 2010, Calculating a few pmf values by hand. - # Going to test the fnbd against these values. + # Going to test the fnbd against these values. def test_geo(self): # This is just a wrapper function for nbd. Already tested. Will just @@ -548,7 +531,7 @@ def test_geo(self): test = geo().fit([[0,0,0,1,4,67], [1,1,3,5,23]]) self.assertTrue(np.all(test.params['tot_obs'] == np.array([72, 33]))) self.assertTrue(np.all(test.params['n_samp'] == np.array([6,5]))) - + # Test that tot_obs is broadcast test = geo(tot_obs=456, n_samp = [34,56,12]) test.pmf(0) @@ -568,12 +551,12 @@ def test_geo(self): self.assertTrue(np.array_equal(test_geo[i], test_nbd[i])) def test_fgeo(self): - + # Test fit work and returns expected results test = fgeo().fit([[0,0,0,1,4,67], [1,1,3,5,23]]) self.assertTrue(np.all(test.params['tot_obs'] == np.array([72, 33]))) self.assertTrue(np.all(test.params['n_samp'] == np.array([6,5]))) - + # Test that tot_obs is broadcast test = fgeo(tot_obs=456, n_samp = [34,56,12]) test.pmf(0) @@ -594,47 +577,80 @@ def test_tgeo(self): + # Test against values from Harte 2011 + x_vals = [0.333, 0.434, .568, .707, .823, .901] + tg = tgeo(tot_obs=[1,2,4,8,16,32], n_samp=4) + tg.pmf(0) + pred_vals = np.round(tg.var['x'], 3) + print pred_vals + self.assertTrue(np.array_equal(x_vals, pred_vals)) + + # In Harte 2011 .143 is given as .125, but this is a mistake. Every + # other value is exactly as expected from the book + x_vals = [0.143, .220, .344, .505, .669, .801] + tg = tgeo(tot_obs=[1,2,4,8,16,32], n_samp=8) + tg.pmf(0) + pred_vals = np.round(tg.var['x'], 3) + print pred_vals + self.assertTrue(np.array_equal(x_vals, pred_vals)) + + x_vals = [0.067, .115, .201, .334, .5, .667] + tg = tgeo(tot_obs=[1,2,4,8,16,32], n_samp=16) + tg.pmf(0) + pred_vals = np.round(tg.var['x'], 3) + print pred_vals + self.assertTrue(np.array_equal(x_vals, pred_vals)) + # Test tgeo cdf is one dist = tgeo(n_samp=10, tot_obs=2345) self.assertTrue(np.round(dist.cdf(2345)[0][0], decimals=1) == 1.0) + # When n_samp < 2 weird things happen # Testing Lagrange multiplier against values generated by hand # [(n=60, a=.1), (n=340, a=.6), (n=34, a=.9), (n=12, a=.9), (n=2, .9), # (n=1, a=.1),(n=1, a=0.0001), x_vals = np.array([.8572, 1.0036, 1.2937, 1.8298, 5.6056, 0.1111]) - tg = tgeo(tot_obs=[60,340,34,12, 2, 1], + tg = tgeo(tot_obs=[60,340,34,12, 2, 1], n_samp=(1./.1, 1/.6, 1/.9, 1/.9, 1/.9, 1/.1)) tg.pmf(0) pred_vals = np.round(tg.var['x'], decimals=4) self.assertTrue(np.array_equal(x_vals, pred_vals)) - + x_vals = np.array([1.0e-4, 1.0e-5]) tg = tgeo(tot_obs=[1,1], n_samp=[1/.0001, 1/.00001]) tg.pmf(0) pred_vals = np.round(tg.var['x'], decimals=6) self.assertTrue(np.array_equal(x_vals, pred_vals)) - - # Optimizer is starting to round.
Tried brentq, bisect and fsolve x_vals = np.array([9, 11]) tg = tgeo(tot_obs=[1,10], n_samp=[1/.9, 1/.99]) tg.pmf(0) pred_vals = np.round(tg.var['x'], decimals=4) + print pred_vals self.assertTrue(np.array_equal(x_vals, pred_vals)) + # Test a case that was failing for Erica Newman + x_val = [.9896] + tg = tgeo(tot_obs=341, n_samp=4) + tg.pmf(0) + print tg.var['x'] + self.assertTrue(np.round(tg.var['x'], 4) == x_val[0]) + # Test that pdf and cdf give correct values check = dist.pmf([1,1,2,3,4,5,12,34,65]) self.assertTrue(dist.cdf(0)[0][0] == dist.pmf(0)[0][0]) - self.assertTrue(dist.cdf(23)[0][0] == + self.assertTrue(dist.cdf(23)[0][0] == np.sum(dist.pmf(np.arange(0,24))[0])) # Test that fit provides the correct number of tot_obs. Already have # tested generic fit method. dist = tgeo().fit(self.abund_list) self.assertTrue(len(dist.params['tot_obs']) == 4) - - + + def test_mete_sar_iter(self): - + # Check mete sar against EW values EWsar_down = np.array([8.79, 12.37, 16.71, 21.81, 27.59, 34]) #S = 23, N=3400, anchor_area=123, target_area=2000) @@ -659,21 +675,21 @@ def test_mete_sar_iter(self): , downscale=6) self.assertTrue(len(sar) == 11) - # Check that only halving or doubling results are returned when + # Check that only halving or doubling results are returned when # non_iter=True sar = mete_sar_iter(n_samp=34, tot_obs=1000).iter_vals([1,2,.5,.25,5,.4], non_iter=True) - self.assertTrue(len(sar) == 4) + self.assertTrue(len(sar) == 4) # Check errors are thrown sar = mete_sar_iter(n_samp=34, tot_obs=1000) - # Check that fit method fits correctly with two arguments passed + # Check that fit method fits correctly with two arguments passed sar = mete_sar_iter().fit(self.sad, self.sar) self.assertTrue(sar.params['n_samp'] == 155) self.assertTrue(sar.params['tot_obs'] == sum(np.arange(1, 156))) - # Check that fit method fits correctly with one argument passed + # Check that fit method fits correctly with one argument passed sar = mete_sar_iter().fit(self.sad) self.assertTrue(sar.params['n_samp'] == 155) self.assertTrue(sar.params['tot_obs'] == sum(np.arange(1, 156))) @@ -710,7 +726,7 @@ def test_power_law(self): sar = powerlaw().fit(self.sad, self.sar) g = sar.vals([1]) self.assertTrue(np.round(g['items'][0], decimals=0) == 200) - + # Check that c and z exist and check values of other parameters. sar.params['c']; sar.params['z'] self.assertTrue(sar.params['n_samp'] == 155) @@ -729,10 +745,10 @@ def test_power_law(self): self.assertTrue(not(np.array_equal(res1['x_over_y'], res2['x_over_y']))) self.assertTrue((np.array_equal(res1['z'], res2['z']))) - + def test_gen_sar(self): '''Testing that this actually works''' - + # Testing that gen_sar actually runs. Not sure what values to test it # against. @@ -752,7 +768,7 @@ def test_gen_sar(self): base2 = gnsar.iter_vals([1,2,.8,.2,.3], base=2) base3 = gnsar.iter_vals([1,2,.8,.2,.3], base=3) self.assertTrue(not(np.array_equal(base2['area'], base3['area']))) - + # Test that non_iter=False, returns only areas that match a_list a_list1 = [1,2,.5,.25,.1] @@ -771,7 +787,7 @@ def test_gen_sar(self): # Non_iter should be overridden sar_arr = gnsar.iter_vals(downscale=1, upscale=1, non_iter=True) self.assertTrue(len(sar_arr) == 3) - + # Test vals and fit performs properly. Test that fit ignores all args # but the first one too. 
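The tgeo x values checked against Harte 2011 in the hunks above satisfy the METE spatial constraint: with Pi(n) proportional to x**n for n = 0..tot_obs and area fraction a = 1/n_samp, x is the root of sum(n * x**n) / sum(x**n) = tot_obs * a. Below is a minimal standalone root-finding sketch (illustration only; the helper name tgeo_x and the brentq bracket are ad hoc choices, not macroeco's implementation, which may solve the constraint differently).

import numpy as np
from scipy.optimize import brentq

def tgeo_x(tot_obs, n_samp):
    # Solve sum(n * x**n) / sum(x**n) = tot_obs / n_samp over n = 0..tot_obs.
    a = 1. / n_samp
    n = np.arange(tot_obs + 1)

    def constraint(x):
        return np.sum(n * x ** n) / np.sum(x ** n) - tot_obs * a

    return brentq(constraint, 1e-9, 50.)   # x can exceed 1 when a > 0.5

print([round(tgeo_x(N, 4), 3) for N in [1, 2, 4, 8, 16, 32]])
# -> [0.333, 0.434, 0.568, 0.707, 0.823, 0.901], the Harte 2011 values tested above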
@@ -844,7 +860,7 @@ def test_psi(self): ps.rad() def test_nu(self): - + # Test error is raised when pdf called self.assertRaises(NotImplementedError, nu(n_samp=30, tot_obs=400, E=5000).pdf, 0) @@ -859,7 +875,7 @@ def test_nu(self): # Value with no support should equal 0 self.assertTrue(nudist.pmf(1)[0][0] == 0) self.assertTrue(nudist.cdf(1)[0][0] == 0) - + #Check that the last value in cdf is 1 self.assertTrue(np.round(nudist.cdf(E)[0][0], decimals=1) == 1) @@ -878,12 +894,11 @@ def test_nu(self): self.assertTrue(g.params['tot_obs'][0] == 28) self.assertTrue(g.params['n_samp'][0] == 7) self.assertTrue(g.params['E'][0] == 28) - + if __name__ == '__main__': unittest.main() - @@ -891,4 +906,5 @@ def test_nu(self): - + + diff --git a/mecodesktop.py b/mecodesktop.py new file mode 100755 index 0000000..287da55 --- /dev/null +++ b/mecodesktop.py @@ -0,0 +1,187 @@ +""" +Macroeco Desktop - A graphical interface for macroeco + +Open file dialog +http://wiki.wxpython.org/Getting%20Started + +Redirecting stdout and stderr +http://blog.pythonlibrary.org/2009/01/01/wxpython-redirecting-stdout-stderr/ + +Process and stdout to window (see Example at link below) +http://wxpython.org/Phoenix/docs/html/Process.html#process +""" + +import wx +import os, sys +import threading as thread + +from macroeco.main import main + +class RedirectText(object): + def __init__(self,aWxTextCtrl): + self.out=aWxTextCtrl + + def write(self,string): + wx.CallAfter(self.out.WriteText, string) + +# Class for window +class MainWindow(wx.Frame): + + def __init__(self, parent, title): + wx.Frame.__init__(self, parent, title=title) + self.t = None + self.filename = '' + self.dirname = '' + self.InitUI() + self.Show(True) + + + def InitUI(self): + + # Header + sizerhead = wx.BoxSizer(wx.HORIZONTAL) + head_font = wx.Font(18, wx.SWISS, wx.NORMAL, wx.BOLD) + heading = wx.StaticText(self, label='Macroeco Desktop') + sizerhead.Add(heading, 0, wx.EXPAND) + heading.SetFont(head_font) + + # Step 1 + sizer1 = wx.BoxSizer(wx.HORIZONTAL) + + param_text = wx.StaticText(self, + label=("1. Open or create a parameter file\n" + " File can be edited below and saved")) + self.open_button = wx.Button(self, label='Open') + self.new_button = wx.Button(self, label='New') + self.save_button = wx.Button(self, label='Save') + self.save_button.Enable(False) + + sizer1.Add(param_text, 1, wx.EXPAND) + sizer1.Add(self.open_button, 0, wx.EXPAND | wx.RIGHT, 6) + sizer1.Add(self.new_button, 0, wx.EXPAND | wx.RIGHT, 6) + sizer1.Add(self.save_button, 0, wx.EXPAND) + + # Bind open and new buttons + self.Bind(wx.EVT_BUTTON, self.OnOpen, self.open_button) + self.Bind(wx.EVT_BUTTON, self.OnNew, self.new_button) + self.Bind(wx.EVT_BUTTON, self.OnSave, self.save_button) + + # Param window + sizerpfile = wx.BoxSizer(wx.HORIZONTAL) + self.pfile = wx.TextCtrl(self, wx.ID_ANY, size=(600,300), + style=wx.TE_MULTILINE|wx.HSCROLL) + sizerpfile.Add(self.pfile, 1, wx.EXPAND) + + # Step 2 + sizer2 = wx.BoxSizer(wx.HORIZONTAL) + run_text = wx.StaticText(self, label='2. 
Run analysis') + self.run_button = wx.Button(self, label='Run') + sizer2.Add(run_text, 1, wx.EXPAND) + sizer2.Add(self.run_button, 0, wx.EXPAND) + + # Bind run button + self.Bind(wx.EVT_BUTTON, self.OnRun, self.run_button) + + # Output window + sizerlogbox = wx.BoxSizer(wx.HORIZONTAL) + self.logbox = wx.TextCtrl(self, wx.ID_ANY, size=(600,150), + style=wx.TE_MULTILINE|wx.TE_READONLY|wx.HSCROLL) + sizerlogbox.Add(self.logbox, 1, wx.EXPAND) + + # Redirect text here + redir = RedirectText(self.logbox) + sys.stdout = redir + sys.stderr = redir + + # Restore run button + self.Bind(wx.EVT_IDLE, self.OnIdle) + + # All items + + sizer_main = wx.BoxSizer(wx.VERTICAL) + sizer_main.Add(sizerhead, 0, wx.EXPAND | wx.ALL, 12) + + sizer_main.Add(sizer1, 0, wx.EXPAND | wx.ALL, 12) + sizer_main.Add(sizerpfile, 0, wx.EXPAND | wx.ALL, 12) + sizer_main.Add(sizer2, 0, wx.EXPAND | wx.ALL, 12) + sizer_main.Add(sizerlogbox, 0, wx.EXPAND | wx.ALL, 12) + + # Set up main layout + self.SetSizer(sizer_main) + self.SetAutoLayout(True) + sizer_main.Fit(self) + + def defaultFileDialogOptions(self): + ''' Return a dictionary with file dialog options that can be + used in both the save file dialog as well as in the open + file dialog. ''' + return dict(message='Choose a file', defaultDir=self.dirname, + wildcard='*.*') + + def askUserForFilename(self, **dialogOptions): + dialog = wx.FileDialog(self, **dialogOptions) + if dialog.ShowModal() == wx.ID_OK: + userProvidedFilename = True + self.filename = dialog.GetFilename() + self.dirname = dialog.GetDirectory() + else: + userProvidedFilename = False + dialog.Destroy() + return userProvidedFilename + + def OnOpen(self,e): + if self.askUserForFilename(style=wx.OPEN, + **self.defaultFileDialogOptions()): + parampath = os.path.join(self.dirname, self.filename) + f = open(parampath, 'r') + self.pfile.SetValue(f.read()) + f.close() + + self.save_button.Enable(True) + + self.logbox.SetValue('') + print "File opened at " + os.path.join(self.dirname, self.filename) + + def OnNew(self,e): + if self.askUserForFilename(style=wx.SAVE, + **self.defaultFileDialogOptions()): + self.OnSave(e, new_file=True) + self.save_button.Enable(True) + + def OnSave(self, event, new_file=False): + f = open(os.path.join(self.dirname, self.filename), 'w') + f.write(self.pfile.GetValue()) + f.close() + + self.logbox.SetValue('') + if new_file: + print "File created at "+os.path.join(self.dirname, self.filename) + else: + print "File saved at " + os.path.join(self.dirname, self.filename) + + def OnRun(self,e): + self.logbox.SetValue('') + self.RunMain() + + def RunMain(self): + self.run_button.Enable(False) # Turn the run button off + parampath = os.path.join(self.dirname, self.filename) + self.t = thread.Thread(target=main, args=(parampath,)) + self.t.daemon = True # Kills thread if app exits + self.t.start() + + def OnIdle(self, event): + if self.t: # If a thread has been started + if not self.t.is_alive(): # And it's not alive + self.run_button.Enable(True) # Turn the run button on + +if __name__ == '__main__': + # To execute, run `pythonw -m desktop path/to/parameters.txt` + # With arg, execute main(arg), without arg open GUI window + if len(sys.argv) > 1: + param_path = sys.argv[1] + main(param_path) + else: + app = wx.App(False) + frame = MainWindow(None, 'Macroeco Desktop') + app.MainLoop() diff --git a/mecodesktop_mac.spec b/mecodesktop_mac.spec new file mode 100644 index 0000000..17d38a9 --- /dev/null +++ b/mecodesktop_mac.spec @@ -0,0 +1,28 @@ +# -*- mode: python -*- +a = 
Analysis(['mecodesktop.py'], + pathex=['/Users/jkitzes/Projects/macroeco'], + hiddenimports=['scipy.special._ufuncs_cxx'], + hookspath=None, + runtime_hooks=None) +pyz = PYZ(a.pure) +exe = EXE(pyz, + a.scripts, + exclude_binaries=True, + name='mecodesktop', + debug=False, + strip=None, + upx=True, + console=False ) +coll = COLLECT(exe, +a.binaries + [('libwx_osx_cocoau-3.0.0.0.0.dylib', + '/Users/jkitzes/anaconda/pkgs/wxpython-3.0-py27_0/lib/libwx_osx_cocoau-3.0.0.0.0.dylib', + 'BINARY')], + a.zipfiles, + a.datas, + strip=None, + upx=True, + name='mecodesktop') +app = BUNDLE(coll, + name='MacroecoDesktop.app', + icon='icon.icns') + diff --git a/output.py b/output.py deleted file mode 100644 index 06d7ed7..0000000 --- a/output.py +++ /dev/null @@ -1,1193 +0,0 @@ -#!/usr/bin/python - -'''This module provides functions for outputting results of macroeco -analyses''' - - -from __future__ import division -import matplotlib.pyplot as plt -import numpy as np -import logging -from macroeco.utils.form_func import output_form, add_field -import copy as cp -import os -import shutil - - -readme_info_plots =\ -''' -FOLDER DESCRIPTION -------------------- - -The folder {3} contains {0} files. There are {1} {4} represented as png -files and {2} csv files which contain the data required to generate each -plot. The csv files have identical names to the png files to which they -correspond. Each file name is a concatenation of the following strings: -analysis name, run name, data name, and {5}. An additional identifier is -appended to the file name after {5} in order to make each file unique. It is -either a species identifier or a number. - -On the right hand side of each plot, you will see a string that begins -'Criteria for plot'. The criteria are either a species name or string that -looks like - -'y': [('>=', 0.0), ('<', 150.0)], 'x': [('>=', 0.0), ('<', 50.0)] - -This can be interpreted as follows. The plot under consideration has 'y' values -greater than or equal to 0 and less than 150 and 'x' values greater than or -equal to 0 and less than 50. Similarly a criteria string of the form - -'year' : ('==' , 1998) - -can be interpreted as the plot under consideration has 'year' values equal to -1998. The criteria is determined by how you decided to divide your plot for the -analysis. A criteria string of the form - -'temperature' : ('==', 'cool') - -can be interpreted as the plot under consideration has 'temperature' values -equal to 'cool'. ''' - -readme_info_summary=\ -u""" -FOLDER DESCRIPTION ------------------- - -The folder {0} contains {1} txt file(s) and {1} csv file(s). Each .txt file -contains a summary for the plot generate by the criteria at the header of the -file. The criteria are either a species name or string that looks like - -'y': [('>=', 0.0), ('<', 150.0)], 'x': [('>=', 0.0), ('<', 50.0)] - -This can be interpreted as follows. The plot under consideration has 'y' values -greater than or equal to 0 and less than 150 and 'x' values greater than or -equal to 0 and less than 50. Similarly a criteria string of the form - -'year' : ('==' , 1998) - -can be interpreted as the plot under consideration has 'year' values equal to -1998. - -Each txt file has a corresponding csv plot with the AIC values in tabular form -for easy analysis. - -Each summary file contains summary statistics for the observed data and each -distribution to which the observed data was compared. Each file name is a -concatenation of the following strings: analysis name, data name and -summary_table or AIC_table. 
An additional identifier is appended to the file -name after summary_table in order to make each file unique. It is either a -species identifier, a number, or both.""" - -readme_info_rarity =\ -''' -FOLDER DESCRIPTION ------------------- - -The folder {0} contains {1} csv files. Each file contains the -columns 'data_name', 'criteria', 'observed', and any number of columns with -distribution names. These are the distributions to which the data was -compared. The column data_name gives the name of the data being examined, the -column criteria describes the specifications that made the given plot, the -remaining columns describe the number of items that had a value below a -prespecified minimum. The prespecified minimum can be found in the file name -immediately after '_<=_'. Each file name is a concatenation of the following -strings: analysis name, data name and 'rarity_<=_' some minimum. -''' - -readme_info_sar=\ -''' -FOLDER DESCRIPTION ------------------- - -The folder {0} contains {1} png files and {2} csv files. The png file(s) are -log-log SAR-EAR plot(s) with area_fraction on the x-axis and species on the y-axis. -The names of the png file(s) are a concatenation of the following strings: -analysis name, run_name, data_name, and SAR-EAR_plot. A number is appended to the -end of the plot to ensure the filename is unique. The csv -files contain the data required to make the given plot(s). Each csv file -contains two columns, species and area_fraction. area_fraction assigns the -base area a value of 1 and represents all other areas as a fraction of the base -area. The csv file name(s) are a concatenation of the following strings: -analysis_name, run_name, data_name, SAR-EAR_plot_, a unique number, and the SAR-EAR -name. - -''' - - -class DistributionOutput(object): - ''' - This formats and outputs analyses on distributions - - ''' - - def __init__(self, out_dir): - ''' - Parameters - ---------- - out_dir : string - String appended to output directory - ''' - - self.out_dir = out_dir - self.urns = 'Urns' - self.balls = 'Balls' - self.Nmax = 'Nmax' - self.rad_x_axis = 'Rank' - self.rad_y_axis = 'Abundance' - self.cdf_x_axis = 'Abundance' - self.cdf_y_axis = 'Cumulative Probability' - self.variable = 'abundance' - self.dist_name = '' - - - def write_summary_table(self, smry, criteria=None, species=None): - ''' - Parameters - --------- - smry : dict - A dictionary as returned by the function compare_summary within the - CompareDistribution class. - criteria : array-like object - An array-like object in which contains either string or dicts that - tell how each dataset was generated. Describes the subsetting of - an sad and the species ID of an ssad. - species : array_like object - If not None, must be the an array-like object of the same length as - criteria, but containing species strings. Can only be used if - criteria is also not None. 
- - Notes - ----- - Writes out a formatted txt file - - ''' - # Make output folder - folder_name = self.dist_name + '_summary_statistics_' + self.out_dir - make_directory(folder_name) - - tot_sad = len(smry['observed']['balls']) - if criteria != None: - assert len(criteria) == tot_sad, "len(criteria) must equal" + \ - " number of data arrays under consideration" - if species != None: - assert len(species) == tot_sad, "len(species) must equal" + \ - " number of data arrays under consideration" - ob = smry['observed'] - - count = 0 - for i in xrange(tot_sad): - if criteria != None and species != None: - filename = os.path.join(folder_name, self.out_dir + \ - '_summary_table_' + str(species[i]) + '_' + str(i) + - '.txt') - filename_aic = os.path.join(folder_name, self.out_dir + \ - '_AIC_table_' + str(species[i]) + '_' + str(i)) - - elif criteria != None and np.all([type(crt) != dict for crt in - criteria]): - filename = os.path.join(folder_name, self.out_dir + \ - '_summary_table_' + str(criteria[i]) + '.txt') - filename_aic = os.path.join(folder_name, self.out_dir + \ - '_AIC_table_' + str(criteria[i])) - - - else: - filename = os.path.join(folder_name, self.out_dir + - '_summary_table_' + str(i) + '.txt') - filename_aic = os.path.join(folder_name, self.out_dir + - '_AIC_table_' + str(i)) - - - fout = open(filename, 'w') - logging.info('Writing summary table %s' % filename) - - - if criteria != None and species != None: - - fout.write('CRITERIA: ' + str(criteria[i]) + '\n' + - 'SPECIES: ' + str(species[i]) + '\n\n') - - elif criteria != None: - fout.write('CRITERIA: ' + str(criteria[i]) + '\n\n') - - else: - fout.write('CRITERIA: NONE ' + str(i) + '\n\n') - - # Getting rarity - ob_rare = {} - for mins in ob['tot_min'].iterkeys(): - ob_rare['<=' + str(mins)] = ob['tot_min'][mins][i] - - fout.write('EMPIRICAL VALUES:\n' + self.urns + ' = ' + - str(ob['urns'][i]) + '\n' + self.balls + ' = ' + - str(ob['balls'][i]) + '\nObserved ' + self.Nmax + ' = ' + - str(ob['max'][i]) + '\nObserved Rarity = ' + - str(ob_rare) + '\n\n') - - - # Also output AIC values in for each table. Could add other - # measures to this table as well. 
- # Might break this out later - aic_vals = {} - - for kw in smry.iterkeys(): - if kw != 'observed': - dt= smry[kw] - # set relevant aic values for table output - aic_vals[kw]={'AIC_weights' : dt['aic_w'][i], 'Delta_AIC' : - dt['aic_d'][i], 'Parameter_number' : - dt['par_num'][i], 'Corrected_AIC' : - dt['aic'][i]} - # Getting rarity - dt_rare = {} - for mins in dt['tot_min'].iterkeys(): - dt_rare['<=' + str(mins)] = dt['tot_min'][mins][i] - dt_vars = {} - for key in dt['vars'].iterkeys(): - dt_vars[key] = dt['vars'][key][i] - - fout.write('PREDICTED DISTRIBUTION : ' + kw + '\n' + - self.urns + ' = ' + str(dt['urns'][i]) + '\n' + - self.balls + ' = ' + str(dt['balls'][i]) + - '\nAIC = ' + str(dt['aic'][i]) + '\nDelta_AIC = ' + - str(dt['aic_d'][i]) + '\nAIC_weight = ' + - str(dt['aic_w'][i]) + '\nNumber of Parameters = ' + - str(dt['par_num'][i]) + '\nPredicted '+ self.Nmax + ' = ' + - str(dt['max'][i]) + '\nPredicted Rarity = ' + - str(dt_rare) + '\nOther Variables = ' + - str(dt_vars) + '\n\n') - fout.close() - count += 1 - - # Make and print AIC table - dtype = [('Model', 'S30'), ('Parameter_number', np.float), - ('Corrected_AIC', np.float), ('AIC_weights', np.float), - ('Delta_AIC', np.float)] - aic_array = np.empty(len(aic_vals), dtype=dtype) - for j, model_name in enumerate(aic_vals.iterkeys()): - aic_array['Model'][j] = model_name - aic_array['Parameter_number'][j] =\ - aic_vals[model_name]['Parameter_number'] - aic_array['Corrected_AIC'][j] =\ - aic_vals[model_name]['Corrected_AIC'] - aic_array['AIC_weights'][j] =\ - aic_vals[model_name]['AIC_weights'] - aic_array['Delta_AIC'][j] =\ - aic_vals[model_name]['Delta_AIC'] - output_form(aic_array, filename_aic) - - fout = open(os.path.join(folder_name, 'README'), 'w') - fout.write(readme_info_summary.format(folder_name, count)) - fout.close() - - - - def plot_rads(self, rads, criteria=None, species=None): - ''' - Plotting the observed and predicted rank abundance distributions - - Parameters - ---------- - rads : dict - A dictionary that is returned from the function compare_rads in the - CompareDistribution class. - - criteria : list of objects - If not none, the objects in criteria will be printed as strings in - the plots and/or file names. They will only be included in the - file name if they are strings. - - species : list - A list of species names to be included in the csv file. Must - contain the same number of iterables - - Notes - ----- - Saves RAD plots to given out_dir. Saves as many plots as there are - observed distributions. 
- - ''' - folder_name = 'rank_abundance_plots_' + self.out_dir - make_directory(folder_name) - - tot_sad = len(rads['observed']) - recs = make_rec_from_dict(rads, tot_sad, species=species) - - if criteria != None: - assert len(criteria) == tot_sad, "len(criteria) must equal" + \ - " number of data arrays under consideration" - count = 0 - for i, data in enumerate(recs): - - # Plot all columns of the rec array - plot_rec_columns(data) - plt.semilogy() - plt.ylabel('Log ' + self.rad_y_axis) - plt.xlabel(self.rad_x_axis) - - if criteria != None and np.all([type(crt) != dict for crt in - criteria]): - plt.title('Rank abundance distribution for ' + str(criteria[i])) - filename = os.path.join(folder_name, self.out_dir + - '_rank_abundance_plot_' + str(criteria[i])) - - logging.info('Saving figure and csv ' + filename) - plt.savefig(filename) - output_form(recs[i], filename) - count += 2 - - elif criteria != None and np.all([type(crt) == dict for crt in - criteria]): - plt.title('Rank abundance distribution') - plt.figtext(.97, .5, 'Criteria for plot: ' + str(criteria[i]), - rotation='vertical', size=8, - horizontalalignment='center', - verticalalignment='center') - - filename = os.path.join(folder_name, self.out_dir + - '_rank_abundance_plot_' + str(i)) - logging.info('Saving figure ' + filename) - plt.savefig(filename) - output_form(recs[i], filename) - count += 2 - - else: - plt.title('Rank abundance distribution: plot number ' + str(i)) - - filename = os.path.join(folder_name, self.out_dir + - '_rank_abundance_plot_' + str(i)) - logging.info('Saving figure ' + filename) - plt.savefig(filename) - output_form(recs[i], filename) - count += 2 - - plt.clf() - - fout = open(os.path.join(folder_name, 'README'), 'w') - fout.write(readme_info_plots.format(count, count /2, count/2, - folder_name, 'rank abundance plots (RAD)', 'rank_abundance_plot')) - fout.close() - - - def plot_cdfs(self, cdfs, obs_data, criteria=None, species=None): - - ''' - - Plots observed vs predicted cdfs and returns a csv file with values - used for plotting. - - - Parameters - ---------- - cdfs : dict - A dictionary that is returned from the function compare_cdfs in the - CompareDistribution class. - - obs_data : list - A list of arrays. The observed data - (CompareDistribution.observed_data) - - criteria : dict or None - The criteria for splitting the data. Can be species names. If not - None, the criteria will be printed on the plots - - species : array-like object or None - The species names that will be added to the csv files. 
- - ''' - # Make directory - folder_name = self.dist_name + '_cdf_plots_' + self.out_dir - make_directory(folder_name) - - # SEDOutput could pass in tuple - spp = None - if type(cdfs) == type((1,)) and len(cdfs) == 2: - spp = cdfs[1] - cdfs = cdfs[0] - - tot_sad = len(cdfs['observed']) - recs = make_rec_from_dict(cdfs, tot_sad, add_rank=False) - if criteria != None: - assert len(criteria) == tot_sad, "len(criteria) must equal" + \ - " number of data arrays under consideration" - - count = 0 - for i, data in enumerate(recs): - - names = data.dtype.names - for nm in names: - fig = plt.plot(np.sort(obs_data[i]), np.sort(data[nm]), '-o') - - # Formatting - fig[0].axes.xaxis.tick_bottom() - fig[0].axes.yaxis.tick_left() - ylim = list(plt.ylim()) - if ylim[0] == 0: - ylim[0] = -.1 - plt.ylim((ylim[0], 1.1)) - xlim = plt.xlim() - plt.xlim((.9, xlim[1] + 10)) - plt.legend(names, loc='best') - plt.semilogx() - plt.ylabel(self.cdf_y_axis) - plt.xlabel('Log ' + self.cdf_x_axis) - - # Add observed to cdf array - if species != None: - sorted_ab, sorted_spp = sort_rank_abund([obs_data[i]], - [species[i]]) - n_rec = add_field(data, [(self.variable, np.float)]) - n_rec = add_field(n_rec, [('species', 'S40')]) - n_rec[self.variable] = sorted_ab[0] - n_rec['species'] = sorted_spp[0] - else: - n_rec = add_field(data, [(self.variable, np.float)]) - n_rec[self.variable] = np.sort(obs_data[i]) - - # Used for SSAD - if criteria != None and spp == None and np.all([type(crt) != dict - for crt in criteria]): - - plt.title('Cumulative density function for species ' + str(criteria[i])) - - filename = os.path.join(folder_name, self.out_dir + - '_cdf_plot_' + str(criteria[i])) - logging.info('Saving figure and csv ' + filename) - plt.savefig(filename) - output_form(n_rec, filename) - count += 2 - - # Used for SAD - elif criteria != None and spp == None and np.all([type(crt) == dict - for crt in criteria]): - plt.title('Cumulative Density Function') - plt.figtext(.97, .5, 'Criteria for plot: ' + str(criteria[i]), - rotation='vertical', size=8, - horizontalalignment='center', - verticalalignment='center') - - filename = os.path.join(folder_name, self.out_dir + - '_cdf_plot_' + str(i)) - logging.info('Saving figure ' + filename) - plt.savefig(filename) - output_form(n_rec, filename) - count += 2 - - # Used for SED - elif criteria != None and spp != None and np.all([type(crt) == dict - for crt in criteria]): - - plt.title('Cumulative Density Function for species ' + - str(spp[i])) - - plt.figtext(.97, .5, 'Criteria for plot: ' + str(criteria[i]), - rotation='vertical', size=8, - horizontalalignment='center', - verticalalignment='center') - - filename = os.path.join(folder_name, self.out_dir + - '_cdf_plot_' + str(spp[i]) + '_' + str(i)) - logging.info('Saving figure ' + filename) - plt.savefig(filename) - output_form(n_rec, filename) - count += 2 - - - else: - plt.title('CDF: plot number ' + str(i)) - filename = os.path.join(folder_name, self.out_dir + - '_cdf_plot_' + str(i)) - logging.info('Saving figure and csv ' + filename) - plt.savefig(filename) - output_form(n_rec, filename) - count += 2 - - plt.clf() - - fout = open(os.path.join(folder_name, 'README'), 'w') - fout.write(readme_info_plots.format(count, count/2, count/2, - folder_name, 'cumulative density plots (cdf)', 'cdf_plot')) - fout.close() - -class SADOutput(DistributionOutput): - ''' - Derived class for SAD output - ''' - - def __init__(self, out_dir): - ''' - Parameters - ---------- - out_dir : string - String appended to output directory - - ''' - 
self.out_dir = out_dir - self.urns = 'Species' - self.balls = 'Total Individuals' - self.Nmax = 'Nmax' - self.rad_x_axis = 'Rank' - self.rad_y_axis = 'Abundance' - self.cdf_x_axis = 'Abundance' - self.cdf_y_axis = 'Cumulative Probability' - self.variable = 'abundance' - self.dist_name = 'sad' - -class SSADOutput(DistributionOutput): - ''' - Derived class for SSAD output - ''' - - def __init__(self, out_dir): - ''' - Parameters - ---------- - out_dir : string - String appended to output directory - - ''' - self.out_dir = out_dir - self.urns = 'Cells' - self.balls = 'Individuals' - self.Nmax = 'Nmax' - self.rad_x_axis = 'Rank' - self.rad_y_axis = 'Abundance' - self.cdf_x_axis = 'Abundance' - self.cdf_y_axis = 'Cumulative Probability' - self.variable = 'abundance' - self.dist_name = 'ssad' - -class SAROutput(object): - ''' - This object interacts with CompareSARCurves - ''' - - def __init__(self, out_dir): - ''' - Parameters - ---------- - out_dir : string - String appended to output directory - ''' - self.out_dir = out_dir - - def plot_sars(self, sars, names=[], form='sar'): - ''' - Plots observed vs predicted sars - - Parameters - ---------- - sars : list of dicts - The output of CompareSARCurve method compare_curves - - names : list or strings - If not None, names is a list of the same length as sars. Gives the - desired names for the plots. - - ''' - - if form == 'sar': - file_str = '_SAR_plot_' - ylab = 'log(Species Number)' - stype = 'species' - folder_name = 'sar_plots_' + self.out_dir - make_directory(folder_name) - elif form == 'ear': - file_str = '_EAR_plot_' - ylab = 'log(Endemic Species Number)' - stype = 'endemic_species' - folder_name = 'ear_plots_' + self.out_dir - make_directory(folder_name) - else: - raise ValueError("Parameter 'form' must be 'ear' or 'sar' not '%s'" - % form) - - if len(names) != 0: - assert len(names) == len(sars); "Length of names must equal" + \ - "length of sars" - count = 0 - for i, sar in enumerate(sars): - filename = os.path.join(folder_name, self.out_dir + file_str + - str(i)) - legend = [] - for kw in sar.iterkeys(): - legend.append(kw) - if kw == 'observed': - fig = plt.plot(sar[kw]['area'], sar[kw]['items'], '-o') - else: - fig = plt.plot(sar[kw]['area'], sar[kw]['items']) - - # Change dtype names and output - defnm = sar[kw].dtype.names - sar[kw].dtype.names = (stype, 'area_fraction') - output_form(sar[kw], filename + '_' + kw) - sar[kw].dtype.names = defnm - - # Plot formatting - fig[0].axes.xaxis.tick_bottom() - fig[0].axes.yaxis.tick_left() - - plt.loglog() - plt.legend(tuple(legend), loc='best') - plt.xlabel('log(Area Fraction)') - plt.ylabel(ylab) - if len(names) != 0: - plt.title(names[i]) - else: - plt.title(form.upper() + ' plot %i' % (i)) - filename = os.path.join(folder_name, self.out_dir + file_str + - str(i)) - logging.info('Saving figure ' + filename) - plt.savefig(filename) - plt.clf() - count += 1 - - fout = open(os.path.join(folder_name, 'README'), 'w') - fout.write(readme_info_sar.format(folder_name, count, count * len(sar))) - fout.close() - - -class ASEDOutput(DistributionOutput): - ''' - Class outputs the average species energy distributions by interacting with - CompareASED - ''' - - def __init__(self, out_dir): - ''' - Parameters - ---------- - out_dir : string - String appended to output directory - ''' - self.out_dir = out_dir - self.urns = 'Species' - self.balls = 'Sum of Species Average Energies' - self.Nmax = 'Max Average Energy' - self.cdf_x_axis = 'Average Energy' - self.cdf_y_axis = 'Cumulative Probability' - 
self.variable = 'average energy' - self.dist_name = 'ased' - - def plot_rads(self, *args, **kwargs): - ''' - Not implemented for this class object - ''' - - raise NotImplementedError('plot_rads is not implemented for object %s' - % (self.__class__.__name__)) - - def plot_reds(self, reds, criteria=None, species=None): - ''' - Plotting the observed and predicted rank abundance distributions - - Parameters - ---------- - reds : dict - A dictionary that is returned from the function compare_reds in the - CompareASED class. - - criteria : list of objects - If not none, the objects in criteria will be printed a strings in - the plots and file names. - - Notes - ----- - Saves RAD plots to given out_dir. Saves as many plots as there are - observed distributions. - - ''' - folder_name = 'ased_rank_energy_plots_' + self.out_dir - make_directory(folder_name) - - tot_sad = len(reds['observed']) - recs = make_rec_from_dict(reds, tot_sad, species=species) - - if criteria != None: - assert len(criteria) == tot_sad, "len(criteria) must equal" + \ - " number of data arrays under consideration" - count = 0 - for i, data in enumerate(recs): - - # Plot all columns of the rec array - plot_rec_columns(data) - plt.semilogy() - plt.ylabel('Log Energy') - plt.xlabel('Rank') - - if criteria != None and np.all([type(crt) != dict for crt in - criteria]): - plt.title('ASED rank energy distribution for ' + - str(criteria[i])) - filename = os.path.join(folder_name, self.out_dir + - '_rank_abundance_plot_' + str(criteria[i])) - - logging.info('Saving figure and csv ' + filename) - plt.savefig(filename) - output_form(recs[i], filename) - count += 2 - - elif criteria != None and np.all([type(crt) == dict for crt in - criteria]): - plt.title('ASED rank energy distribution') - plt.figtext(.97, .5, 'Criteria for plot: ' + str(criteria[i]), - rotation='vertical', size=8, - horizontalalignment='center', - verticalalignment='center') - - filename = os.path.join(folder_name, self.out_dir + - '_ased_rank_energy_plot_' + str(i)) - logging.info('Saving figure ' + filename) - plt.savefig(filename) - output_form(recs[i], filename) - count += 2 - - else: - plt.title('ASED rank energy distribution: plot number ' + str(i)) - - filename = os.path.join(folder_name, self.out_dir + - '_ased_rank_energy_plot_' + str(i)) - logging.info('Saving figure ' + filename) - plt.savefig(filename) - output_form(recs[i], filename) - count += 2 - - plt.clf() - - fout = open(os.path.join(folder_name, 'README'), 'w') - fout.write(readme_info_plots.format(count, count /2, count/2, - folder_name, - 'average species energy distribution (ASED) rank' + - ' energy plots', 'ased_rank_energy_plot')) - fout.close() - - - -class IEDOutput(DistributionOutput): - ''' - Class outputs individual energy distributions by interacting with - CompareIED - - ''' - - def __init__(self, out_dir): - ''' - Parameters - ---------- - out_dir : string - String appended to output directory - ''' - self.out_dir = out_dir - self.urns = 'Individuals' - self.balls = 'Energy' - self.Nmax = 'Max Energy' - self.cdf_x_axis = 'Energy' - self.cdf_y_axis = 'Cumulative Probability' - self.variable = 'energy' - self.dist_name = 'ied' - - def plot_rads(self, *args, **kwargs): - ''' - Not implemented for this class object - ''' - - raise NotImplementedError('plot_rads is not implemented for object %s' - % (self.__class__.__name__)) - - - def plot_reds(self, reds, criteria=None): - ''' - Saves plot and csv file with predicted and empirical rank energy data - - Parameters - ---------- - reds : tuple - 
The output from the CompareIED.compare_rads method - criteria : list or None - A list of dicts with the criteria for divisions. See Patch.sad - - Output - ------ - This method outputs both a plot and a csv that compare observed and - predicted individual rank energy curves for the entire community at the - given subset. - - ''' - folder_name = 'ied_rank_energy_plots_' + self.out_dir - make_directory(folder_name) - - - tot_reds = len(reds['observed']) - recs = make_rec_from_dict(reds, tot_reds) - if criteria != None: - assert len(criteria) == tot_reds, "len(criteria) must equal" + \ - " number of reds under consideration" - count = 0 - for i, data in enumerate(recs): - - #Plot all data in a single rec array - plot_rec_columns(data) - - # Make appropriate title for figure - if criteria != None: - plt.title('Rank Energy Distribution') - plt.figtext(.97, .5, 'Criteria for plot: ' + str(criteria[i]), - rotation='vertical', size=8, - horizontalalignment='center', - verticalalignment='center') - else: - plt.title('Rank Energy Distribution') - plt.figtext(.97, .5, 'Plot number: ' + str(i), - rotation='vertical', size=8, - horizontalalignment='center', - verticalalignment='center') - - plt.loglog() - plt.ylabel('Log Energy') - plt.xlabel('Log Rank') - - filename = os.path.join(folder_name, self.out_dir + - '_ied_rank_energy_' + str(i)) - - logging.info('Saving figure ' + filename) - plt.savefig(filename) - plt.clf() - output_form(recs[i], filename) - count += 2 - - fout = open(os.path.join(folder_name, 'README'), 'w') - fout.write(readme_info_plots.format(count, count/2, count/2, - folder_name, - 'individual energy distribution (IED) rank energy plots', - 'ied_rank_energy')) - fout.close() - -class SEDOutput(DistributionOutput): - ''' - Class outputs species-level energy distributions by interacting with - CompareSED - - ''' - - def __init__(self, out_dir): - ''' - Parameters - ---------- - out_dir : string - String appended to output directory - ''' - self.out_dir = out_dir - self.urns = 'Individuals in Species' - self.balls = 'Energy' - self.Nmax = 'Max Energy' - self.cdf_x_axis = 'Energy' - self.cdf_y_axis = 'Cumulative Probability' - self.variable = 'energy' - self.dist_name = 'sed' - - def plot_rads(self, *args, **kwargs): - ''' - Not implemented for this class object - ''' - - raise NotImplementedError('plot_rads is not implemented for object %s' - % (self.__class__.__name__)) - - - def plot_reds(self, reds, criteria=None): - ''' - Saves plot and csv file with predicted and empirical rank energy data - - Parameters - ---------- - reds : tuple - The output from the CompareSED.compare_rads method with - return_spp=True. - criteria : list or None - A list of dicts with the criteria for divisions. See Patch.sad - - Output - ------ - This method outputs both a plot and a csv that compare observed and - predicted species-level rank energy curves. - - ''' - folder_name = 'sed_rank_energy_plots_' + self.out_dir - make_directory(folder_name) - - if type(reds) != type((1,)): - raise TypeError("Input reds must be a tuple. 
Set return_spp=True" + - " in CompareSED.compare_rads") - spp = reds[1] - tot_reds = len(reds[0]['observed']) - recs = make_rec_from_dict(reds[0], tot_reds) - if criteria != None: - assert len(criteria) == tot_reds, "len(criteria) must equal" + \ - " number of reds under consideration" - count = 0 - for i, data in enumerate(recs): - - plot_rec_columns(data) - plt.semilogx() - plt.ylabel('Energy') - plt.xlabel('Log Rank') - - if spp != None: - if criteria != None: - plt.title('Rank Energy Distribution for species ' + - str(spp[i])) - plt.figtext(.97, .5, 'Criteria for plot: ' + - str(criteria[i]), rotation='vertical', size=8, - horizontalalignment='center', - verticalalignment='center') - else: - plt.title('Rank Energy Distribution for species ' + - str(spp[i])) - - filename = os.path.join(folder_name, self.out_dir + - '_sed_rank_energy_' + str(spp[i]) + '_' + str(i)) - - logging.info('Saving figure ' + filename) - plt.savefig(filename) - output_form(recs[i], filename) - count += 2 - - elif spp == None: - if criteria != None: - plt.title('Criteria: ' + str(criteria[i])) - else: - plt.title('Plot number ' + str(i)) - - filename = os.path.join(folder_name, self.out_dir + - '_sed_rank_energy_' + str(i)) - logging.info('Saving figure ' + filename) - plt.savefig(filename) - output_form(recs[i], filename) - plt.clf() - - fout = open(os.path.join(folder_name, 'README'), 'w') - fout.write(readme_info_plots.format(count, count/2, count/2, - folder_name, - 'species-level energy distribution (SED) rank energy plots', - 'sed_rank_energy')) - fout.close() - -class OutputRarity(object): - ''' - This object accepts output from the Compare.compare_rarity method to - output rarity - - ''' - - def __init__(self, out_dir): - ''' - - Parameters - ---------- - out_dir : string - String appended to output directory - - ''' - - self.out_dir = out_dir - - def output_rarity(self, rarity, data_path, data, criteria=None): - ''' - Outputs csv files containing rarity measures - - Parameters - ---------- - rarity : a CompareRarity object - - data_path : str - data_path string for identifying data in csv file - - data : list - A list of observed species abundance distributions - - criteria : dict or None - The criteria for how the plot was split - - ''' - folder_name = 'rarity_values_' + self.out_dir - make_directory(folder_name) - - keys = list(rarity.viewkeys()) - dtype = [(kw, np.int) for kw in keys] - dtype.insert(0, ('criteria', 'S90')) # arbitrary length - dtype.insert(0, ('data_name', 'S90')) # arbitrary length - - # Get a list of my minimums - rare_list = [] - mins = list(rarity['observed'].viewkeys()) - for mn in mins: - rarity_array = np.empty(len(data), dtype=dtype) - rarity_array['criteria'] = criteria - nm = os.path.split(data_path)[1].split('.')[0] - rarity_array['data_name'] = np.repeat(nm, len(rarity_array)) - for kw in keys: - rarity_array[kw] = rarity[kw][mn] - rare_list.append(rarity_array) - - # Output results - count = 0 - for i, rare in enumerate(rare_list): - filename = os.path.join(folder_name, self.out_dir + '_rarity_<=_' + - str(mins[i])) - logging.info('Saving rarity data ' + filename) - output_form(rare, filename) - count += 1 - - fout = open(os.path.join(folder_name, 'README'), 'w') - fout.write(readme_info_rarity.format(folder_name, count)) - fout.close() - -def make_rec_from_dict(dist_dict, num, species=None, dt=np.float, add_rank=True): - ''' - Makes a structured/rec array from a dictionary - - Parameters - ---------- - dist_dict : dict - A dictionary with each keyword referencing a list 
of arrays - - num : int - Number of rec_arrays to return in list - - species : None or list of iterables - If not None, species should be a list of iterables that is the same - length as the list of iterables in any keyword in dist_dict. - - Returns - ------- - : structured array - - ''' - - # Check that species has the appropriate length - if species != None: - species = cp.deepcopy(species) - for val in dist_dict.itervalues(): - if len(species) != len(val): - raise TypeError('Species must contain the same number of ' + - 'iterables as each value in dist_dict') - # Sort Observed and species list - if species != None: - dist_dict['observed'], species = sort_rank_abund(dist_dict['observed'], - species) - recs = [] - names = list(dist_dict.viewkeys()) - dtype = zip(names, np.repeat(dt, len(names))) - if species != None: - dtype.insert(0, ('species', 'S40')) - if add_rank: - dtype.insert(0, ('rank', dt)) - for i in xrange(num): - temp = np.empty(len(dist_dict[names[0]][i]), dtype=dtype) - if species != None: - temp['species'] = species[i] - if add_rank: - temp['rank'] = np.arange(1,len(temp) + 1)[::-1] - for kw in dist_dict.iterkeys(): - temp[kw] = np.sort(dist_dict[kw][i]) - recs.append(temp) - return recs - -def sort_rank_abund(abund_list, spp_list): - ''' - Sorts and returns two lists based on abundance - - Parameters - ---------- - abund_list : list of arrays - - spp_list : list of arrays - - Returns - ------- - :tuple - sorted_abund, sorted_spp - - ''' - - assert len(abund_list) == len(spp_list), 'Lengths of arguments not equal' - assert np.all([len(a) == len(b) for a,b in zip(abund_list, spp_list)]),\ - 'Lengths of all corresponding iterables not equal' - abund_list = [np.array(t) for t in abund_list] - spp_list = [np.array(t) for t in spp_list] - - sorted_abund = [] - sorted_spp = [] - for i in xrange(len(abund_list)): - temp = np.array(zip(abund_list[i], spp_list[i]), dtype=[('a', - abund_list[i].dtype), ('s', spp_list[i].dtype)]) - temp_sorted = np.sort(temp, order='a') - sorted_abund.append(temp_sorted['a']) - sorted_spp.append(temp_sorted['s']) - - return sorted_abund, sorted_spp - -def plot_rec_columns(rec_array): - ''' - Function plots the columns in a rec array. 
- ''' - - # Available plotting symbols - plot_symbols = ['+', 's', 'd', '*', 'x', '8', 'H', '1', 'p', '2', '3', - '4', '|', 4, 5, 6, 7] - names = rec_array.dtype.names - legend = [] - - # If there are more arrays than symbols just change colors of lines - if len(names) > len(plot_symbols): - for nm in names: - if nm != 'species' and nm != 'rank': - if nm == 'observed': - fig = plt.plot(np.arange(1, len(rec_array) + 1), - np.sort(rec_array[nm])[::-1], '-o', - color='black') - legend.append(nm) - else: - fig = plt.plot(np.arange(1, len(rec_array) + 1), - np.sort(rec_array[nm])[::-1], '-o') - legend.append(nm) - - # Else, use different symbols/markers for each line - elif len(names) <= len(plot_symbols): - - # Counter is 0 - cnt = 0 - for nm in names: - if nm != 'species' and nm != 'rank': - if nm == 'observed': - - fig = plt.plot(np.arange(1, len(rec_array) + 1), - np.sort(rec_array[nm])[::-1], '-o', - color='black') - legend.append(nm) - else: - fig = plt.plot(np.arange(1, len(rec_array) + 1), - np.sort(rec_array[nm])[::-1], '-' + - str(plot_symbols[cnt]), - markeredgecolor='none') - legend.append(nm) - cnt += 1 - # Include ticks only on bottom and left - fig[0].axes.xaxis.tick_bottom() - fig[0].axes.yaxis.tick_left() - - plt.legend(tuple(legend), loc='best') - -def make_directory(folder_name): - '''Makes a directory named folder_name. If the directory exists it - is overwritten - - folder_name - Name of the directory - ''' - - try: - os.mkdir(folder_name) - except OSError: - shutil.rmtree(folder_name) - os.mkdir(folder_name) - diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..a79a8df --- /dev/null +++ b/setup.py @@ -0,0 +1,37 @@ +from setuptools import setup, find_packages +from macroeco import __version__ + +setup( + name = 'macroeco', + version = __version__, + packages = find_packages(), + entry_points = {'console_scripts': ['mecodesktop=macroeco:mecodesktop',],}, + package_data = {'': ['*.txt', '*.csv']}, + + author = 'Justin Kitzes and Mark Wilber', + author_email = 'jkitzes@berkeley.edu', + description = 'Ecological pattern analysis in Python', + long_description = open('README.rst').read(), + license = 'BSD', + keywords = ('ecology biology environment conservation biodiversity ' + 'informatics data science'), + url = 'http://github.com/jkitzes/macroeco', + + classifiers = [ + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License",], + + install_requires = [ + 'numpy>=1.6', + 'scipy>=0.12', + 'pandas>=0.13', + 'matplotlib', + # 'shapely', # Do not force install if user doesn't have + 'configparser', + 'decorator', + 'twiggy'], +) + +# python setup.py sdist bdist_egg upload -r https://testpypi.python.org/pypi \ No newline at end of file diff --git a/test_compare.py b/test_compare.py deleted file mode 100644 index be09aaa..0000000 --- a/test_compare.py +++ /dev/null @@ -1,710 +0,0 @@ -#!/usr/bin/python - -#Testing Compare Module - -import unittest -from macroeco.compare import * -import numpy as np -import scipy.stats as stats -import copy -import macroeco.distributions as dist -import numpy.testing as nt - -class TestCompare(unittest.TestCase): - '''Test classes and methods in compare.py''' - - def setUp(self): - self.sad_data = [[1,1,1,1,1,2,3,4,5,6], [2,2,2,2,2,2,2,2,2,2]] - self.ssad_data= [[0,0,0,1,1,2,3,5,12], (0,1,1,1,2,6,12)] - - - def test_CompareSAD_init(self): - - # Test that not passing in patch object object works - sad_c = 
CompareSAD(self.sad_data, ['logser']) - - # Check that sad_data became self.observed_data - sums = np.array([sum(x) for x in sad_c.observed_data]) - test_sums = np.array([sum(x) for x in self.sad_data]) - self.assertTrue(np.all(sums == test_sums)) - - # Test that that other attributes were set correctly - self.assertTrue(sad_c.criteria == None) - self.assertTrue(sad_c.sad_spp_list == None) - - # Test that distribution object was fit - self.assertTrue(np.all(sad_c.dist_list[0].params['tot_obs'] == - test_sums)) - self.assertTrue(np.all(sad_c.dist_list[0].params['n_samp'] == - np.array([10,10]))) - - # Test if patch is true! - - # Replica of patch output - patch_true = [({'test' : 'criteria'}, np.array([1,1,1,2,3,5]), - np.array(['a', 'b', 'c', 'd', 'e', 'g'])), ({'test' : - 'criteria'}, np.array([1,1,1,2,5]), np.array(['a', 'b', - 'c', 'd', 'g']))] - sad_c = CompareSAD(patch_true, dist_list=['logser'], patch=True) - - # Test that the parsing happened correctly - self.assertTrue(len(sad_c.criteria) == 2) - self.assertTrue(len(sad_c.sad_spp_list) == 2) - self.assertTrue(len(sad_c.observed_data) == 2) - - # Check that parameter values were fit correctly - self.assertTrue(np.all(sad_c.dist_list[0].params['n_samp'] == - np.array([6,5]))) - self.assertTrue(np.all(sad_c.dist_list[0].params['tot_obs'] == - np.array([13, 10]))) - - # Check that the species lists were set correctly - self.assertTrue(np.all(sad_c.sad_spp_list[0] == - np.array(['a', 'b', 'c', 'd', 'e', 'g']))) - self.assertTrue(np.all(sad_c.sad_spp_list[1] == - np.array(['a', 'b', 'c', 'd', 'g']))) - - def test_CompareSSAD_init(self): - - # Test that SSAD parses correctly when patch is False - # Test that not passing in patch object object works - ssad_c = CompareSSAD(self.ssad_data, ['binm']) - - # Check that sad_data became self.observed_data - sums = np.array([sum(x) for x in ssad_c.observed_data]) - test_sums = np.array([sum(x) for x in self.ssad_data]) - self.assertTrue(np.all(sums == test_sums)) - - # Test that that other attributes were set correctly - self.assertTrue(ssad_c.criteria == None) - self.assertTrue(ssad_c.sad_spp_list == None) - - # Test that distribution object was fit - self.assertTrue(np.all(ssad_c.dist_list[0].params['tot_obs'] == - test_sums)) - self.assertTrue(np.all(ssad_c.dist_list[0].params['n_samp'] == - np.array([9,7]))) - - # Test that ssad parses correctly if patch=True - ssad_patch = (np.array([{}, {}, {}, {}, {}]), {'spp1' : - np.array([0,0,1,2,4]), 'spp2' : np.array([1,1,1,1,1])}) - - ssad_c = CompareSSAD(ssad_patch, dist_list = ['tgeo', 'binm'], - patch=True) - - spp_list = np.array(['spp1', 'spp2']) - self.assertTrue(np.all(spp_list == np.sort(ssad_c.sad_spp_list))) - - # Test that distribution object was fit - self.assertTrue(np.all(ssad_c.dist_list[0].params['tot_obs'] == - np.array([7, 5]))) - self.assertTrue(np.all(ssad_c.dist_list[0].params['n_samp'] == - np.array([5,5]))) - # Test that distribution object was fit - self.assertTrue(np.all(ssad_c.dist_list[1].params['tot_obs'] == - np.array([7,5]))) - self.assertTrue(np.all(ssad_c.dist_list[1].params['n_samp'] == - np.array([5,5]))) - - self.assertTrue(len(ssad_c.criteria) == 5) - - def test_CompareIED_init(self): - - # Test the CompareIED init parses correctly - ied_data = [(np.arange(10,100), np.arange(1,40)), (np.arange(1,20), - np.arange(40,60))] - ied_c = CompareIED(ied_data, dist_list=['psi']) - - # Check the first item in tuple became observed data - sums = np.array([sum(x) for x in ied_c.observed_data]) - test_sums = 
np.array([sum(np.arange(10,100)), sum(np.arange(1,20))]) - self.assertTrue(np.all(sums == test_sums)) - - self.assertTrue(ied_c.criteria == None) - self.assertTrue(ied_c.sad_spp_list == None) - - # Test that distribution object was fit including E parameter - self.assertTrue(np.all(ied_c.dist_list[0].params['tot_obs'] == - np.array([sum(np.arange(1,40)), sum(np.arange(40,60))]))) - self.assertTrue(np.all(ied_c.dist_list[0].params['n_samp'] == - np.array([39,20]))) - self.assertTrue(np.all(ied_c.dist_list[0].params['E'] == - np.array([sum(np.arange(10,100)),sum(np.arange(1,20))]))) - - # If patch is True, make sure the fit works - patch_sad = [({'test' : 'criteria'}, np.array([1,1,1,2,3,5]), - np.array(['a', 'b', 'c', 'd', 'e', 'g'])), ({'test' : - 'criteria'}, np.array([1,1,1,2,5]), np.array(['a', 'b', - 'c', 'd', 'g']))] - - patch_ied = [({}, np.arange(1,40), np.repeat('a', 39)), ({}, - np.arange(1,30), np.repeat('b', 29))] - - ied_c = CompareIED((patch_ied, patch_sad), dist_list=['nu'], patch=True) - - # Check ied_list and spp_list - sad_spp = [np.array(['a', 'b', 'c', 'd', 'e', 'g']), - np.array(['a', 'b', 'c', 'd', 'g'])] - bools = [np.all(a == b) for a,b in zip(np.array(ied_c.sad_spp_list), - np.array(sad_spp))] - self.assertTrue(np.all(bools)) - - ied_spp = [np.repeat('a',39), np.repeat('b',29)] - bools = [np.all(a == b) for a,b in zip(ied_spp, ied_c.ied_spp_lists)] - self.assertTrue(np.all(bools)) - - # check criteria is right length - self.assertTrue(len(ied_c.criteria) == 2) - - # Check that observed data is correct - bools = [np.all(a == b) for a,b in zip(ied_c.observed_data, - [np.arange(1,40), np.arange(1,30)])] - self.assertTrue(np.all(bools)) - - # Check the fit of distribution - self.assertTrue(np.all(ied_c.dist_list[0].params['tot_obs'] == - np.array([13, 10]))) - self.assertTrue(np.all(ied_c.dist_list[0].params['n_samp'] == - np.array([6,5]))) - self.assertTrue(np.all(ied_c.dist_list[0].params['E'] == - np.array([sum(np.arange(1,40)),sum(np.arange(1,30))]))) - - def test_CompareSED_init(self): - - # Test that all attributes are set correctly (sed, ied, sad) - sed_data = [(np.arange(1,20), np.arange(1,40), np.arange(5,25)), - (np.arange(1,30), np.arange(5,30), np.arange(4,64))] - - sed_c = CompareSED(sed_data, dist_list=['theta']) - - # Did other attributes set correctly? 
- self.assertTrue(sed_c.criteria == None) - self.assertTrue(sed_c.sad_spp_list == None) - - # Check if observed sed data set correctly - test_obs = [np.arange(1,20), np.arange(1,30)] - bools = [np.all(a == b) for a,b in zip(sed_c.observed_data, test_obs)] - self.assertTrue(np.all(bools)) - - # Check that distribution fit correctly - self.assertTrue(np.all(sed_c.dist_list[0].params['tot_obs'] == - np.array([sum(np.arange(5,25)), sum(np.arange(4,64))]))) - self.assertTrue(np.all(sed_c.dist_list[0].params['n_samp'] == - np.array([len(np.arange(5,25)), len(np.arange(4,64))]))) - self.assertTrue(np.all(sed_c.dist_list[0].params['n'] == - np.array([len(np.arange(1,20)), len(np.arange(1,30))]))) - self.assertTrue(np.all(sed_c.dist_list[0].params['E'] == - np.array([sum(np.arange(1,40)), sum(np.arange(5,30))]))) - - # Test if patch == True - patch_sed = [({}, {'a' : np.arange(1,10), 'b' : np.arange(1,20), 'c': - np.arange(1,30), 'd' : np.arange(1,40)}), ({}, - {'a' : np.arange(1,10), 'b' : np.arange(1,20), 'c': - np.arange(1,30), 'd' : np.arange(1,40)})] - - patch_sad = [({}, np.arange(1,50), np.repeat('d',20))] - patch_ied = [({}, np.arange(4,67), np.repeat('y', 60))] - - # An error should be raised if sed,ied, and sad don't have the same - # length - self.assertRaises(IndexError, CompareSED, (patch_sed, patch_ied, - patch_sad), dist_list=['theta'], patch=True) - - - patch_sad = [({}, np.arange(1,50), np.repeat('d',20)), - ({}, np.arange(1,50), np.repeat('d',20))] - patch_ied = [({}, np.arange(4,67), np.repeat('y', 60)), - ({}, np.arange(4,67), np.repeat('y', 60))] - - sed_c = CompareSED((patch_sed, patch_ied, patch_sad), - dist_list=['theta'], patch=True) - - # Check that observed data is set correctly - self.assertTrue(len(sed_c.observed_data) == 8) - test_obs = [np.arange(1,10), np.arange(1,20), np.arange(1,30), - np.arange(1,40)] - test_obs += test_obs - bools = [np.all(a == b) for a,b in zip(test_obs, sed_c.observed_data)] - self.assertTrue(np.all(bool)) - - # Check distributions fit correctly - nt.assert_array_equal(sed_c.dist_list[0].params['n'], np.array([9, - 19, 29, 39, 9, 19, 29, 39])) - nt.assert_array_equal(sed_c.dist_list[0].params['E'], - np.repeat(sum(np.arange(4,67)), 8)) - nt.assert_array_equal(sed_c.dist_list[0].params['tot_obs'], - np.repeat(sum(np.arange(1,50)), 8)) - nt.assert_array_equal(sed_c.dist_list[0].params['n_samp'], - np.repeat(len(np.arange(1,50)), 8)) - - # Check that the species list is correct - nt.assert_array_equal(np.array(['a', 'b', 'c', 'd', 'a', 'b', 'c', - 'd']), np.array(sed_c.sad_spp_list)) - - # Check that criteria is correct length - self.assertTrue(len(sed_c.criteria) == 8) - - def test_CompareASED_init(self): - - # Test that ased fits correctly - - ased_data = [(np.arange(1,10), np.arange(4,56), np.arange(1,20)), - (np.arange(1,34), np.arange(3,20), np.arange(1,56))] - - ased_c = CompareASED(ased_data, dist_list=['nu']) - - # Did other attributes set correctly? 
- self.assertTrue(ased_c.criteria == None) - self.assertTrue(ased_c.sad_spp_list == None) - - # Check if observed ased data set correctly - test_obs = [np.arange(1,10), np.arange(1,34)] - bools = [np.all(a == b) for a,b in zip(ased_c.observed_data, test_obs)] - self.assertTrue(np.all(bools)) - - # Check that distribution fit correctly - self.assertTrue(np.all(ased_c.dist_list[0].params['tot_obs'] == - np.array([sum(np.arange(1,20)), sum(np.arange(1,56))]))) - self.assertTrue(np.all(ased_c.dist_list[0].params['n_samp'] == - np.array([len(np.arange(1,20)), len(np.arange(1,56))]))) - self.assertTrue(np.all(ased_c.dist_list[0].params['E'] == - np.array([sum(np.arange(4,56)), sum(np.arange(3,20))]))) - - # Test if patch == True - patch_ased = [({}, np.arange(1,50), np.repeat('d',20)), - ({}, np.arange(1,50), np.repeat('e',20))] - patch_sad = [({}, np.arange(1,50), np.repeat('d',20)), - ({}, np.arange(1,50), np.repeat('e',20))] - patch_ied = [({}, np.arange(4,67), np.repeat('y', 60)), - ({}, np.arange(4,67), np.repeat('y', 60))] - - ased_c = CompareASED((patch_ased, patch_ied, patch_sad), - dist_list=['nu'], patch=True) - - # Test that species list is correct - test_spp = [np.repeat('d', 20), np.repeat('e', 20)] - nt.assert_array_equal(test_spp, ased_c.sad_spp_list) - - # Test that observed data is correct - nt.assert_array_equal(ased_c.observed_data, [np.arange(1,50), - np.arange(1,50)]) - - # Test that fit distribution is correct - nt.assert_array_equal(ased_c.dist_list[0].params['tot_obs'], - np.array([1225, 1225])) - nt.assert_array_equal(ased_c.dist_list[0].params['n_samp'], - np.array([49, 49])) - nt.assert_array_equal(ased_c.dist_list[0].params['E'], - np.array([sum(np.arange(4,67)), - sum(np.arange(4,67))])) - - def test_CompareSAR(self): - - # Test if patch == False - area_list = [(np.arange(1,10), np.arange(9,18)), (np.arange(1,10), - np.arange(9,18))] - - full_sad = [np.arange(1,40), np.arange(1,60)] - - sar_c = CompareSAR(area_list, ['mete_sar_iter', 'logser-binm'], - full_sad) - - # Max area should be 1 - nt.assert_array_equal(np.array([1,1]), np.array([np.max(a) for a in - sar_c.a_list])) - - sar_c = CompareSAR(area_list, ['mete_sar_iter', 'logser-binm'], - full_sad, max_a=False) - - # Max area should be 9 - nt.assert_array_equal(np.array([9,9]), np.array([np.max(a) for a in - sar_c.a_list])) - - # Check species numbers - bools = [np.all(a == b) for a,b in zip(sar_c.sar_list, - [np.arange(9,18), np.arange(9,18)])] - self.assertTrue(np.all(bools)) - - # Test if patch == True - - rec_sar = np.array(zip(np.arange(1,8), np.arange(4,11)), - dtype=[('items', np.float), ('area', np.float)]) - - sar_c = CompareSAR([(rec_sar, [])], ['mete_sar_iter'], - [np.arange(1,50)], max_a=False, patch=True) - - # check species numbers - nt.assert_array_equal(np.arange(1,8), sar_c.sar_list[0]) - - # Check area numbers - nt.assert_array_equal(np.arange(4,11), sar_c.a_list[0]) - - # check that error is thrown if curve is bad - self.assertRaises(NameError, CompareSAR, [(rec_sar, [])], ['logser_binm'], - [np.arange(1,50)], max_a=False, patch=True) - - # Test compare_curves method - sar_c = CompareSAR([(rec_sar, [])], ['logser-binm'], - [np.arange(1,50)], patch=True) - - # Test with iter_val=False and use_rad=False and all combos - sar_c.compare_curves() - sar_c.compare_curves(use_rad=True) - sar_c.compare_curves(iter_vals=True, use_rad=False) - sar_c.compare_curves(iter_vals=True, use_rad=True) - - def test_compare_mse(self): - - sad_c = CompareSAD(self.sad_data, ['logser', 'lognorm']) - - # Test that mse 
output has the appropriate formatted data - mse = sad_c.compare_mse(mse_base='cdf') - self.assertTrue(len(mse) == 2) - self.assertTrue(len(mse['lognorm']) == 2 and len(mse['logser']) == 2) - - # Test the same thing for a rad base - mse = sad_c.compare_mse(mse_base='rad') - self.assertTrue(len(mse) == 2) - self.assertTrue(len(mse['lognorm']) == 2 and len(mse['logser']) == 2) - - # Test that if the distribution has no cdf, MSE is set to NaN - sad_c = CompareSAD(self.sad_data, ['logser', 'sugihara']) - mse = sad_c.compare_mse(mse_base='cdf') - self.assertTrue(np.all(np.isnan(mse['sugihara']))) - - # Test that it works if base = 'rad' - sad_c = CompareSAD(self.sad_data, ['logser', 'sugihara']) - mse = sad_c.compare_mse(mse_base='rad') - self.assertTrue(type(mse['sugihara'][0] == np.float)) - - # Test that compare mse works with ssads - ssad_c = CompareSSAD(self.ssad_data, ['binm', 'tgeo']) - # Test that mse output has the appropriate formatted data - mse = ssad_c.compare_mse(mse_base='cdf') - self.assertTrue(len(mse) == 2) - self.assertTrue(len(mse['binm']) == 2 and len(mse['tgeo']) == 2) - - # Test the same thing for a rad base - mse = ssad_c.compare_mse(mse_base='rad') - self.assertTrue(len(mse) == 2) - self.assertTrue(len(mse['binm']) == 2 and len(mse['tgeo']) == 2) - - def test_compare_rad_cdf(self): - - sad_c = CompareSAD(self.sad_data, ['logser']) - - tdist_list = copy.copy(sad_c.dist_list) - sad_c.dist_list = [] - - # Check that rad, cdf work with empty dist list - rads = sad_c.compare_rads() - cdfs = sad_c.compare_cdfs() - self.assertTrue(len(rads) == 1 and len(cdfs) == 1) - self.assertTrue('observed' in rads and 'observed' in cdfs) - self.assertTrue(rads == sad_c.rads) - self.assertTrue(cdfs == sad_c.cdfs) - - # Check that rad, cdf work with something in dist_list - sad_c.dist_list = tdist_list - sad_c.rads = None - sad_c.cdfs = None - rads = sad_c.compare_rads() - cdfs = sad_c.compare_cdfs() - self.assertTrue(len(rads) == 2 and len(cdfs) == 2) - self.assertTrue('observed' in rads and 'logser' in rads) - self.assertTrue('observed' in cdfs and 'logser' in cdfs) - self.assertTrue(rads == sad_c.rads) - self.assertTrue(cdfs == sad_c.cdfs) - - # Check that if dist doesn't have a cdf, empty arrays are returned - sad_c = CompareSAD(self.sad_data, ['logser', 'sugihara']) - cdfs = sad_c.compare_cdfs() - self.assertTrue(len(cdfs['sugihara']) == 2) - self.assertTrue(len(cdfs['sugihara'][0]) == 0 and - len(cdfs['sugihara'][1]) == 0) - - # Check that observed rads are in the right order - true_vals = np.array([np.all(x == np.array(y)) for x,y in - zip(rads['observed'], self.sad_data)]) - - self.assertTrue(np.all(true_vals)) - - # Testing that SED object returns a species list in compare_rads - patch_sed = [({}, {'a' : np.arange(1,10), 'b' : np.arange(1,20), 'c': - np.arange(1,30), 'd' : np.arange(1,40)}), ({}, - {'a' : np.arange(1,10), 'b' : np.arange(1,20), 'c': - np.arange(1,30), 'd' : np.arange(1,40)})] - - patch_sad = [({}, np.arange(1,50), np.repeat('d',20)), - ({}, np.arange(1,50), np.repeat('d',20))] - patch_ied = [({}, np.arange(4,67), np.repeat('y', 60)), - ({}, np.arange(4,67), np.repeat('y', 60))] - - sed_c = CompareSED((patch_sed, patch_ied, patch_sad), - dist_list=['theta'], patch=True) - - # Both returns should have a species list - cdfs = sed_c.compare_rads(return_spp=True) - rads = sed_c.compare_cdfs(return_spp=True) - nt.assert_array_equal(np.array(['a', 'b', 'c', 'd', 'a', 'b', 'c', - 'd']), np.array(cdfs[1])) - nt.assert_array_equal(np.array(['a', 'b', 'c', 'd', 'a', 'b', 'c', -
'd']), np.array(cdfs[1])) - nt.assert_array_equal(np.array(['a', 'b', 'c', 'd', 'a', 'b', 'c', - 'd']), np.array(rads[1])) - - - def test_compare_aic(self): - - - # Add another distribution and check the order of the AIC output - sad_c = CompareSAD(self.sad_data, ['logser', 'most_even', 'nbd_lt']) - - aic_out = sad_c.compare_aic(crt=True) - # Most even should have the lowest AIC value for the second dataset - self.assertTrue(aic_out[1][1] == np.min(aic_out[1])) - - aic_m = sad_c.compare_aic_measures(crt=True) - - # Most even should have a zero delta AIC for the second dataset - self.assertTrue(aic_m[1][1][1] == np.min(aic_m[1][1])) - - # Most even should have the highest weight for the second dataset - self.assertTrue(aic_m[0][1][1] == np.max(aic_m[0][1])) - - # If I don't have any distributions I should get three empty lists for - # compare_aic_measures - sad_c = CompareSAD(self.sad_data, []) - aic_m = sad_c.compare_aic_measures(crt=True) - self.assertTrue(aic_m == ([],[],[])) - - # If the distribution that is passed doesn't have a pmf or pdf, check inf - # aic values are returned - sad_c = CompareSAD(self.sad_data, ['logser', 'sugihara']) - aic_m = sad_c.compare_aic_measures() - self.assertTrue(aic_m[2][0][1] == np.inf and aic_m[2][1][1] == np.inf) - - def test_compare_LRT(self): - - # Testing compare LRT with logser null model - sad_c = CompareSAD(self.sad_data, ['nbd_lt']) - - # Is output properly formatted? - lrt_out = sad_c.compare_LRT(dist.logser()) - self.assertTrue(len(lrt_out) == 1 and 'logser, nbd_lt' in lrt_out) - - def test_compare_rarity(self): - - # Test compare_rarity - - sad_c = CompareSAD(self.sad_data, ['logser', 'most_even', 'nbd_lt']) - rare = sad_c.compare_rarity(1) - - # Observed should have 5 - self.assertTrue(rare['observed'][1][0] == 5) - - # Most even should have 10 species <= 2 - rare = sad_c.compare_rarity((1,2)) - self.assertTrue(rare['observed'][1][0] == 5) - self.assertTrue(rare['most_even'][2][1] == 10) - - def test_compare_moments(self): - - # Test the compare_moments output is formatted correctly - sad_c = CompareSAD(self.sad_data, ['logser', 'nbd_lt']) - mom = sad_c.compare_moments() - self.assertTrue(len(mom) == 3) - - # Test that observed and all distributions are considered - lengths = np.array([len(mom[x]) for x in mom.iterkeys()]) - - self.assertTrue(np.array_equal(lengths, np.repeat(3, 3))) - - def test_summary(self): - - # Test that summary output is correct - # Test if there are no dists in dist_list - sad_c = CompareSAD(self.sad_data, []) - sumry = sad_c.summary() - # Test that there is only observed in summary dict - self.assertTrue(len(sumry) == 1 and 'observed' in sumry) - - # Test if we have two distributions but one doesn't have a cdf - sad_c = CompareSAD(self.sad_data, ['logser', 'sugihara']) - smry = sad_c.summary() - self.assertTrue(len(smry) == 3) - - # Logseries dict and sugihara dict should have 9 kw - self.assertTrue(len(smry['logser']) == 9 and len(smry['sugihara']) == - 9) - - # AIC values for sugihara should be inf - self.assertTrue(np.all(smry['sugihara']['aic'] == np.array([np.inf, - np.inf]))) - # IED should be able to call summary - ied_data = [(np.arange(10,100), np.arange(1,40)), (np.arange(1,20), - np.arange(40,60))] - ied_c = CompareIED(ied_data, dist_list=['psi']) - smry = ied_c.summary() - self.assertTrue(smry['observed']['balls'] == [4905, 190]) - - def test_nll(self): - - # Test against R result: sum(dnorm(c(1,2,3,4,5), log=TRUE)) - R_res = 32.09469 - test_vals = stats.norm.pdf((1,2,3,4,5)) - lglk = nll([test_vals])[0]
- self.assertTrue(R_res == np.round(lglk, decimals=5)) - - def test_empirical_cdf(self): - - #Test against R's ecdf function - test_data = [1,1,1,1,2,3,4,5,6,6] - R_res = [.4,.4,.4,.4,.5,.6,.7,.8,1,1] - res = empirical_cdf(test_data) - self.assertTrue(np.array_equal(R_res, res)) - - test_data = [3,3,3,3] - R_res = [1,1,1,1] - res = empirical_cdf(test_data) - self.assertTrue(np.array_equal(R_res, res)) - - def test_aic(self): - - # Test that passing either a pmf of nll gives the same result - test_vals = stats.norm.pdf((1,2,3,4,5,6,7,8)) - aic1 = aic([test_vals], 2, loglik=False) - aic2 = aic(nll([test_vals]), 2, loglik=True) - - self.assertTrue(aic1[0] == aic2[0]) - # Expected AIC for test_vals - expected = 6.837877066 # Calculated by hand - self.assertTrue(np.round(aic1[0], decimals=9), expected) - - test_vals = stats.gamma.pdf((1,1,1,4,5,7,12),2) - aic1 = aic([test_vals], 2, loglik=False) - expected = 51.146902 - self.assertTrue(np.round(aic1[0], decimals=6), expected) - - def test_aicc(self): - - # Test that passing either a pmf of nll gives the same result - test_vals = stats.norm.pdf((1,2,3,4,5,6,7,8)) - aic1 = aicc([test_vals], 2, loglik=False) - aic2 = aicc(nll([test_vals]), 2, 8, loglik=True) - - self.assertTrue(aic1[0] == aic2[0]) - - # Test that aicc gives the correct values - expected = 225.10302 - self.assertTrue(expected == np.round(aic1[0], decimals=5)) - - # Test Assertion error is thrown if no n param - self.assertRaises(AssertionError, aicc, 56, 2) - - - def test_aic_weights(self): - - vals = [1,1,1,2,3,4,7,23,78] - aic_vals = aicc([stats.norm.pdf(vals, scale=100), stats.norm.pdf(vals, - scale=99)], - [2,2],loglik=False) - aicw, delta_aic = aic_weights(aic_vals) - pred = np.array([ 0.47909787, 0.52090213]) - self.assertTrue(np.array_equal(np.round(aicw, decimals=8), pred)) - - - def test_ks_two_sample(self): - # Unittested in scipy, testing that this function works - - d, p = ks_two_sample([1,1,2,3,4,5,6,12], [1,2,3,4,5,5,5,5,5,7,8,9]) - - def test_likelihood_ratio(self): - - # Test against what the lrtest() R function returns - model1 = 158.0494 - model0 = 139.806 - R_chisquare = 36.4868 - R_p = 1.537e-09 - - pred_chi, pred_p = likelihood_ratio(model0, model1, 1)[0] - - self.assertTrue(np.round(pred_chi, decimals=4) == R_chisquare) - pred_p = np.round(pred_p, decimals=12) - self.assertTrue(pred_p == R_p) - - - def test_variance(self): - - # Test that I get back the correct values - data = [[0,1,2,3,4,45,18,56,24,56], [1,1,1,1,56,78,23,23]] - expt = [] - expt.append(np.var(data[0], ddof=1)) - expt.append(np.var(data[1], ddof=1)) - resulting_vals = variance(data) - self.assertTrue(np.array_equal(np.array(expt), - np.array(resulting_vals))) - # Using np.var which is optimized and unittested - - def test_skew(self): - - # Using the scipy.stats definition which is optimized and unittested - data = [[0,1,2,3,4,45,18,56,24,56], [1,1,1,1,56,78,23,23]] - expt = [] - expt.append(stats.skew(data[0])) - expt.append(stats.skew(data[1])) - resulting_vals = skew(data) - self.assertTrue(np.array_equal(np.array(expt), - np.array(resulting_vals))) - - def test_kurtosis(self): - - # Using the scipy.stats definition which is optimized and unittested - data = [[0,1,2,3,4,45,18,56,24,56], [1,1,1,1,56,78,23,23]] - expt = [] - expt.append(stats.kurtosis(data[0])) - expt.append(stats.kurtosis(data[1])) - resulting_vals = kurtosis(data) - self.assertTrue(np.array_equal(np.array(expt), - np.array(resulting_vals))) - - def test_mean_square_error(self): - - # Test against R mse function - pred = 
np.arange(1,9) - obs = np.arange(7, 15) - - comp_val = 36 - pred = mean_squared_error(pred, obs) - self.assertEqual(pred, comp_val) - - def test_bootstrap_moment(self): - - data1 = np.arange(1, 31) - data2 = np.arange(20, 50) - # Test the return is empty if wrong keyword is given - bs_vals = bootstrap_moment(data1, data2, ['men', 'vaiance', - 'sew', 'kurtoss'], num_samp=100) - - self.assertTrue(len(bs_vals) == 0) - - # Test bootstrap moment against William Rice's (UCSB) bootstrap - # programs in Statistics 101. Just testing the mean, but the - # implementation is the same for all of them - test_ci = np.array([-23.4, -14.6]) - - bs_vals = bootstrap_moment(data1, data2, ['mean', 'variance', - 'skew', 'kurtosis'], num_samp=50000) - - # Check that Bill Rice's and our 95% CIs match - self.assertTrue(np.array_equal(test_ci, np.round(bs_vals['mean'][1], - decimals=1))) - - # Check that the deltas match - self.assertTrue(-19 == bs_vals["mean"][0]) - - # Check that the length is right - self.assertTrue(len(bs_vals) == 4) - -if __name__ == '__main__': - unittest.main() diff --git a/test_data.py b/test_data.py deleted file mode 100644 index 1c2c0ab..0000000 --- a/test_data.py +++ /dev/null @@ -1,129 +0,0 @@ -''' -Unit tests for data.py -''' - -import unittest -import os -import numpy as np -from matplotlib.mlab import csv2rec -from macroeco.data import DataTable, Metadata - -class TestDataTable(unittest.TestCase): - - def setUp(self): - '''Write test xytable csv file.''' - - self.xyfile1 = open('xyfile1.csv','w') - self.xyfile1.write('''spp_code, x, y, count - 0, 0, 0, 1 - 0, 0, 0, 2 - 0, 0, 1, 1 - 1, 0, 0, 1 - 1, 1, 0, 2''') - self.xyfile1.close() - self.xyarr1 = csv2rec('xyfile1.csv') - - def tearDown(self): - os.remove('xyfile1.csv') - - def test_error_if_file_type_not_csv(self): - self.assertRaises(TypeError, DataTable, 'file.txt') - - def test_meta_None_if_no_meta_file(self): - xy1 = DataTable('xyfile1.csv') - self.assertEqual(xy1.meta, None) - - def test_table_is_correct(self): - xy1 = DataTable('xyfile1.csv') - np.testing.assert_array_equal(xy1.table, self.xyarr1) - - def test_get_subtable(self): - xy1 = DataTable('xyfile1.csv') - xy1.meta = {('x', 'maximum'): 1, - ('x', 'minimum'): 0, - ('x', 'precision'): 1, - ('y', 'maximum'): 1, - ('y', 'minimum'): 0, - ('y', 'precision'): 1} - - # Whole table - sub = xy1.get_subtable({}) - np.testing.assert_array_equal(sub, self.xyarr1) - - sub = xy1.get_subtable({'x': [('>=', 0),('<', 2)], 'y': [('>=', 0), - ('<', 2)]}) - np.testing.assert_array_equal(sub, self.xyarr1) - - # Subset - sub = xy1.get_subtable({'spp_code': ('==', 0)}) - np.testing.assert_array_equal(sub, self.xyarr1[0:3]) - - sub = xy1.get_subtable({'spp_code': ('==', 0), 'x': ('>', 0)}) - np.testing.assert_array_equal(sub, self.xyarr1[2]) - -class TestMetadata(unittest.TestCase): - - def setUp(self): - '''Write test data and metadata file.''' - - self.xyfile1 = open('xyfile1.csv','w') - self.xyfile1.write('''x, y - 0, 0 - 0, 0 - 0, 0 - 1, 0 - 1, 1''') - self.xyfile1.close() - - self.xymeta = open('xyfile1.xml','w') - self.xymeta.write(''' - - -Unittest XML - -NA --79.5915 --79.5915 -8.975 -10 - - - - -y -cell -x0.0 -99.90.1 -''') - self.xymeta.close() - - def tearDown(self): - os.remove('xyfile1.csv') - os.remove('xyfile1.xml') - - def test_metadata_correct_read(self): - # Should read values correctly from sample file, including None for - # attributes that do not exist and elements that do not exist. 
- xy1 = DataTable('xyfile1.csv') - self.assertEqual(len(xy1.meta), 8) - self.assertEqual(xy1.meta, {('x', 'maximum'): 99.9, - ('x', 'minimum'): 0.0, - ('x', 'precision'): 0.1, - ('x', 'type'): 'interval', - ('y', 'maximum'): None, - ('y', 'minimum'): None, - ('y', 'precision'): None, - ('y', 'type'): 'ordinal'}) - - def test_physical_coverage(self): - meta = Metadata('xyfile1.csv', []) - edges = meta.get_physical_coverage() - self.assertEqual(edges, [8.975, -79.5915, 10, -79.5915]) - - def test_title(self): - meta = Metadata('xyfile1.csv', []) - self.assertEqual(meta.get_title(), 'Unittest XML') diff --git a/test_empirical.py b/test_empirical.py deleted file mode 100644 index 281cda8..0000000 --- a/test_empirical.py +++ /dev/null @@ -1,541 +0,0 @@ -''' -Unit tests for empirical.py -''' - -from __future__ import division -import unittest -import os -gcwd = os.getcwd -pd = os.path.dirname -jp = os.path.join -from empirical import * -import numpy as np - - -class TestPatch(unittest.TestCase): - - def setUp(self): - self.xyfile5 = open('xyfile5.csv','w') - self.xyfile5.write('''spp_code, x, y, count -grt, .1, .1, 2 -grt, .1, .2, 1 -grt, .1, .3, 1 -rty, .1, .2, 1 -rty, .2, .3, 1''') - self.xyfile5.close() - self.xymeta5 = {('x', 'maximum'): .2, ('x', 'minimum'): .1, ('x', - 'precision'): .1, ('x', 'type'): 'interval', ('y', 'maximum'): .3, - ('y', 'minimum'): .1, ('y', 'precision'): .1, ('y', 'type'): 'interval', - ('spp_code', 'maximum'): None, ('spp_code', 'minimum'): None, - ('spp_code', 'precision'): None, ('spp_code', 'type'): 'ordinal', - ('count', 'maximum'): None, ('count', 'minimum'): None, ('count', - 'precision'): None, ('count', 'type'): 'ratio'} - - self.pat1 = Patch('xyfile5.csv') - # Line below sets metadata manually-no metadata file loaded - self.pat1.data_table.meta = self.xymeta5 - - self.xyfile6 = open('xyfile6.csv', 'w') - self.xyfile6.write('''spp_code, x, y, count -a, 0, 0, 1 -b, 0, 0, 1 -c, 0, 0, 0 -d, 0, 0, 3 -a, 0, 1, 0 -b, 0, 1, 4 -c, 0, 1, 0 -d, 0, 1, 1 -a, 1, 0, 1 -b, 1, 0, 0 -c, 1, 0, 3 -d, 1, 0, 1 -a, 1, 1, 0 -b, 1, 1, 1 -c, 1, 1, 3 -d, 1, 1, 1''') - self.xyfile6.close() - self.xymeta6 = {('x', 'maximum'): 1, ('x', 'minimum'): 0, ('x', - 'precision'): 1, ('x', 'type'): 'interval', ('y', 'maximum'): 1, - ('y', 'minimum'): 0, ('y', 'precision'): 1, ('y', 'type'): 'interval', - ('spp_code', 'maximum'): None, ('spp_code', 'minimum'): None, - ('spp_code', 'precision'): None, ('spp_code', 'type'): 'ordinal', - ('count', 'maximum'): None, ('count', 'minimum'): None, ('count', - 'precision'): None, ('count', 'type'): 'ratio'} - self.pat2 = Patch('xyfile6.csv') - self.pat2.data_table.meta = self.xymeta6 - - self.xyfile7 = open('xyfile7.csv', 'w') - self.xyfile7.write('''spp_code, x, y, count -tery, 1, 1, 1 -1, 1, 1, 1 -2, 1, 1, 0 -3, 1, 1, 3 -0, 1, 2, 0 -1, 1, 2, 4 -2, 1, 2, 0 -tery, 1, 2, 1 -0, 2, 1, 1 -1, 2, 1, 0 -2, 2, 1, 3 -3, 2, 1, 1 -tery, 2, 2, 0 -1, 2, 2, 1 -2, 2, 2, 3 -3, 2, 2, 1''') - self.xyfile7.close() - self.xymeta7 = {('x', 'maximum'): 2, ('x', 'minimum'): 1, ('x', - 'precision'): 1, ('x', 'type'): 'interval', ('y', 'maximum'): 2, - ('y', 'minimum'): 1, ('y', 'precision'): 1, ('y', 'type'): 'interval', - ('spp_code', 'maximum'): None, ('spp_code', 'minimum'): None, - ('spp_code', 'precision'): None, ('spp_code', 'type'): 'ordinal', - ('count', 'maximum'): None, ('count', 'minimum'): None, ('count', - 'precision'): None, ('count', 'type'): 'ratio'} - self.pat3 = Patch('xyfile7.csv') - self.pat3.data_table.meta = self.xymeta7 - - self.xyfile8 = 
open('xyfile8.csv', 'w') - self.xyfile8.write('''spp_code, x, y, count -0, 0, 0, 1 -1, 0, 0, 1 -2, 0, 0, 0 -3, 0, 0, 3 -0, 0, 1, 0 -1, 0, 1, 4 -2, 0, 1, 0 -3, 0, 1, 1 -0, 1, 0, 1 -1, 1, 0, 0 -2, 1, 0, 3 -3, 1, 0, 1 -0, 1, 1, 0 -1, 1, 1, 1 -2, 1, 1, 3 -3, 1, 1, 1 -0, 2, 0, 0 -1, 2, 0, 0 -2, 2, 0, 2 -3, 2, 0, 4 -0, 2, 1, 0 -1, 2, 1, 0 -2, 2, 1, 0 -3, 2, 1, 1''') - self.xyfile8.close() - self.xymeta8 = {('x', 'maximum'): 2, ('x', 'minimum'): 0, ('x', - 'precision'): 1, ('x', 'type'): 'interval', ('y', 'maximum'): 1, - ('y', 'minimum'): 0, ('y', 'precision'): 1, ('y', 'type'): 'interval', - ('spp_code', 'maximum'): None, ('spp_code', 'minimum'): None, - ('spp_code', 'precision'): None, ('spp_code', 'type'): 'ordinal', - ('count', 'maximum'): None, ('count', 'minimum'): None, ('count', - 'precision'): None, ('count', 'type'): 'ratio'} - self.pat4 = Patch('xyfile8.csv') - self.pat4.data_table.meta = self.xymeta8 - self.xyfile9 = open('xyfile9.csv','w') - self.xyfile9.write('''spp_code, x, y, count, energy, mass -grt, .1, .1, 2, 1, 34 -grt, .1, .2, 1, 2, 12 -grt, .1, .3, 1, 3, 23 -rty, .1, .2, 1, 4, 45 -rty, .2, .3, 1, 5, 110''') - self.xyfile9.close() - self.xymeta9 = {('x', 'maximum'): .2, ('x', 'minimum'): .1, ('x', - 'precision'): .1, ('x', 'type'): 'interval', ('y', 'maximum'): .3, - ('y', 'minimum'): .1, ('y', 'precision'): .1, ('y', 'type'): 'interval', - ('spp_code', 'maximum'): None, ('spp_code', 'minimum'): None, - ('spp_code', 'precision'): None, ('spp_code', 'type'): 'ordinal', - ('count', 'maximum'): None, ('count', 'minimum'): None, ('count', - 'precision'): None, ('count', 'type'): 'ratio'} - - self.pat5 = Patch('xyfile9.csv') - self.pat5.data_table.meta = self.xymeta9 - self.xyfile10 = open('xyfile10.csv', 'w') - self.xyfile10.write('''spp_code, x, y, count -a, 0, 0, 1 -b, 0, 0, 1 -d, 0, 0, 3 -b, 0, 1, 4 -d, 0, 1, 1 -a, 1, 0, 1 -c, 1, 0, 3 -d, 1, 0, 1 -b, 1, 1, 1 -c, 1, 1, 3 -d, 1, 1, 1''') - self.xyfile10.close() - self.xymeta10 = {('x', 'maximum'): 1, ('x', 'minimum'): 0, ('x', - 'precision'): 1, ('x', 'type'): 'interval', ('y', 'maximum'): 1, - ('y', 'minimum'): 0, ('y', 'precision'): 1, ('y', 'type'): 'interval', - ('spp_code', 'maximum'): None, ('spp_code', 'minimum'): None, - ('spp_code', 'precision'): None, ('spp_code', 'type'): 'ordinal', - ('count', 'maximum'): None, ('count', 'minimum'): None, ('count', - 'precision'): None, ('count', 'type'): 'ratio'} - self.pat6 = Patch('xyfile10.csv') - self.pat6.data_table.meta = self.xymeta10 - self.xyfile11 = open('xyfile11.csv', 'w') - self.xyfile11.write('''spp_code, x, y, count, reptile -a, 0, 0, 1, lizard -b, 0, 0, 1, lizard -d, 0, 0, 3, snake -b, 0, 1, 4, lizard -d, 0, 1, 1, turtle -a, 1, 0, 1, snake -c, 1, 0, 3, lizard -d, 1, 0, 1, snake -b, 1, 1, 1, tuatara -c, 1, 1, 3, turtle -d, 1, 1, 1, snake''') - self.xyfile11.close() - self.xymeta11 = {('x', 'maximum'): 1, ('x', 'minimum'): 0, ('x', - 'precision'): 1, ('x', 'type'): 'interval', ('y', 'maximum'): 1, - ('y', 'minimum'): 0, ('y', 'precision'): 1, ('y', 'type'): 'interval', - ('spp_code', 'maximum'): None, ('spp_code', 'minimum'): None, - ('spp_code', 'precision'): None, ('spp_code', 'type'): 'ordinal', - ('count', 'maximum'): None, ('count', 'minimum'): None, ('count', - 'precision'): None, ('count', 'type'): 'ratio', ('reptile', 'maximum') - : None, ('reptile', 'minimum') : None, ('reptile', 'precision'):None, - ('reptile', 'type') : 'ordinal'} - self.pat7 = Patch('xyfile11.csv') - self.pat7.data_table.meta = self.xymeta11 - - self.xyfile12 = open('xyfile12.csv', 'w') - 
self.xyfile12.write('''spp_code, x, y, count -3, 0, 0, 3 -3, 0, 1, 1 -2, 0, 2, 3 -1, 0, 3, 8 -3, 1, 0, 1 -3, 1, 1, 1 -0, 1, 2, 5 -3, 1, 3, 1 -2, 2, 0, 1 -1, 2, 1, 3 -1, 2, 2, 6 -0, 2, 3, 1 -1, 3, 0, 9 -2, 3, 1, 1 -0, 3, 2, 3 -3, 3, 3, 1''') - self.xyfile12.close() - self.xymeta12 = {('x', 'maximum'): 3, ('x', 'minimum'): 0, ('x', - 'precision'): 1, ('x', 'type'): 'interval', ('y', 'maximum'): 3, - ('y', 'minimum'): 0, ('y', 'precision'): 1, ('y', 'type'): 'interval', - ('spp_code', 'maximum'): None, ('spp_code', 'minimum'): None, - ('spp_code', 'precision'): None, ('spp_code', 'type'): 'ordinal', - ('count', 'maximum'): None, ('count', 'minimum'): None, ('count', - 'precision'): None, ('count', 'type'): 'ratio'} - self.pat8 = Patch('xyfile12.csv') - self.pat8.data_table.meta = self.xymeta12 - - # Data file with three count colums, unique row for each species - self.xyfile13 = open('xyfile13.csv', 'w') - self.xyfile13.write('''spp_code, order, plot1, plot2, plot3 -a, pred, 0, 0, 0 -b, pred, 0, 0, 1 -c, pred, 0, 1, 0 -d, pred, 0, 2, 3 -e, scav, 0, 1, 0 -f, scav, 0, 1, 4''') - self.xyfile13.close() - self.xymeta13 = {('spp_code', 'maximum'): None, - ('spp_code', 'minimum'): None, - ('spp_code', 'precision'): None, - ('spp_code', 'type'): 'ordinal', - ('order', 'maximum'): None, - ('order', 'minimum'): None, - ('order', 'precision'): None, - ('order', 'type'): 'ordinal', - ('plot1', 'maximum'): None, - ('plot1', 'minimum'): None, - ('plot1', 'precision'): None, - ('plot1', 'type'): 'ratio', - ('plot2', 'maximum'): None, - ('plot2', 'minimum'): None, - ('plot2', 'precision'): None, - ('plot2', 'type'): 'ratio', - ('plot3', 'maximum'): None, - ('plot3', 'minimum'): None, - ('plot3', 'precision'): None, - ('plot3', 'type'): 'ratio'} - self.pat9 = Patch('xyfile13.csv') - self.pat9.data_table.meta = self.xymeta13 - - - - - def tearDown(self): - os.remove('xyfile5.csv') - os.remove('xyfile6.csv') - os.remove('xyfile7.csv') - os.remove('xyfile8.csv') - os.remove('xyfile9.csv') - os.remove('xyfile10.csv') - os.remove('xyfile11.csv') - os.remove('xyfile12.csv') - os.remove('xyfile13.csv') - - # - # init and set_attributes - # - - def test_patch_init(self): - - # Test entire table is loaded - self.assertTrue(len(self.pat1.data_table.table) == 5) - self.assertTrue(len(self.pat2.data_table.table) == 16) - - # Test that subsetting works - pat = Patch('xyfile6.csv', {'spp_code': [('!=','a'), ('!=', 'b'), - ('!=','c')]}) - self.assertTrue(np.all(pat.data_table.table['spp_code'] == 'd')) - pat = Patch('xyfile7.csv', {'spp_code': ('==', "tery")}) - self.assertTrue(sum(pat.data_table.table['count']) == 2) - - # Testing that metadata was set correctly - self.assertTrue(self.pat1.data_table.meta[('x', 'maximum')] == .2) - - def test_sad(self): - - # Test correct result with 'whole' and one division - sad = self.pat1.sad({'spp_code': 'species', 'count': 'count', - 'x': 1}) - self.assertTrue(np.array_equal(sad[0][1], np.array([4,2]))) - sad = self.pat1.sad({'spp_code': 'species', 'count': 'count', - 'x': 'whole'}) - self.assertTrue(np.array_equal(sad[0][1], np.array([4,2]))) - sad = self.pat4.sad({'spp_code': 'species', 'count' :'count', 'x': 1}) - self.assertTrue(np.array_equal(sad[0][2], np.array([0,1,2,3]))) - - # Test correct result with other divisions - sad = self.pat4.sad({'spp_code': 'species', 'count': 'count', 'x': 3, - 'y': 2}) - self.assertTrue(np.array_equal(sad[-1][1], np.array([0,0,0,1]))) - - # Test that 'whole' and ignore give the same result - sad1 = self.pat4.sad({'spp_code': 'species', 'count': 
'count'}) - sad2 = self.pat4.sad({'spp_code': 'species', 'count': 'count', 'x' : - 'whole'}) - self.assertTrue(np.array_equal(sad1[0][1], sad2[0][1])) - - # Test that 'split' keyword returns the correct results - sad = self.pat5.sad({'spp_code' :'species', 'energy':'split', 'count' - : 'count'}) - self.assertTrue(len(sad) == 5) - self.assertTrue(np.array_equal(sad[0][1], np.array([2,0]))) - - # Test split and clean on numeric column - sad = self.pat5.sad({'spp_code' :'species', 'energy':'split', 'count' - : 'count'}, clean=True) - self.assertTrue(len(sad) == 5) - self.assertTrue(np.array_equal(sad[0][1], np.array([2]))) - - # Test that cleaning sad and split works on string - sad = self.pat7.sad({'spp_code' : 'species', 'count' : 'count', - 'reptile' : 'split'}, clean=True) - self.assertTrue(len(sad) == 4) - self.assertTrue(np.array_equal(sad[0][1], np.array([1,5,3]))) - self.assertTrue(np.array_equal(sad[2][1], np.array([1]))) - self.assertTrue(sad[2][2][0] == 'b') - - def test_parse_criteria(self): - - # Checking parse returns what we would expect - pars = self.pat4.parse_criteria({'spp_code': 'species', 'count': 'count', - 'x': 1}) - self.assertTrue(pars[1] == 'spp_code') - self.assertTrue(pars[2] == 'count') - - # Test that energy, mass and count col are None - pars = self.pat4.parse_criteria({'spp_code': 'species', - 'y': 'whole'}) - self.assertTrue((pars[2] == None) and (pars[3] == None) and (pars[4] == - None)) - - # If species is not specified correctly an error is thrown - self.assertRaises(ValueError, self.pat3.parse_criteria, {'spp_col' - :'species'}) - # Make sure if count is not passed, no error is thrown - self.pat3.parse_criteria({'spp_code': 'species'}) - - # Check energy and mass returns - pars = self.pat5.parse_criteria({'spp_code': 'species', 'count': - 'count', 'energy': 'energy'}) - - self.assertTrue(pars[3] == 'energy') - self.assertTrue(pars[4] == None) - - # Check that combinations in empty dict if no criteria given - pars = self.pat5.parse_criteria({'spp_code': 'species', 'count': - 'count'}) - self.assertTrue(pars[5] == [{}]) - - # TODO: Test that error is thrown if step < prec - - def test_sar(self): - - # Checking that sar function returns correct S0 for full plot - sar = self.pat3.sar(('x', 'y'), [(1,1)], {'spp_code': 'species', - 'count': 'count'}) - self.assertTrue(sar[0]['items'][0] == 5) - - # Checking for correct result for sar - sar = self.pat3.sar(('x', 'y'), [(1,1), (2,2)], {'spp_code': 'species', - 'count': 'count'}) - self.assertTrue(np.array_equal(sar[1][1], np.array([3,3,2,3]))) - sar = self.pat4.sar(('x', 'y'), [(1,1), (1,2), (3,2)], {'spp_code': - 'species', 'count': 'count'}, form='sar') - self.assertTrue(np.array_equal(sar[1][2], np.array([3,3,2,2,3,1]))) - - # Checking for correct result for ear - ear = self.pat3.sar(('x', 'y'), [(1,1), (2,2)], {'spp_code': 'species', - 'count': 'count'}, form='ear') - self.assertTrue(np.array_equal(ear[1][1], np.array([0,1,0,0]))) - - # Test that returned areas are correct - sar = self.pat1.sar(('x', 'y'), [(1,1)], {'spp_code': 'species', - 'count': 'count'}) - self.assertTrue(np.round(sar[0]['area'][0], decimals=2) == 0.06) - self.assertTrue(sar[0]['items'][0] == 2) - - def test_universal_sar(self): - - # Check that it returns the right length - criteria = {'spp_code': 'species', 'count' : 'count'} - div_cols = ('x', 'y') - vals = self.pat8.universal_sar(div_cols, [(1,1), (1,2), (2,2), (2,4), - (4,4)], criteria) - self.assertTrue(len(vals) == 3) - - # If (1,1) is not passed in it should have a length of zero 
- vals = self.pat8.universal_sar(div_cols, [(1,2), (2,2)], criteria) - self.assertTrue(len(vals) == 0) - - # If (1,1) is not passed in but include_full == True should have len - # equal to 1 - vals = self.pat8.universal_sar(div_cols, [(1,2), (2,2), (2,4)], criteria, - include_full=True) - self.assertTrue(len(vals) == 2) - - # Test that I get the correct z-value back - vals = self.pat8.universal_sar(div_cols, [(1,1), (1,2), (2,2)], - criteria) - self.assertTrue(np.round(vals['z'][0], decimals=4) == 0.3390) - - # If I pass in something other than a halving I should still get - # something back - vals = self.pat8.universal_sar(div_cols, [(1,1), (2,2), (2,4), (4,4)], - criteria) - self.assertTrue(len(vals) == 2) - - def test_comm_sep(self): - - # Create result recarray - comm = self.pat9.comm_sep({'plot1': (0,0), 'plot2': (0,1), - 'plot3': (3,4)}, - {'spp_code': 'species', 'count': 'count'}) - - # Create result recarray with dec degree locs - comm_decdeg = self.pat9.comm_sep({'plot1': (9.1,79.0), - 'plot2': (9.2,79.5), 'plot3': (12.7,50)}, - {'spp_code': 'species', 'count': 'count'}, - loc_unit='decdeg') - - # Check distances - dist_sort = np.sort(comm['dist']) - np.testing.assert_array_almost_equal(dist_sort, np.array((1,4.242,5)), - 3) - - # Check distances dec degree - # TODO: Find exact third party comparison formula - formulas online use - # different radii, etc. and give approx same answer - dist_sort = np.sort(comm_decdeg['dist']) - #np.testing.assert_array_almost_equal(dist_sort, - # np.array((56.058,3193.507, - # 3245.820)), 3) - - # Check species in each plot - spp_sort = np.sort(np.array(list(comm['spp-a']) + list(comm['spp-b']))) - np.testing.assert_array_equal(spp_sort, np.array((0,0,3,3,4,4))) - - # Check Sorensen - 2 zeros from empty plot1 - sor_sort = np.sort(comm['sorensen']) - np.testing.assert_array_almost_equal(sor_sort, - np.array((0,0,0.571428571)), 5) - - # Check Jaccard - 2 zeros from empty plot1 - jac_sort = np.sort(comm['jaccard']) - np.testing.assert_array_almost_equal(jac_sort, np.array((0,0,0.4)), 5) - - def test_ssad(self): - - # Check that ssad does not lose any individuals - ssad = self.pat2.ssad({'spp_code': 'species', 'count': 'count'}) - sad = self.pat2.sad({'spp_code': 'species', 'count': 'count'}) - sum_ssad = np.array([sum(val) for val in ssad[1].itervalues()]) - self.assertTrue(sum(sad[0][1]) == sum(sum_ssad)) - - ssad = self.pat6.ssad({'spp_code': 'species', 'count': 'count'}) - sad = self.pat6.sad({'spp_code': 'species', 'count': 'count'}) - sum_ssad = np.array([sum(val) for val in ssad[1].itervalues()]) - self.assertTrue(sum(sad[0][1]) == sum(sum_ssad)) - - # Manual checks of correct ssad - ssad = self.pat2.ssad({'spp_code': 'species', 'count': 'count', 'x': - 2, 'y': 2}) - self.assertTrue(set(ssad[1]['a']) == {1, 0, 1, 0}) - self.assertTrue(set(ssad[1]['b']) == {1, 4, 0, 1}) - self.assertTrue(set(ssad[1]['c']) == {0, 0, 3, 3}) - self.assertTrue(set(ssad[1]['d']) == {3, 1, 1, 1}) - - ssad = self.pat6.ssad({'spp_code': 'species', 'count': 'count', 'x' : - 2, 'y': 2}) - self.assertTrue(set(ssad[1]['a']) == {1, 0, 1, 0}) - self.assertTrue(set(ssad[1]['b']) == {1, 4, 0, 1}) - self.assertTrue(set(ssad[1]['c']) == {0, 0, 3, 3}) - self.assertTrue(set(ssad[1]['d']) == {3, 1, 1, 1}) - - def test_ied(self): - - # Test correct length of result - eng = self.pat5.ied({'spp_code': 'species', 'count': 'count', - 'energy': 'energy'}) - self.assertTrue(len(eng[0][1]) == 6) - - # Test error if energy column is missing - self.assertRaises(ValueError, self.pat5.ied, - 
{'spp_code': 'species', 'count': 'count'}) - - # Test normalize is working - eng = self.pat5.ied({'spp_code': 'species', 'count': 'count', - 'energy': 'energy', 'x': 2}) - self.assertTrue(np.array_equal(eng[1][1], np.array([1]))) - self.assertTrue(len(eng[0][1]) == 5) - - # Test mass column and normalize - eng = self.pat5.ied({'spp_code': 'species', 'count': 'count', - 'mass' : 'mass'}, exponent=1, normalize=False) - self.assertTrue(np.array_equal(eng[0][1], np.array([17,17,12,23,45, - 110]))) - - # Test that energy overrides mass - eng = self.pat5.ied({'spp_code': 'species', 'count': 'count', - 'mass' : 'mass', 'energy' : 'energy'}, normalize=False) - self.assertTrue(np.array_equal(eng[0][1], np.array([.5,.5,2,3,4,5]))) - - def test_sed(self): - - # Check correct result - eng = self.pat5.sed({'spp_code': 'species', 'count': 'count', - 'energy': 'energy'}) - self.assertTrue(np.array_equal(eng[0][1]['grt'], - np.array([1,1,4,6]))) - self.assertTrue(np.array_equal(eng[0][1]['rty'], - np.array([8,10]))) - - eng = self.pat5.sed({'spp_code': 'species', 'count': 'count', - 'energy': 'energy', 'x': 2}) - self.assertTrue(np.array_equal(eng[1][1]['rty'], np.array([1]))) - self.assertTrue(len(eng[1][1]) == 2) - -if __name__ == "__main__": - unittest.main() - - diff --git a/utils/__init__.py b/utils/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/utils/docinherit.py b/utils/docinherit.py deleted file mode 100644 index 0dfe06a..0000000 --- a/utils/docinherit.py +++ /dev/null @@ -1,69 +0,0 @@ -# Recipe from http://stackoverflow.com/questions/2025562/ -# inherit-docstrings-in-python-class-inheritance -# -# Import module, then call doc_inherit = DocInherit - -""" -doc_inherit decorator - -Usage: - -class Foo(object): - def foo(self): - "Frobber" - pass - -class Bar(Foo): - @doc_inherit - def foo(self): - pass - -Now, Bar.foo.__doc__ == Bar().foo.__doc__ == Foo.foo.__doc__ == "Frobber" -""" - -from functools import wraps - -class DocInherit(object): - """ - Docstring inheriting method descriptor - - The class itself is also used as a decorator - """ - - def __init__(self, mthd): - self.mthd = mthd - self.name = mthd.__name__ - - def __get__(self, obj, cls): - if obj: - return self.get_with_inst(obj, cls) - else: - return self.get_no_inst(cls) - - def get_with_inst(self, obj, cls): - - overridden = getattr(super(cls, obj), self.name, None) - - @wraps(self.mthd, assigned=('__name__','__module__')) - def f(*args, **kwargs): - return self.mthd(obj, *args, **kwargs) - - return self.use_parent_doc(f, overridden) - - def get_no_inst(self, cls): - - for parent in cls.__mro__[1:]: - overridden = getattr(parent, self.name, None) - if overridden: break - - @wraps(self.mthd, assigned=('__name__','__module__')) - def f(*args, **kwargs): - return self.mthd(*args, **kwargs) - - return self.use_parent_doc(f, overridden) - - def use_parent_doc(self, func, source): - if source is None: - raise NameError, ("Can't find '%s' in parents"%self.name) - func.__doc__ = source.__doc__ - return func diff --git a/utils/form_func.py b/utils/form_func.py deleted file mode 100644 index 16b91e9..0000000 --- a/utils/form_func.py +++ /dev/null @@ -1,659 +0,0 @@ -#!/usr/bin/python -'''This module contains the functions for formatting data files''' - -import os -import numpy as np -import csv -import matplotlib.mlab as plt -import glob -import sys - -#Hacking this..Oh well -import format_data -loc = format_data.__file__ -gcwd = os.getcwd #get current directory -pd 
= os.path.dirname #get parent directory -chdir = os.chdir #change directories -jp = os.path.join #Join paths -sys.path.append(pd(pd(loc))) -from data import Metadata -import itertools -import logging - - -#Formatting functions -def get_metadata(asklist, folder_name, dataname): - ''' - This function takes in a list of tuples and returns the appropriate - metadata in a dictionary - - Parameters - ---------- - asklist : list - A list of tuples e.g. [('x', 'precision'), ('y', 'maximum')] - - folder_name : string - Name of the archival folder where data is located e.g. BCIS - - dataname : string - Name of the metadata e.g. BCIS_1984.xml (string) - - Returns - ------- - : dict - A dictionary containing requested metadata values - - ''' - cwd = gcwd() - chdir(jp(pd(pd(gcwd())), 'archival', folder_name)) - meta = Metadata(dataname, asklist) - chdir(cwd) - return meta.get_meta_dict(asklist) - -def get_files(filetype, num, direct, globber='_????'): - ''' - This function gets the filetype files from the data directory - /archival/direct and returns the names of the filetype files in the - directory. - - Parameters - ---------- - filetype : string - A string specifying the type of the file, i.e. 'csv' or 'txt' - - num : int - Expected number of files of type 'direct_????.filetype' - - direct : string - The directory within /data/archival/ where the files are. - Example 'BCIS' or 'COCO' - - globber : string - String of what pattern is to be globbed - - Returns - ------- - : list - A list of strings - - ''' - - assert direct.find('/') == -1, "%s should not contain a '/'" % (direct) - cwd = gcwd(); - filedir = jp(pd(pd(gcwd())), 'archival', direct) - chdir(filedir) - datafiles = glob.glob(direct + globber + '.' + filetype) - chdir(cwd) - if not(len(datafiles) == num): - raise Exception("Must be exactly {0} {1}_*.{2} file in /archival/{1}"\ - .format(num, direct, filetype)) - return datafiles - - -def open_data(filename, delim, names=None): - ''' - This functions takes in the filename and returns a rec array. - - Parameters - ---------- - filename : string - Name of the data file - - delim : string - File delimiter - - names : list - A list of columns names. See csv2rec? - - Returns - ------- - : recarray - A recarray containing the data from the specified file name - - ''' - - data = plt.csv2rec(filename, delimiter=delim, names=names) - return data - -def create_intcodes(speclist, unq_specs, unq_ints, dtype=float): - '''This function converts each value in unq_specs to the corresponding - value in unq_ints. Acts on speclist. - - Parameters - ---------- - - speclist : np.array - a 1D np.array which contains the occurrences of the species within the - plot - - unq_specs : np.array - a 1D np.array of the unique species codes within the plot - - unq_int : np.array - 1D np.array of unique integers referring to the unique species codes - found within the plot - - dtype : type - The type of the tot_int array. 
Default is float - - - Returns - ------- - : np.array - A 1D np.array of integers that is equivalent to speclist - - ''' - assert len(speclist) > 0, "Species array cannot be empty" - assert len(unq_specs) == len(unq_ints), "unq_specs and unq_ints must be " \ - + "the same length" - speclist = speclist.astype(unq_specs.dtype) - tot_int = np.empty(len(speclist), dtype=dtype) - for s in xrange(len(unq_specs)): - check = (unq_specs[s] == speclist) - for i in xrange(len(check)): - if check[i]: - tot_int[i] = unq_ints[s] - return tot_int - -def output_form(data, filename): - '''This function writes data as a .csv into the current working directory - - Parameters - ---------- - data : structured array - An structured array containing the data to be output - - filename : string - A string representing the name of the file to be output. - - ''' - savedir = jp(gcwd(), filename.split('.')[0] + '.csv') - fout = csv.writer(open(savedir, 'w'), delimiter=',') - fout.writerow(data.dtype.names) - for i in xrange(len(data)): - fout.writerow(data[i]) - -def open_dense_data(filenames, direct, delim=','): - ''' - This function takes in a list of dense data file names, opens - them and returns them as list of rec arrays. - - Parameters - ---------- - - filenames : list - A list of filenames - - direct : string - The directory within data/archival/ where the files are. - Example 'ANBO_2010' or 'LBRI' - - delim : string - The default file delimiter is ',' - - Returns - ------- - : list - A list of rec arrays - - ''' - assert direct.find('/') == -1, "%s should not contain a '/'" % (direct) - filedir = jp(pd(pd(gcwd())), 'archival', direct) - datayears = [] - for name in filenames: - data = plt.csv2rec(jp(filedir, name), delimiter=delim) - datayears.append(data) - return datayears - -def format_dense(datayears, spp_col, num_spp, count_col='count'): - ''' - This function takes a list of data. This functions interates - through the list and formats each year of data and stores the - formatted data into a list containing all years of formatted data. - - Parameters - ---------- - datayears : list - A list of rec arrays containing all years of data - - - spp_col : int - The column in the dense array where the spp_names begin. 0 is the first - column. - - num_spp : tuple or int - Total number of species in plot. Each element in the tuple is the - number of species in the corresponding rec array in data year. - Therefore, len(num_spp) should equal len(datayears). If num_spp is an - int, it is converted to a tuple and extended to len(datayears) - - count_col : str - This string specifies the name of the count column. The default is - 'count'. - - Returns - ------- - : list - A list of formatted structured arrays. 
- - ''' - # Handle and broadcast num_spp - if type(num_spp) == int: - num_spp = (num_spp,) - else: - num_spp = tuple(num_spp) - - if (len(num_spp) != len(datayears)): - if len(num_spp) == 1: - num_spp = tuple(np.repeat(num_spp[0], len(datayears))) - else: - raise TypeError('len(num_spp) must equal len(datayears)') - - - - data_formatted = [] - for k, data in enumerate(datayears): - ls = len(data.dtype.names[spp_col:spp_col + num_spp[k]]) - if len(data.dtype.names[:spp_col + num_spp[k]]) == \ - len(data.dtype.names): - dtype = data.dtype.descr[:spp_col] + [('spp', 'S22'), (count_col,\ - np.float)] - else: - dtype = data.dtype.descr[:spp_col] + data.dtype.descr[spp_col + \ - num_spp[k]:] + [('spp', 'S22'), (count_col, np.float)] - - data_out = np.empty(ls * len(data), dtype=dtype) - - for s, name in enumerate(data_out.dtype.names[:-2]): - cnt = 0 - for i in xrange(len(data)): - if s == 0: - data_out[name][cnt:(ls*(i+1))] = data[name][i] - data_out['spp'][cnt:(ls*(i+1))] = np.array\ - (data.dtype.names[spp_col:\ - spp_col + num_spp[k]]) - data_out[count_col][cnt:(ls*(i+1))] =\ - np.array(list(data[i]))[spp_col:spp_col +\ - num_spp[k]] - cnt = cnt + ls - else: - data_out[name][cnt:(ls*(i+1))] = data[name][i] - cnt = cnt + ls - #Remove all zeros, they are not needed - data_out = data_out[data_out[count_col] != 0] - data_formatted.append(data_out) - return data_formatted - -def open_nan_data(filenames, missing_value, site, delim, col_labels): - ''' - This function takes in the filenames with nans data file, removes any - NaN values for the x and y coordinates and returns a rec array. - - Parameters - ---------- - - filename : list - A list of filenames which point to data with missing values - - missing_value : string - How a missing value is labeled in the data - - site : string - Site name. Ex. 'COCO' or 'BCIS' - - delim : string - Delimiter for the files - - xylabels : tuple - Tuple with x and y column labels, i.e. ('gx', 'gy') or ('x', 'y') - - Returns - ------- - : list - list of recarrays - - ''' - #NOTE: Might need to get rid of some more NA fields - datadir = jp(pd(pd(gcwd())), 'archival', site) - datayears = [] - for name in filenames: - data = plt.csv2rec(jp(datadir, name), delimiter=delim,\ - missing=missing_value) - for label in col_labels: - notNaN = (False == np.isnan(data[label])) - data = data[notNaN] - datayears.append(data) - - return datayears - -def fractionate(datayears, wid_len_new, step_new, col_names, - wid_len_old=None, min_old=None, step_old=None): - ''' - This function takes in a list of formatted data years and converts the grid - numbers into meter measurements. For example, LBRI is a 16x16 grid and each - cell is labeled with integers. However, the length (and width) of a cell - is 0.5m. This function converts each integer cell number to the appropriate - integer (i.e. for LBRI cell (2,2) (counting from 1) becomes cell (0.5, - 0.5)). - - Parameters - ---------- - datayears : list - A list of formatted structured arrays - - wid_len_new : tuple - A tuple containing the new width (x) in meters and length (y) - in meters of the entire plot. - - step_new : tuple - The new step (or stride length) of the cell width and length - (tuple: (x_step, y_step)). It should be given in terms of meters. Also, - called precision. - - col_names : list - The col_names of the structured array that are to be fractionated. 
- - wid_len_old : tuple or None - If None, it assumes that a np.unique on datayears[col_name[i]] gives a - array that is the same length as np.arange(0, wid_len_new[i], - step=step_new[i]). If it doesn't, an error will be thrown. If not - None, expects the old maximum length for the given columns. - - min_old : tuple or None - Same as wid_len_old but the old minimum value for each given column - - step_old : tuple or None - Same as wid_len_old but the old step (or stride length/spacing) for - each given column. - - Returns - ------- - : list - A list of converted structured arrays - - Notes - ----- - This function should be used on columnar data - - ''' - - # format column names - col_names = format_headers(col_names) - - frct_array = [] - for data in datayears: - for i, name in enumerate(col_names): - if wid_len_old != None and step_old != None and min_old != None: - nums = np.arange(min_old[i], wid_len_old[i] + step_old[i], - step=step_old[i]) - else: - nums = np.unique(data[name]) - frac = np.arange(0, wid_len_new[i], step=step_new[i]) - #Have to make sure I have the data right type - ind = list(data.dtype.names).index(name) - dt = data.dtype.descr - dt[ind] = (name, 'f8') - data = data.astype(dt) - data[name] = create_intcodes(data[name], nums, frac) - frct_array.append(data) - - return frct_array - -def add_data_fields(data_list, fields_values, descr='S20'): - ''' - Add fields to data based on given names and values - - Parameters - ---------- - data_list : list - List of data to which a field will be appended - - fields_values : dict - dictionary with keyword being the the field name to be added and the - value being a tuple with length data_list specifying the - values to be added to each field in each data set. - - descr : a single data type or a dictionary - A single value will be broadcast to appropriate length. The dictionary - must have the same keywords as fields_values and must be the same - length. Each keyword should lookup a dtype. - - Returns - ------- - : list - A list containing the structured arrays with the new fields appended - - Notes - ----- - All added fields have default dtypes of 'S20' - - ''' - - # Check that dype descriptors are formatted appropriately - if type(fields_values) != dict: - raise TypeError('fields_values must be a dict not %s of type %s' % - (str(fields_values), str(type(fields_values)))) - keys = fields_values.viewkeys() - if type(descr) == dict: - if set(list(descr.viewkeys())) != set(list(keys)): - raise ValueError("descr and fields_values must contain same keys") - elif type(descr) == type or type(descr) == str: - descr = broadcast(len(fields_values), descr) - descr = dict(itertools.izip(keys, descr)) - else: - raise ValueError("Invalid type for descr") - - alt_data = [] - - dlen = len(data_list) - for i, data in enumerate(data_list): - for name in list(fields_values.viewkeys()): - data = add_field(data, [(name, descr[name])]) - - try: - ind = len(fields_values[name]) != dlen - if ind: #broadcast - fields_values[name] = broadcast(dlen, fields_values[name]) - except TypeError: - # Broadcast fields_values. Error is thrown if can't broadcast - fields_values[name] = broadcast(dlen, fields_values[name]) - - data[name] = fields_values[name][i] - alt_data.append(data) - return alt_data - -def merge_formatted(data_form): - ''' - Take in a list of formatted data an merge all data in - the list. 
The dtypes of the data in the list must - be the same - - Parameters - ---------- - data_form : list - List of formatted structured arrays (or rec_arrays) - - Returns - ------- - : list - A list containing one merged structured array - - ''' - if len(data_form) == 1: - return np.array(data_form[0]) - else: - # Dtypes can be a bit of a pain here - merged = np.copy(np.array(data_form[0])) - for i in xrange(1, len(data_form)): - if merged.dtype != data_form[i].dtype: - if merged.dtype.names != data_form[i].dtype.names: - raise TypeError("Column names of data do not match") - else: # If data dtypes are just different strings they should - # still be able to merge - temp_arr = list(np.copy(merged)) + list(np.copy(data_form[i])) - merge_types = [ty[1] for ty in merged.dtype.descr] - dt_types = [ty[1] for ty in data_form[i].dtype.descr] - con_types = [] - for m,d in zip(merge_types, dt_types): - if m == d: - con_types.append(m) - elif type(m) == str and type(d) == str: - if m[:2] == d[:2]: - if m > d: - con_types.append(m) - else: - con_types.append(d) - # Have to adjust the types appropriately - if len(con_types) == len(merged.dtype.names): - dtype = zip(merged.dtype.names, con_types) - merged = np.empty(len(temp_arr), dtype=dtype) - flipped_temp = zip(*temp_arr) - for i, nm in enumerate(merged.dtype.names): - merged[nm] =\ - np.array(flipped_temp[i]).astype(dtype[i][1]) - else: - raise TypeError('dtypes of data do not match. Merge' \ - + ' failed') - else: - merged = np.concatenate((merged, np.array(data_form[i]))) - return merged - -def add_field(a, descr): - ''' - Add field to structured array and return new array with empty field - - Parameters - ---------- - a : structured array - Orginial structured array - descr : list - dtype of new field i.e. [('name', 'type')] - - Returns - ------- - : structured array - Structured array with field added - - ''' - - if a.dtype.fields is None: - raise ValueError, "'A' must be a structured numpy array" - b = np.empty(a.shape, dtype=descr + a.dtype.descr) - for name in a.dtype.names: - b[name] = a[name] - return b - -def broadcast(length, item): - ''' - Broadcasts item to length = length if possible. Else raises error. - - length -- int - - item -- int of iterable - - ''' - # Handle and broadcast item - if type(item) == int: - item = (item,) - elif type(item) == type: - item = (item,) - elif type(item) == str: - item = (item,) - else: - item = tuple(item) - - if (len(item) != length): - if len(item) == 1: - item = tuple(np.repeat(item[0], length)) - else: - raise ValueError('Could not broadcast %s to length $s' % - (str(item), str(length))) - return item - -def format_headers(headers): - ''' Uses same formatting code that csv2rec uses. Converts the passed in - headers to the same format the csv2rec uses. 
- - Parameters - ---------- - headers : list - list of strings to be converted - - Return - ------ - : list - converted strings - - Notes - ----- - See csv2rec documentation and code - ''' - - # convert header to list of strings - if type(headers) == str or type(headers) == int or type(headers) == float: - headers = [headers] - headers = [str(i) for i in headers] - - - itemd = { - 'return' : 'return_', - 'file' : 'file_', - 'print' : 'print_', - } - - # remove these chars - delete = set("""~!@#$%^&*()-=+~\|]}[{';: /?.>,<""") - delete.add('"') - - names = [] - seen = dict() - for i, item in enumerate(headers): - item = item.strip().lower().replace(' ', '_') - item = ''.join([c for c in item if c not in delete]) - if not len(item): - item = 'column%d'%i - - item = itemd.get(item, item) - cnt = seen.get(item, 0) - if cnt>0: - names.append(item + '_%d'%cnt) - else: - names.append(item) - seen[item] = cnt+1 - - - return names - -def format_dict_names(old_dict): - ''' - This function formats the names with the format_headers function and - returns a new dictionary with the formatted names. Both dictionaries - contain the same values - - Parameters - ---------- - old_dict : dict - Dictioary with old keywords that will be changed - - Returns - ------- - new_dict : dict - Dictionary with updated keywords - - ''' - new_dict = {} - oldkeys = sorted(old_dict) - newkeys = format_headers(oldkeys) - for i in xrange(len(oldkeys)): - new_dict[newkeys[i]] = old_dict[oldkeys[i]] - - return new_dict - - - - - - - - - diff --git a/utils/format_data.py b/utils/format_data.py deleted file mode 100644 index de9b9e6..0000000 --- a/utils/format_data.py +++ /dev/null @@ -1,1014 +0,0 @@ -#!/usr/bin/python - -'''This module contains 4 separate classes, each built to handle a -canonical data type - -This module provides the user with some formatting functions but does provide -the user with all formatting functions that may be required. This module is -not a substitute for thorough examination of ones data to remove irrelevant -data''' - -import numpy as np -from matplotlib.mlab import csv2rec -import form_func as ff -from numpy.lib.recfunctions import drop_fields -import csv - - -class Columnar_Data: - ''' - This is the data form that the macroeco software package wants the data - file in. All other canonical data sets are converted to columnar data and - then turned into Columnar_Data objects. - - Examples of columnar data include BCIS, LUQU, and COCO - - Multiple data files must have same format if they are to be merged - - ''' - - def __init__(self, datalist, delimiter=',', missingd=None,\ - delete_missing=False, archival=True): - ''' - This __init__ method takes in data and stores it in rec_arrays. - If specified, it will located missing data points and remove them - from the data set. - - Parameters - ---------- - datalist : string, list of strings, or list of ndarrays. - Data filenames or list of data arrays - - delimiter : string - The file delimiter. Default is ',' - - missingd : dict - Dictionary mapping munged column names to field values which - signify that the field does not contain actual data and should be - masked, e.g. '0000-00-00' or 'unused'. The missing value must be - represented as a string. - - delete_missing : bool - If True, deletes all of the missing values. If False, only deletes - the NaNs from the data. - - archival : bool - If True, a copy of self.columnar_data is made and stored in - self.columnar_archival. If dataset is very large, set to False. 
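# Illustrative sketch: loading a hypothetical census file into Columnar_Data
# and dropping records whose 'count' field holds the (assumed) missing code
# 'NA'. The file name and column name are not part of the package.
import format_data as form

col = form.Columnar_Data('census_1998.csv', delimiter=',',
                         missingd={'count': 'NA'}, delete_missing=True)
# col.columnar_data is a list with one structured array per input file;
# col.columnar_archival keeps an untouched copy (archival defaults to True).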
- - Note - ---- - If column type is integer, missing values are set to -1. If column - type is float, missing values are set to NaN. If column type is - string, missing values are set to ''. If column type is object, - missing values are set to None. - - ''' - if type(datalist) == str: - datalist = [datalist] - - if np.all(np.array([type(x) == str for x in datalist])): - self.columnar_data = [] - self.data_names = [] - for file_name in datalist: - self.columnar_data.append(csv2rec(file_name, delimiter=delimiter,\ - missingd=missingd)) - self.data_names.append(file_name) - if missingd != None: - if delete_missing: - trun_data = [] - for data in self.columnar_data: - for key in missingd.iterkeys(): - try: - # Missing float - notNaN = (False == np.isnan(data[key])) - except: - notNaN = np.ones(len(data[key]), dtype=bool) - notBlank = np.array([it != '' for it in data[key]]) - notMinusOne = (data[key] != -1)# Missing int - # Missing other - notNone = np.array([i != None for i in data[key]]) - ind = np.bitwise_and(notNaN, notBlank) - ind = np.bitwise_and(ind, notMinusOne) - ind = np.bitwise_and(ind, notNone) - data = data[ind] - trun_data.append(data) - self.columnar_data = trun_data - else: - trun_data = [] - for data in self.columnar_data: - for key in missingd.iterkeys(): - try: - notNaN = (False == np.isnan(data[key])) - except: - notNaN = np.ones(len(data[key]), dtype=bool) - data = data[notNaN] - trun_data.append(data) - self.columnar_data = trun_data - elif np.all(np.array([type(x) == np.ndarray for x in datalist])): - self.columnar_data = datalist - - if archival: - self.columnar_archival = [np.copy(data) for data in - self.columnar_data] - else: - self.columnar_archival = [] - - def reset_columnar_data(self): - ''' - Resets self.columnar_data to self.columnar_archival - - Need to be careful about excessive memory usage! - ''' - if len(self.columnar_archival) == 0: - raise ValueError("The self.columnar_archival attribute of this %s" - % (self.__class__.__name__) + " object has not" - + " been initialized") - else: - self.columnar_data = [np.copy(data) for data in - self.columnar_archival] - - def subset_data(self, subset={}): - ''' - Subset any given column of the data - - Parameters - ---------- - subset : dict - Dictionary of permanent subset to data, {'column_name': - 'condition'}, which will limit all analysis to records in which - column_name meets the condition, ie, {'year': ('==', 2005), 'x': - [('>', 20), ('<', 40)]} restricts analysis to year 2005 and x - values between 20 and 40. These conditions can also be passed to - the individual methods, but subsetting the data table up front may - save analysis time. 
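# Illustrative sketch of the condition syntax described above: keep only
# records from year 2005 with x between 20 and 40. The file and column
# names are hypothetical.
import format_data as form

col = form.Columnar_Data('census.csv')
col.subset_data({'year': ('==', 2005), 'x': [('>', 20), ('<', 40)]})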
Subsetting on a string would look something - like {'name' : [('==', 'John'), ('==', 'Harry')]} - ''' - - - if subset != {}: - # Format column names - subset = ff.format_dict_names(subset) - - sub_data = [] - for data in self.columnar_data: - valid = np.ones(len(data), dtype=bool) - - for key, value in subset.iteritems(): - if type(value) is not type(['a']): # Make all iterables - value = [value] - - # Merge tuples into a string - merged_values = [] - for val in value: - try: # check if val[1] is a string - eval(str(val[1])) - merged_values.append(val[0] + str(val[1])) - except: - merged_values.append(val[0] + "'" + val[1] + "'") - - for this_value in merged_values: - try: - this_valid = eval("data[key]" + this_value) - valid = np.logical_and(valid, this_valid) - except ValueError: #If key can't be found do nothing - pass - - sub_data.append(data[valid]) - - self.columnar_data = sub_data - - def split_up_data_by_field(self, split_columns=None): - ''' - This function will take in the split-columns list and and split the - data into separate arrays based on the list. For example, if one were - to pass in dbh1, dbh2, dbh3 three copies of the data would be - made, each being identical except that each would only contain one of - the instances of dbh. One could also pass [(dbh1, recr1), (dbh2, recr2), - (dbh3, recr3)]. All other fields in split_columns will be excluded - other than the fields within the tuple under consideration. - - Parameters - ---------- - split_columns : list - a list of tuples specifying the columns by which to split the array - - Notes - ----- - Saves the split array as self.columnar_data. - - ''' - #Note: If they enter the wrong column name nothing will be removed - #Should I error check for this? - if split_columns != None: - # Check if split_columns is a list of strings. If so, change it - # into a list of tuples - split_columns = [(s,) if type(s) == str else tuple(s) for s in - split_columns] - - # Format the names in each tuple - split_columns = [tuple(ff.format_headers(nms)) for nms in - split_columns] - - split_data = [] - given_col_names = [] - for tup in split_columns: - for name in tup: - given_col_names.append(name) - given_col_names = np.array(given_col_names) - - - for data in self.columnar_data: - for tup in split_columns: - ind = np.ones(len(given_col_names), dtype=bool) - for name in tup: - ind = np.bitwise_and((name != given_col_names), ind) - remove_names = given_col_names[ind] - split_data.append(drop_fields(data, list(remove_names))) - self.columnar_data = split_data - - def change_column_names(self, change=None, changed_to=None): - ''' - This function takes a list of column names to be changed and a name - that they should be changed to - - Parameters - ---------- - change : list of tuples or strings - Each tuple or string contains column names. All the column names in - the first tuple will be changed to the first element in the - changed_to list and so on. - changed_to : list - A list of strings that contain the names that the columns in change - will be changed to. - - Notes - ----- - This function is useful if you would like to merge self.columnar_data - but the dtype.names are different. 
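# Illustrative sketch: split a hypothetical multi-census file on its dbh
# columns, then give the three resulting arrays a common column name so
# they can be merged later.
import format_data as form

col = form.Columnar_Data('census.csv')
col.split_up_data_by_field([('dbh1',), ('dbh2',), ('dbh3',)])
col.change_column_names(change=[('dbh1', 'dbh2', 'dbh3')], changed_to=['dbh'])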
- - ''' - if change != None and changed_to != None: - if len(change) != len(changed_to): - raise ValueError('Length of params change and changed_to must' - + ' be equal') - # Convert to tuples if just received strings - change = [(x,) if type(x) == str else tuple(x) for x in change] - - # Format the names in each tuple - change = [tuple(ff.format_headers(nms)) for nms in change] - - for data in self.columnar_data: - column_names = np.array(data.dtype.names) - for i, name_tup in enumerate(change): - for name in name_tup: - find = np.where((name == column_names))[0] - if len(find) != 0: - max_len = np.max([len(x) for x in column_names]) - if max_len < len(changed_to[i]): - column_names = column_names.astype('S' + - str(len(changed_to[i]))) - column_names[find[0]] = changed_to[i] - data.dtype.names = tuple(column_names) - - def add_fields_to_data_list(self, fields_values=None, descr='S20'): - ''' - This functions adds given fields and values to the data list. If the - length of the value for a given keyword in one, it will be broadcast to - the length of self.columnar_data. Else an error will be thrown. - - Parameters - ---------- - fields_values : dict - dictionary with keyword being the the field name to be added and - the value being a tuple with length self.columnar_data specifying - the values to be added to each field in each data set. - descr : a single data type or a dictionary - A single value will be broadcast to appropriate length. The - dictionary must have the same keywords as fields_values and must be - the same length. Each keyword should lookup a dtype. - ''' - if fields_values != None: - self.columnar_data = ff.add_data_fields(self.columnar_data, - fields_values, descr=descr) - - def remove_columns(self, col_names=None): - ''' - This function will remove the all the columns within with names in - col_names from all the datasets in self.columnar_data. - - Parameters - ---------- - col_names : string or list - The name or names of columns to be removed - - ''' - - if col_names != None: - - if type(col_names) == str: - col_names = [col_names] - else: - col_names = list(col_names) - - # Format column names - col_names = ff.format_headers(col_names) - - removed_data = [] - for data in self.columnar_data: - removed_data.append(drop_fields(data, col_names)) - self.columnar_data = removed_data - - def fractionate_data(self, wid_len=None, step=None, col_names=None, - wid_len_old=None, min_old=None, step_old=None): - ''' - This function converts grid numbers to length measurements in - self.columnar_data - - Parameters - ---------- - wid_len : tuple - A tuple containing the the absolute length of the columns being - converted - step : tuple - The desierd precision (step or stride length) of each grid. The - first element in the step tuple corresponds with the first element - in the wid_len tuple and so on. - col_names : array-like object - An array-like object of strings giving the names of the columns - that will be fractionated - wid_len_old : tuple or None - If None, it assumes that a np.unique on datayears[col_name[i]] - gives a array that is the same length as np.arange(0, - wid_len_new[i], step=step_new[i]). If it doesn't, an error will be - thrown. If not None, expects the old maximum length for the given - columns. - min_old : tuple or None - Same as wid_len_old but the old minimum value for each given column - step_old : tuple or None - Same as wid_len_old but the old step (or stride length/spacing) for - each given column. 
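# Illustrative sketch: convert hypothetical grid indices in 'row' and 'column'
# into offsets on a 10 x 10 plot gridded at a step of 5, assuming each column
# holds exactly two unique grid values (see the note on wid_len_old above).
import format_data as form

col = form.Columnar_Data('census.csv')
col.fractionate_data(wid_len=(10, 10), step=(5, 5),
                     col_names=['row', 'column'])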
- - ''' - if wid_len != None and step != None and col_names != None: - self.columnar_data = ff.fractionate(self.columnar_data, wid_len, step, - col_names, wid_len_old=wid_len_old, - min_old=min_old, step_old=step_old) - - - def merge_data(self): - ''' - This function concatenates the data files in data_list. The dtypes of - the data in data_list must be identical or this function will fail. - ''' - - self.merged_data = ff.merge_formatted(self.columnar_data) - - def output_merged_data(self, filename): - ''' - This function merges self.columnar_data and outputs the merged data. - - Parameters - ---------- - filename : string - The filename to be output - - ''' - #Merge data in case it has not been done - self.merge_data() - ff.output_form(self.merged_data, filename) - - def output_columnar_data(self, filenames): - ''' - This function outputs the self.columnar_data - - Parameters - ---------- - filenames : list - A list of filenames - - ''' - assert len(filenames) == len(self.columnar_data), "Number of " + \ - "filenames must be the same as the number of datasets" - for i, name in enumerate(filenames): - ff.output_form(self.columnar_data[i], name) - -class Grid_Data: - '''This class handles data should look like the EarthFlow data after a - census. It is a grid with species abundance data in each cell. - ex. - ARTDRA - 6 - GERTYR - 8 - - ''' - - def __init__(self, filenames, archival=True, spp_sep='\n'): - ''' - Pass in the file name(s) of the grid data that you want converted and - the number of columns in each grid. - - Parameters - ---------- - - filenames : str or list of strings - A filename or list of filenames - - archival : bool - If True, a copy of self.grid_data is made and stored in - self.grid_archival. If dataset is very large, set to False. - - ''' - #NOTE: Handle missing data!!!! - - if type(filenames) == str: - filenames = [filenames] - - assert np.all(np.array([name.split('.')[-1] for name in filenames]) ==\ - 'csv'), "Files must be csv" - - self.grid_data = [] - self.cols = [] - self.rows =[] - - for i, name in enumerate(filenames): - # Sometimes csv.reader reads an extra column so you have to read to - # whole file. Seems stupid to read in the file twice but oh well... - with open(name, 'rb') as csvreader: - reader = csv.reader(csvreader) - rows = [row for row in reader] - min_len = np.min([len(row) for row in rows]) - self.cols.append(min_len) - - self.grid_data.append(csv2rec(name, names=list(np.arange(0,\ - self.cols[i]).astype('S10')))) - self.rows.append(len(self.grid_data[i])) - - #Remove all '\n' from the end of each cell in grid - #Not technically necessary but just being clean - self.grid_data = remove_char(self.grid_data, char=spp_sep) - self.grid_data = remove_white_spaces(self.grid_data) - - if archival == True: - self.grid_archival = [np.copy(data) for data in self.grid_data] - else: - self.grid_archival = [] - - def reset_grid_data(self): - ''' - Resets self.grid_data to self.archival_data - - Need to be careful about excessive memory usage! - ''' - - if len(self.grid_archival) == 0: - raise ValueError("The self.grid_archival attribute of this %s" - % (self.__class__.__name__) + " object has not" - + " been initialized") - else: - self.grid_data = [np.copy(data) for data in self.grid_archival] - - def truncate_grid_cells(self, symbol=None): - ''' - This function will look at each cell in grid list and truncated the - string within the cell at AND after the first instance of a given - symbol. 
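# Illustrative sketch based on the unit tests near the end of this diff:
# load a hypothetical gridded census, truncate each cell at a comment
# marker, and strip stray characters before further processing.
import format_data as form

grid = form.Grid_Data('survey_grid.csv', spp_sep='&')
grid.truncate_grid_cells(['%'])       # cut each cell at the first '%'
grid.remove_and_replace('*', '')      # delete leftover '*' characters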
- - Parameters - ---------- - symbol : string or list of strings - The symbol at which to being truncation - - Notes - ----- - symbol is a keyword argument because format_grid_data script gives the - option to run every method. - - ''' - if symbol != None: - - if type(symbol) == str: - symbol = [symbol] - else: - symbol = list(symbol) - - for i in xrange(len(self.grid_data)): - for nm in self.grid_data[i].dtype.names: - for j in xrange(len(self.grid_data[i][nm])): - for sym in symbol: - ind = self.grid_data[i][nm][j].find(sym) - if ind != -1: - self.grid_data[i][nm][j] = \ - self.grid_data[i][nm][j][:ind] - - self.grid_data = remove_char(self.grid_data) - - # List of remove replace tuples? - def remove_and_replace(self, remove=None, replace=''): - ''' - Removes a string from a grid cell and replaces it with another one - - Paramters - --------- - remove : string - String to be removed - replace : string - String to replace removed string - - ''' - - if remove != None and replace != None: - for i in xrange(len(self.grid_data)): - for nm in self.grid_data[i].dtype.names: - for j in xrange(len(self.grid_data[i][nm])): - self.grid_data[i][nm][j] =\ - self.grid_data[i][nm][j].replace(remove, replace) - - def find_unique_spp_in_grid(self, spacer='-', spp_sep='\n'): - ''' - This function finds all of the unique species in the grid. - It assumes that your grid data is in the proper format. - - Parameters - ---------- - spacer : str - The character separating the species code from the species count. - Default value is '-' (n-dash) - - spp_sep : str - The character that separates a speces/count combination from - another species/count combination. Default value is '\n' - - ''' - self.unq_spp_lists = [] - for num, data in enumerate(self.grid_data): - spp_names = [] - for col in data.dtype.names: - for row in xrange(self.rows[num]): - if data[col][row].find(spacer) != -1: - nam_lst = data[col][row].split(spacer) - if len(nam_lst) == 2: - spp_names.append(nam_lst[0].strip()) - else: - spp_names.append(nam_lst[0].strip()) - for i in xrange(1, len(nam_lst) - 1): - spp_names.append(nam_lst[i].split(spp_sep)[1].\ - strip()) - self.unq_spp_lists.append(np.unique(np.array(spp_names))) - - def grid_to_dense(self, spacer='-', spp_sep='\n', archival=True): - ''' - This function converts a the list of gridded data sets into dense - data sets and stores them in dense_data. In addition, it - makes a Dense_Data object out of the newly converted data. - - Parameters - ---------- - spacer : str - The character separating the species code from the species count. - Default value is '-' (n-slash) - - spp_sep : str - The character that separates a speces/count combination from - another species/count combination. Default value is '\n' - - - ''' - - self.find_unique_spp_in_grid(spacer=spacer, spp_sep=spp_sep) - dense_data = [] - for i, data in enumerate(self.grid_data): - dtype_list = [('cell', np.int), ('row', np.int), ('column', np.int)] - for name in self.unq_spp_lists[i]: - tuple_type = (name, np.float) - dtype_list.append(tuple_type) - matrix = np.empty(self.rows[i] * self.cols[i], dtype=dtype_list) - #Iterate through the plot - count = 0 - for col in data.dtype.names: - for row in xrange(self.rows[i]): - matrix['cell'][count] = count - matrix['row'][count] = row - matrix['column'][count] = int(col) - for spp_name in self.unq_spp_lists[i]: - - # Check if cell has species. May be nested occurence! 
- matrix[spp_name][count] = 0 # Set base to 0 - start = data[col][row].find(spp_name) - if start == -1: # Nothing is there - pass # Count already set to zero - - else: # Something is there, but is it nested? - found = start - while found != -1: - # If this is true, it is nested - if (data[col][row][start + len(spp_name)] !=\ - spacer) or not(start == 0 or \ - data[col][row][start - 1] == spp_sep): - - pass - - else: # Actually a species, so add some - # abundance - - raw = data[col][row][start:].split(spacer)[1] - if raw.find(spp_sep) != -1: - tot_spp = raw.split(spp_sep)[0].strip() - else: - tot_spp = raw.split()[0].strip() - matrix[spp_name][count] += float(tot_spp) - found = data[col][row][start + 1 - :].find(spp_name) - start += found + 1 - count += 1 - dense_data.append(matrix) - self.Dense_Object = Dense_Data(dense_data, archival=archival) - - - def output_grid_data(self, filenames): - ''' - This function prints the data within self.grid_data with the given - filenames. - - Parameters - ----------- - filenames : list - A list of filnames to which the data will be saved - - ''' - - assert len(filenames) == len(self.grid_data), "Number of filenames\ - must be the same as the number of datasets" - for i, data in enumerate(self.grid_data): - ff.output_form(data, filenames[i]) - - -class Dense_Data: - '''This class handles data that are in the dense format. An example of the - dense format is a csv file that has columns named 'row' and 'column' and - the remainder of columns named after each species in the plot. The values - within each species column are the counts within the cell specified by the - columns names 'row' and 'column'. - - Note: Need to consider how I might break this class - ''' - - def __init__(self, datalist, delim=',', replace=None, archival=True): - ''' - - Parameters - ----------- - datalist : string, list of strings or list of arrays - List of filenames to be loaded or list of arrays to be set to - self.dense_data - delim : string - The file delimiter - replace : tuple - A tuple of length 2. The first element is a string that - represents the missing values that you would like to replace. The - second element is the value with which you would like to replace - the missing values. - archival : bool - If True, a copy of self.dense_data is made and stored in - self.dense_archival. If dataset is very large, set to False. - - ''' - #TODO: What kind of files could break this - if type(datalist) == str: - datalist = [datalist] - - if np.all(np.array([type(x) == str for x in datalist])): - self.dense_data = [] - if replace != None: - - assert len(replace) == 2, "Replace must contain 2 elements" - - for name in datalist: - self.dense_data.append(replace_vals(name, replace, - delim=delim)) - else: - for name in datalist: - data = csv2rec(name, delimiter=delim) - self.dense_data.append(data) - - elif np.all(np.array([type(x) == np.ndarray for x in datalist])): - self.dense_data = datalist - - if archival: - self.dense_archival = [np.copy(data) for data in - self.dense_data] - else: - self.dense_archival = [] - - def reset_grid_data(self): - ''' - Resets self.grid_data to self.archival_data - - Need to be careful about excessive memory usage! 
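# Illustrative sketch: turn gridded data into dense form, or load a
# hypothetical dense csv directly while replacing 'NA' cells with 0.
import format_data as form

grid = form.Grid_Data('survey_grid.csv', spp_sep='&')
grid.grid_to_dense(spacer='-', spp_sep='&')
dense_from_grid = grid.Dense_Object              # a Dense_Data instance

dense_from_csv = form.Dense_Data('survey_dense.csv', replace=('NA', 0))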
- ''' - - if len(self.dense_archival) == 0: - raise ValueError("The self.dense_archival attribute of this %s" - % (self.__class__.__name__) + " object has not" - + " been initialized") - else: - self.dense_data = [np.copy(data) for data in self.dense_archival] - - - def dense_to_columnar(self, spp_col_num, num_spp, count_col='count',\ - archival=True): - ''' - This function uses a function in form_func to convert dense data into - columnar data. Stores the columnar data as a Columnar Object. - - Parameters - ---------- - spp_col_num : int - The column number in the dense array where the spp_names begin - - num_spp : tuple or int - Number of species in each dataset in self.dense_data. If it is an - int, it will be broadcasted to the length of self.dense_data - - count_col : str - This string specifies the name of the count column. The default is - 'count'. - - ''' - columnar_data = ff.format_dense(self.dense_data, spp_col_num,\ - num_spp, count_col=count_col) - self.Columnar_Object = Columnar_Data(columnar_data, archival=archival) - - def output_dense_data(self, filenames): - ''' - This function prints the data within self.dense_data with the given - filenames. If self.dense_data has not been filled, error is thrown. - - Parameters - ---------- - filenames : list - A list of filenames to which the data will be saved - - ''' - - assert len(filenames) == len(self.dense_data), "Number of filenames\ - must be the same as the number of datasets" - for i, data in enumerate(self.dense_data): - ff.output_form(data, filenames[i]) - -class Transect_Data: - ''' - This class handles data that are similar to the Breeding Bird survey data. - One column has the species ID, one column has stop and all the other - columns have transects. This class can handle data with "n" nestings, not - just two. For example, the data could have location, transect and stop. - - The "stop" data should all be in consecutive columns - - ''' - - def __init__(self, filenames, delim=',', replace=None, archival=True): - ''' - - Parameters - ---------- - filenames : list - A list of filenames - delim : string - The file delimiter - replace : tuple - A tuple of length 2. The first element is a string which - represents the missing values that you would like to replace. The - second element is the value with which you would like to replace - the missing values. - archival : bool - If True, a copy of self.transect_data is made and stored in - self.transect_archival. If dataset is very large, set to False. - - - ''' - self.transect_data = [] - if type(filenames) == str: - filenames = [filenames] - - if replace != None: - - assert len(replace) == 2, "Replace must contain 2 elements" - replace = (str(replace[0]), replace[1]) - - for name in filenames: - self.transect_data.append(replace_vals(name, replace, - delim=delim)) - else: - for name in filenames: - data = csv2rec(name, delimiter=delim) - self.transect_data.append(data) - - if archival: - self.transect_archival = [np.copy(data) for data in - self.transect_data] - else: - self.transect_archival = [] - - def reset_transect_data(self): - ''' - Resets self.transect_data to self.transect_archival - - Need to be careful about excessive memory usage! 
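# Illustrative sketch: in a hypothetical dense file whose species columns
# start at index 3 and which holds four species, flatten to columnar form.
# The per-dataset species count is passed as a tuple, as in the
# format_dense unit test near the end of this diff.
import format_data as form

dense = form.Dense_Data('survey_dense.csv')
dense.dense_to_columnar(3, (4,), count_col='count')
col = dense.Columnar_Object                      # a Columnar_Data instance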
- ''' - if len(self.transect_archival) == 0: - raise ValueError("The self.transect_archival attribute of this %s" - % (self.__class__.__name__) + " object has not" - + " been initialized") - else: - self.transect_data = [np.copy(data) for data in - self.transect_archival] - - def transect_to_columnar(self, stop_col_num, tot_stops, stop_name='stop', - count_name='count', archival=True): - ''' - This function takes transect data and convertes it into columnar data. - In addition it saves the columnar data as a Columnar_Data object. - - - Parameters - ---------- - stop_col_num : iterable or int - The column number where the stop counts begin (0 is the first - column). Can be len(transect_data) or length == 1. Broadcast if - length equals 1. - - tot_stops : iterable or int - The number of columns with stops. Can be len(transect_data) or - length == 1. Broadcast if length equals 1. - - stop_name : str - The name of the new stop column in the formatted data - - count_name : str - The name of the count column. Default is "count" - - - Notes - ----- - This function assumes that all data in self.transect_data are formatted - the same way. For example, the column that contains species names or - codes has the same name throughout all data sets. - - ''' - # Broadcast stop_col_num - stop_col_num = ff.broadcast(len(self.transect_data), stop_col_num) - tot_stops = ff.broadcast(len(self.transect_data), tot_stops) - - columnar_data = [] - for j, data in enumerate(self.transect_data): - nstops = tot_stops[j] - dtypes = data.dtype.descr[ : stop_col_num[j] ] - if (len(dtypes) + nstops) != len(data.dtype.names): - #Accounting for data fields after stops - end_dtypes = data.dtype.descr[(len(dtypes) + nstops) : ] - for x in end_dtypes: - dtypes.append(x) - dtypes.append((stop_name, 'S20')) - dtypes.append((count_name, np.int)) - column_data = np.empty(len(data) * nstops, dtype=dtypes) - for i in xrange(len(data)): - for name in column_data.dtype.names: - if name is stop_name: - column_data[name][i * nstops:(i + 1) * nstops] = \ - np.arange(0, nstops) - elif name is count_name: - column_data[name][i * nstops:(i + 1) * nstops] = \ - np.array(list(data[i]))[stop_col_num[j] : \ - -len(end_dtypes)] - else: - column_data[name][i * nstops:(i + 1) * nstops] = \ - data[name][i] - # Remove all zeros - column_data = column_data[column_data[count_name] != 0] - columnar_data.append(column_data) - self.Columnar_Object = Columnar_Data(columnar_data, archival=archival) - - def output_transect_data(self, filenames): - ''' - This function prints the data within self.columnar_data with the given - filenames. If self.columnar_data has not been filled, an error is - thrown. - - Parameters - ---------- - filenames : list - A list of filenames to which the data will be saved. Must be the - same length as self.columnar_data - - ''' - - assert len(filenames) == len(self.transect_data), "Number of filenames\ - must be the same as the number of datasets" - for i, data in self.transect_data: - ff.output_form(data, filenames[i]) - - -def remove_char(grid_list, char='\n'): - ''' - Removes the given char from the end of each cell in grid list - ''' - - for grid in grid_list: - for name in grid.dtype.names: - for i in xrange(len(grid[name])): - while grid[name][i][::-1].find(char) == 0: - grid[name][i] = grid[name][i][:-1] - - return grid_list - -def remove_white_spaces(grid_list): - ''' - Removes all of the white spaces from strings. 
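# Illustrative sketch: a hypothetical BBS-style file laid out like trans1.csv
# in the tests (species, site, tree, five stop columns starting at index 3,
# then trailing metadata columns) flattened into 'stop'/'count' columns.
import format_data as form

trans = form.Transect_Data('routes.csv')
trans.transect_to_columnar(3, 5)
col = trans.Columnar_Object                      # a Columnar_Data instance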
- ''' - for grid in grid_list: - for name in grid.dtype.names: - for i in xrange(len(grid[name])): - grid[name][i] = ''.join(grid[name][i].split(' ')) - - return grid_list - -def replace_vals(filename, replace, delim=','): - ''' - Replace the values in filename with specified values in replace_values - - Parameters - ---------- - filename : string - Will be read into a rec array - - replace_values : tuple - First object is value to replace and second object is what to replace - it with - - - ''' - data = csv2rec(filename, delimiter=delim, missing=replace[0]) - for nm in data.dtype.names: - try: - # Missing float - isNaN = (np.isnan(data[nm])) - except: - isNaN = np.zeros(len(data[nm]), dtype=bool) - isBlank = np.array([it == '' for it in data[nm]]) - isMinusOne = (data[nm] == -1)# Missing int - # Missing other - isNone = np.array([i == None for i in data[nm]]) - ind = np.bitwise_or(isNaN, isBlank) - ind = np.bitwise_or(ind, isMinusOne) - ind = np.bitwise_or(ind, isNone) - data[nm][ind] = replace[1] - return data - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/utils/global_strings.py b/utils/global_strings.py deleted file mode 100644 index 11087e1..0000000 --- a/utils/global_strings.py +++ /dev/null @@ -1,421 +0,0 @@ -#!/usr/bin/python - -'''This python file contains global strings used in the scripts. Consolidated -in this script for easy maintenance''' - -subset = '''\nYou should examine the columns in your data set and decide if you -would like to subset your data in some particular way before the analysis -begins. It is important to note that only the subsetted data will be analyzed. -For example, if you have a column named 'year' in your data set with values -1998, 1999, and 2000 and you only want to look at the year 2000 for a -particular analysis, you should select the column year from left-hand most -dropdown list, select the == operator from the operator dropdown list and type -2000 in the value field. Similarly, you could use <, >, <=, >=, or != with any -column and value in your data.''' - -criteria = '''\nYou should examine the columns in your dataset and decide if you -would like to divide the data in a particular way for this analysis. For -example, if you have a spatial dataset with x,y coordinates and you are -interested in examining macroecological metrics for two separate halves of your -plot along the x coordinate, you could cut the x coordinate in two halves by -giving the 'x' column a value of 2. - -If the column that you would like to divide contains discrete values (e.g. -year), you could enter the keyword 'split' and each unique value will be -analyzed separately. Conversely, the value 'whole' could be given to specify -the entire column. The value 'whole' is equivalent to 1 or leaving the value -blank. If you would like to divide a given column, please select the word -'division' from the dropdown menu and input a value as discussed above.\n\n - -There are four other special words that can be used on a given column: -'species', 'energy', 'count', and 'mass'. When assigned to a column in your -data set, the special word 'species' indicates the column that contains your -species IDs, the special word 'energy' indicates the column that contains some -type of energy measure, the special word 'mass' indicates a column that -contains some type of mass measure, and the special word 'count' indicates the -column that contains your species counts. 
These special words can be chosen -from the dropdown menu next to each column header. The special word 'species' -MUST be assigned for every analysis. If the special word 'count' is not -assigned, the species counts are all assumed to be one.\n\n - -If there are columns in your data that are not relevant for this analysis leave -the value in the dropdown box as 'NA'. Columns designated 'NA' will not -influence the analysis.\n\n''' - - -rarity_measure = '''\nThis parameter allows you to specify the counts that you -will consider rare. If, for example, you want to know how many species in your -plot have an abundance of 2 or less you would set this parameter to 2. If you -enter more then one value, each value will be examined. Example input: [2] or -[2, 5]. The brackets MUST be included.''' - -SAD_distributions = '''\n -'logser' : Fisher's logseries distribution; -'logser_ut' : Upper-truncated logseries derived from MaxEnt; -'logser_ut_appx' : Approximation for the upper-truncated logseries; -'lognorm' : Lognormal distribution; -'plognorm_lt' : Poisson lognormal distribution with 0 truncated; -'nbd_lt' : Negative binomial distribution with 0 truncated; -'geo_ser' : Geometric series distribution; -'broken_stick' : McArthur's broken stick distribution -'most_even' : The most even SAD -'most_uneven' : The most uneven SAD -'sugihara' : Sugihara's multi-dimensional breakage model.''' - -SSAD_distributions = '''\n -'binm' : Binomial distribution; -'pois' : Poisson distribution; -'nbd' : Negative binomial distribution; -'fnbd' : Finite-negative binomial distribution; -'geo' : Geometric distribution; -'fgeo' : Finite-geometric distribution; -'tgeo' : Truncated geometric distribution derived from MaxEnt''' - -short_subset = '''\nSpecifications for how you want to subset your data before the -analysis. Note that only the subsetted data will be included in the analysis. -The left-hand dropdown box contains all the columns of your dataset and you may -choose one or more to subset. Please see analysis explanation for more detail -and examples.''' - -short_criteria = '''\nSpecifications for how you want to divide your data during -the analysis. The words you see below are the shared columns of your -dataset(s). You must designate your species column with the special word -'species' found in the dropdown menu. You are not required to fill any -additional columns for this analysis. Please see analysis explanation for more -detail and examples.''' - -optional = ''' Optional parameter. Default value: ''' - -req = ''' Required parameter.''' - -#### Formatting strings #### - -explanation_string = '''This formatting script loads {0} datasets and -reformats them into columnar data using the parameters that you specify below. -We define columnar data as a dataset that has distinct column headers and has -rows that describe the attributes of a single entity (often a species). For -example, a row could describe the spatial location of a species, the -total number of individuals of that species at that spatial location, -attributes about that location, the date the species was censuses, etc. All of -these atttributes are specified by the column headers. Please see the website -http://www.ctfs.si.edu/plots/summary/ for examples of columnar data. - -''' - -output_string = '''This formatting script outputs a formatted csv data file to -specified folder within ../macroeco/data/formatted. You can specify the name -of the output formatted file(s). 
If you do not, the script will hard code them -with the script name, run name, and some appended string. - -''' - -process_string = ''' -The formatting process is as follows: - -1. The specified {0} data is loaded\n -2. Any {0} data-specific formatting parameters are applied to the {0} -data\n -3. The {0} data is converted into columnar data\n -4. Any columnar data-specific formatting parameters are applied to the columnar -data\n -5. The columnar data is output\n -''' - -delimiter = '''\nThe file delimiter used in the data files. - -Example input: - -1. [','] - -Where ',' is the file delimiter. - -2. ['+'] - -Where '+' is the file delimiter. - -The brackets and quotes MUST be include''' - -missing_values_from_a_given_column = '''\nSpecifies what is a -missing data value in any given column in the data set. The input must be -formatted as a pythonic dictionary. - -Example input: - -1. {'count' : 'NA', 'year' : ''} - -This input says that the data column 'count' has missing values 'NA' and the -data column 'year' has missing values '' (blank). The brackets and semicolons -are required for this parameter''' - - -delete_missing_values = '''\nEither True or False. If True, the missing values -specified in the missing_values_from_a_given_column parameter are removed from -the formatted data (your archival data remains unchanged). If False, only NaN -values are removed from the formatted data. - -Chose either: True or False.''' - -subset = '''\nA permanent subset to the formatted data, {'column_name': -'condition'}, which will limit all analysis to records in which column_name -meets the condition. The condition is a formatted as ('comparison operator', -'value'). Possible comparison operators are '==', '!=', '<, '>', '<=', '>='. -Please note that your archival data will remain unchanged. - -Subsetting examples: - -1. {'year': ('==' , 2005), 'x': [('>' , 20), ('<' , 40)]} - -Restricts analysis to year 2005 and x values between 20 and 40. Note that for -multiple conditions for a column square brackets MUST be included -(i.e. x : [('>', 20), ('<', 40)]). For a single condition on a column they are -optional (i.e. 'year': ('==', 2005)). - -2. {'name' : ('==', 'John')} - -Includes only rows in which column 'name' equals 'John'. When subsetting on a -string, the string should be quoted (i.e. ('==', 'John')) ''' - -columns_to_split = '''\nUse this if you want to split your single dataset into -multiple datasets based on given column names. For example, if you have a -dataset with column names ('x1', 'x2', 'x3','g', 'h') and you want to make -three datasets with column names ('x1', 'g', 'h'), ('x2', 'g', 'h'), and ('x3', -'g', 'h') you could type ['x1', 'x2', 'x3'] and your single data would be made -into three datasets with the columns given above. Notice that ALL columns that -are not specified are included in each new dataset. - -Example input: - -1. ['x1', 'x2', 'x3'] OR [('x1',) ('x2',), ('x3',)] - -Makes three datasets where each one contains only one of the specified columns. -All columns that are not specified are included. The brackets ([]) MUST be -included. - -2. [('x1', 'y1'), ('x2', y2'), ('x3', 'y3'), ('x4', 'y4')] - -Makes four datasets where each data set contains only one of the above pairs -x,y pairs. For example, the first data set would have columns ('x1', 'y1', ... -all unspecified columns) but it would not have columns 'x2', 'x3', 'x4, 'y2', -'y3', or 'y4'. 
''' - -change_column_names = '''\nSpecifies the column names that you wish to change and -the names that you wish to change them to. This parameter is useful if you -wish to merge data sets. - -Example input: - -1. (['fred', 'jane'], ['mark', 'mary']) or ['fred', 'jane'], ['mark', 'mary'] - -Changes column 'fred' to 'mark' and column 'jane' to 'mary' in all datasets. -The brackets are required. - -2. ([('x1', 'x2', 'x3'), 'h1'], ['x', 'h']) - -Changes columns 'x1', 'x2', 'x3' to 'x' and column 'h1' to 'h'. All -brackets are required.''' - -add_column_names_and_values = '''\nSpecifies additional columns that you want to -add to the data and the values the column will take for each dataset. - -Example input: - -1. {'year' : (1998, 1999, 2000), 'name' : ('Fred', 'George', 'Ron')} - -Adds the column 'year' and 'name' to all datasets. In this example, there are -three data sets and the values of 'year' for the first, second, and third -dataset are set to 1998, 1999, and 2000, respectively. Similarly, the values of -column 'name' for the first, second, and third dataset are set to 'Fred', -'George', and 'Ron', respectively. The length of values to be assigned (i.e. -(1998, 1999, 2000)) must equal the number of datasets or be one. All brackets and -punctuation must be included - -2. {'year' : 1998} - -Adds the columns 'year' with a value of 1998 to all datasets being -considered. - -''' -names_of_columns_to_be_removed = '''\nRemove any number of columns from the -dataset by specifying the column names. - -Example Input: - -1. 'name' - -Removes the column 'name' from all data sets - -2. ['name', 'species', 'date'] - -Remove the columns 'name', 'species', and 'date' from all data sets -''' - -merge_data = '''\nEither Y/yes or N/no. If Y/yes, attempts to merge all of the -data into one dataset. If the merge is successful, only the single merged data -file will be output. If the merge cannot be completed an error will be -displayed. 
If N/no, no merge will be attempted and all datasets will be -output.''' - -columnar_params_full =\ -''' -*** delimiter *** - -{0} - -*** missing_values_from_a_given_column *** - -{1} - -*** delete_missing_values *** - -{2} - -*** columns_to_split *** - -{3} - -*** change_column_names *** - -{4} - -*** add_column_names_and_values *** - -{5} - -*** names_of_columns_to_be_removed *** - -{6} - -*** merge_data *** - -{7} - -*** subset *** - -{8} - -'''.format(delimiter, missing_values_from_a_given_column, -delete_missing_values, columns_to_split, change_column_names, -add_column_names_and_values, names_of_columns_to_be_removed, merge_data, -subset) - -columnar_params_med =\ -''' -*** delimiter *** - -{0} - -*** columns_to_split *** - -{1} - -*** change_column_names *** - -{2} - -*** add_column_names_and_values *** - -{3} - -*** names_of_columns_to_be_removed *** - -{4} - -*** merge_data *** - -{5} - -*** subset *** - -{6} - -'''.format(delimiter, columns_to_split, change_column_names, -add_column_names_and_values, names_of_columns_to_be_removed, merge_data, -subset) - -columnar_params_small =\ -''' -*** columns_to_split *** - -{0} - -*** change_column_names *** - -{1} - -*** add_column_names_and_values *** - -{2} - -*** names_of_columns_to_be_removed *** - -{3} - -*** merge_data *** - -{4} - -*** subset *** - -{5} - -'''.format(columns_to_split, change_column_names, -add_column_names_and_values, names_of_columns_to_be_removed, merge_data, -subset) - - - - -def check_columnar_params(params, script): - '''This function checks that all of the parameters required to convert - columnar data have the correct types. This test does not completely - validate parameters. Just check the first level type. - - Parameters - ---------- - params : dict - Parameter dictionary - script : str - Either 'grid', 'dense', 'columnar', or 'transect'. - - ''' - - # Can't check names_of_columns_to_be_removed because it can be a string. - if script == 'grid': - prms_types = [('columns_to_split', type([])), - ('change_column_names', type((2,))), - ('add_column_names_and_values', type({})), - ('merge_data', str), - ('subset', type({}))] - - elif script != 'columnar': - prms_types = [('delimiter' , type([])), - ('columns_to_split', type([])), - ('change_column_names', type((2,))), - ('add_column_names_and_values', type({})), - ('merge_data', str), - ('subset', type({}))] - - else: - prms_types = [('delimiter' , type([])), - ('missing_values_from_a_given_column', type({})), - ('delete_missing_values', type(True)), - ('columns_to_split', type([])), - ('change_column_names', type((2,))), - ('add_column_names_and_values', type({})), - ('merge_data', str), - ('subset', type({}))] - - for i, pair in enumerate(prms_types): - - if type(params[pair[0]]) != pair[1]: - if params[pair[0]] != None: - raise TypeError("Parameter '%s' must be a %s not a %s." % (pair[0], - str(pair[1]), str(type(params[pair[0]]))) + - " Please check the formatting of '%s': %s " % (pair[0], - str(params[pair[0]]))) - - - - - - diff --git a/utils/make_metadata.py b/utils/make_metadata.py deleted file mode 100644 index 1277880..0000000 --- a/utils/make_metadata.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python - -''' -Makes minimal metadata for the user -''' - -import metadata_writer -import sys - - -#The user may want to make minimal metadata for multiple files -if len(sys.argv) == 1: - print "No data files included. 
Minimal metadata not made" -else: - for i in xrange(len(sys.argv)): - if i > 0: - metawriter = metadata_writer.MetaWriter(sys.argv[i]) - traitlist = [] - typelist = [] - print "Examining file '" + metawriter.filename + "'..." - for name in metawriter.column_names: - cat = raw_input("Is column name '" + name +\ - "' categorical? ") - if cat == "No" or cat == "no" or cat == "n" or\ - cat == "N": - types = (name, {'cat' : False}) - typelist.append(types) - spatial = raw_input("Is column name '" + name +\ - "' spatially explicit? ") - if spatial == "Yes" or spatial == "yes" or spatial == 'Y'\ - or spatial == 'y': - while True: - minimum = raw_input("Please enter the minimum value" +\ - " of column '" + name + "': ") - maximum = raw_input("Please enter the maximum value" +\ - " of column '" + name + "': ") - precision = raw_input("Please enter the precision" +\ - " of column '" + name + "': ") - try: - minimum = float(minimum) - maximum = float(maximum) - precision = float(precision) - break #This might not work - except ValueError: - print "Maximum, minimum, and precision must all" +\ - " be real numbers" - traits = (name, {'precision': str(precision), - 'minimum' : str(minimum),\ - 'maximum' : str(maximum)}) - traitlist.append(traits) - - else: - types = (name, {'cat' : True}) - typelist.append(types) - - metawriter.add_attribute_types(typelist) - metawriter.add_attribute_traits(traitlist) - metawriter.write_meta_data() - - - - - - - - - - - diff --git a/utils/metadata_writer.py b/utils/metadata_writer.py deleted file mode 100644 index aaa8eaa..0000000 --- a/utils/metadata_writer.py +++ /dev/null @@ -1,293 +0,0 @@ -#!/usr/bin/env python - -''' -This module contains a minimal metadata writer class for quickly making -metadata - -''' - - -import xml.etree.ElementTree as ET -import os - -sub = ET.SubElement - - -class MetaWriter: - ''' - Writes a metadata file based on the given filename and user input - - ''' - - def __init__(self, datapath): - ''' - Class takes in a datafile path name and creates an xml tree using the - column heading of the recarray generated from the csv file. 
- - Parameters - ---------- - datapath : string - Datafile name - - ''' - assert datapath[-4:] == '.csv', "%s must end in .csv" % (datapath) - self.filename = datapath.split('.')[0] - fin = open(datapath, 'r') - self.column_names = fin.readline().strip().split(',') - fin.close() - self.root = ET.Element('eml:eml') - self.root.attrib = {'packageId' : self.filename, 'system' : 'knb', - "xmlns:eml" : "eml://ecoinformatics.org/eml-2.1.0", 'xmlns:xsi': - "http://www.w3.org/2001/XMLSchema-instance", "xsi:schemaLocation" - : "eml://ecoinformatics.org/eml-2.1.0 eml.xsd"} - self.dataset = sub(self.root, 'dataset') - self.title = sub(self.dataset, 'title') - self.title.text = "Data set " + os.path.split(datapath)[1] - - self.creator = sub(self.dataset, 'creator') - self.individualName = sub(self.creator, 'individualName') - self.surName = sub(self.individualName, 'surName') - self.surName.text = "None" - - self.contact = sub(self.dataset, 'contact') - self.individualName2 = sub(self.contact, 'individualName') - self.surName2 = sub(self.individualName2, 'surName') - self.surName2.text = "None" - - self.dataTable = sub(self.dataset, 'dataTable') - - self.entityName = sub(self.dataTable, 'entityName') - self.entityName.text = os.path.split(datapath)[1] - - self.physical = sub(self.dataTable, 'physical') - self.objectName = sub(self.physical, 'objectName') - self.objectName.text = os.path.split(datapath)[1] - self.size = sub(self.physical, 'size') - self.size.attrib = {'unit' : "byte"} - self.size.text = str(os.path.getsize(datapath)) - - # Nested in physical - self.dataFormat = sub(self.physical, 'dataFormat') - self.textFormat = sub(self.dataFormat, 'textFormat') - self.numHeaderLines = sub(self.textFormat, 'numHeaderLines') - self.numHeaderLines.text = "1" - self.recordDelimiter = sub(self.textFormat, 'recordDelimiter') - self.recordDelimiter.text = "#x0A" - self.attributeOrientation = sub(self.textFormat, 'attributeOrientation') - self.attributeOrientation.text = "column" - self.simpleDelimited = sub(self.textFormat, 'simpleDelimited') - self.fieldDelimiter = sub(self.simpleDelimited, 'fieldDelimiter') - self.fieldDelimiter.text = "," - - self.distribution = sub(self.physical, 'distribution') - self.online = sub(self.distribution, 'online') - self.url = sub(self.online, 'url') - self.url.text = "macroeco://" + os.path.split(datapath)[1] - - - self.attributeList = sub(self.dataTable, 'attributeList') - self.attributes = [] - self.attributeTypes = [] - for i, name in enumerate(self.column_names): - attribute = sub(self.attributeList, 'attribute') - attributeName = sub(attribute, 'attributeName') - attributeDefinition = sub(attribute, 'attributeDefinition') - attributeDefinition.text = "None" - measurementScale = sub(attribute, 'measurementScale') - - # Default Ordinal - attributeType = sub(measurementScale, 'ordinal') - nonNumericDomain = sub(attributeType,'nonNumericDomain') - textDomain = sub(nonNumericDomain, 'textDomain') - definition = sub(textDomain, 'definition') - definition.text = "None" - - attributeName.text = name - self.attributes.append(attribute) - self.attributeTypes.append(attributeType) - - self.numberOfRecords = sub(self.dataTable, 'numberOfRecords') - self.numberOfRecords.text = "Unknown" - - def add_attribute_types(self, typelist): - ''' - Sets the type of the attribute to either ordinal (categorical) or - interval (categorical). Initialized in constructor as ordinal. - - Parameters - ---------- - typelist : list - A list of tuples. 
Each tuple contains 2 elements: a string and a - dict. The dict must contain the keyword cat (categorical) or a - KeyError will be thrown. - - Example of typelist: - - [('x', {'cat' : True}), ('y' : {'cat' : True}), ('year', - {'cat' : False}] - - ''' - - for item in typelist: - for attribute in self.attributes: - tree = ET.ElementTree(attribute) - att = tree.findall('attributeName')[0] - if (att.text == item[0]): - measure = tree.findall('measurementScale')[0] - if item[1]['cat'] == True: - if len(measure.findall('interval')) == 1: - measure.remove(measure.find('interval')) - att_type = sub(measure, 'ordinal') - nonNumericDomain = sub(att_type,'nonNumericDomain') - textDomain = sub(nonNumericDomain, 'textDomain') - definition = sub(textDomain, 'definition') - definition.text = "None" - - elif len(measure.findall('ordinal')) == 1: - measure.remove(measure.find('ordinal')) - att_type = sub(measure, 'ordinal') - nonNumericDomain = sub(att_type,'nonNumericDomain') - textDomain = sub(nonNumericDomain, 'textDomain') - definition = sub(textDomain, 'definition') - definition.text = "None" - - elif item[1]['cat'] == False: - - if len(measure.findall('ordinal')) == 1: - measure.remove(measure.find('ordinal')) - att_type = sub(measure, 'interval') - unit = sub(att_type, 'unit') - standardUnit = sub(unit, 'standardUnit') - standardUnit.text = "dimensionless" - precision = sub(att_type, 'precision') - precision.text = "0" - numericDomain = sub(att_type, 'numericDomain') - numberType = sub(numericDomain, 'numberType') - numberType.text = 'natural' - - - elif len(measure.findall('interval')) == 1: - measure.remove(measure.find('interval')) - att_type = sub(measure, 'interval') - unit = sub(att_type, 'unit') - standardUnit = sub(unit, 'standardUnit') - standardUnit.text = "dimensionless" - precision = sub(att_type, 'precision') - precision.text = "0" - numericDomain = sub(att_type, 'numericDomain') - numberType = sub(numericDomain, 'numberType') - numberType.text = 'natural' - - def add_attribute_traits(self, traitlist): - ''' - Adds traits to the attributes contained in self.attributes as specified - by the traitlist. Traitlist is a list of tuples with each tuple - containting two elements: the attribute name (string) and a dictionary - of traits to be added to the attribute. If the type of the trait - ordinal, nothing will be changed. Only traits with type interval will - be appened too. - - Parameters - ---------- - traitlist : list - A list of 2 element tuples where the first element contains a - string and the second element conatins a dict. See example in - docstring. The only keywords that are recognized are maximum, - minimum, and precision. - - Example of traitlist: - - [('x', {'minimum' : '0', 'maximum' : '100'}), ('y', {'precision' : - '0.1'})] - - ''' - - for item in traitlist: - for attribute in self.attributes: - tree = ET.ElementTree(attribute) - child = tree.findall('attributeName')[0] - if child.text == item[0]: - #TODO:Cleaner way to do this than with if? 
- measure = tree.findall('measurementScale')[0] - if len(measure.findall('interval')) == 1: - interval = measure.findall('interval')[0] - for key in item[1].iterkeys(): - if key == 'precision': - prec = interval.findall('precision') - if len(prec) == 0: - precision = sub(interval, 'precision') - precision.text = str(item[1][key]) - elif len(prec) == 1: - prec[0].text = str(item[1][key]) - elif key == 'minimum': - numericDomain =\ - interval.findall('numericDomain')[0] - bnd = numericDomain.findall('bounds') - if len(bnd) == 0: - bounds = sub(numericDomain, 'bounds') - minimum = sub(bounds, 'minimum') - minimum.attrib = {'exclusive' : - 'false'} - minimum.text = str(item[1][key]) - elif len(bnd) == 1: - mins = bnd[0].findall('minimum') - if len(mins) == 0: - minimum = sub(bnd[0], 'minimum') - minimum = sub(bnd[0], 'minimum') - minimum.attrib = {'exclusive' : - 'false'} - minimum.text = str(item[1][key]) - elif len(mins) == 1: - bnd[0].remove(mins[0]) - minimum = sub(bnd[0], 'minimum') - minimum.attrib = {'exclusive' : - 'false'} - minimum.text = str(item[1][key]) - elif key == 'maximum': - numericDomain =\ - interval.findall('numericDomain')[0] - bnd = numericDomain.findall('bounds') - if len(bnd) == 0: - bounds = sub(numericDomain, 'bounds') - maximum = sub(bounds, 'maximum') - maximum.attrib = {'exclusive' : - 'false'} - maximum.text = str(item[1][key]) - elif len(bnd) == 1: - maxs = bnd[0].findall('maximum') - if len(maxs) == 0: - maximum = sub(bnd[0], 'maximum') - maximum.attrib = {'exclusive' : - 'false'} - maximum.text = str(item[1][key]) - elif len(maxs) == 1: - bnd[0].remove(maxs[0]) - maximum = sub(bnd[0], 'maximum') - maximum.attrib = {'exclusive' : - 'false'} - maximum.text = str(item[1][key]) - - - - def write_meta_data(self, name=None): - ''' - Writes out the xml tree that is contained in self.root and saves and - .xml file in the currect working directory under the given filename. If - no name is given save the xml as the same name as the input file. 
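# Illustrative sketch: the output file name can be overridden; 'plot_meta'
# below is a hypothetical base name and '.xml' is appended automatically.
import metadata_writer

mw = metadata_writer.MetaWriter('census.csv')
mw.write_meta_data(name='plot_meta')          # -> plot_meta.xml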
- - - ''' - - tree = ET.ElementTree(self.root) - if name == None: - tree.write(self.filename + '.xml') - else: - tree.write(name + '.xml') - - - - - - - - diff --git a/utils/test_form_func.py b/utils/test_form_func.py deleted file mode 100644 index 71d7668..0000000 --- a/utils/test_form_func.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/python -#Testing form_func.py - -import unittest -from form_func import * -import numpy as np -import os -from matplotlib.mlab import csv2rec -gcwd = os.getcwd #get current directory -pd = os.path.dirname #get parent directory -chdir = os.chdir #change directories -jp = os.path.join - -class TestFormFunc(unittest.TestCase): - '''Tests the functions with in form_func.py''' - - def setUp(self): - self.spp_array1 = np.array(['as', 'as', 'as', 'as', 'as']) - self.spp_array2 = np.array([2,2,3,5,6,3,4,5,7,8]) - self.spp_array3 = np.array(['as','bn', 'as', 'ty', 'bn']) - self.spp_array4 = np.array([]) - - self.arch1 = open('arch1.csv', 'w') - self.arch1.write('''cell, row, column, AGR, THY, FTW, REW - 1, 1, 1, 0, 1, 1, 0 - 2, 1, 2, 3, 3, 0, 1 - 3, 2, 1, 0, 0, 0, 0 - 4, 2, 2, 1, 5, 1, 0''') - self.arch1.close() - self.arch2 = open('arch2.csv', 'w') - self.arch2.write('''cell, row, column, AGR, THY, FTW, REW - 1, 1, 1, 0, 1, 1, 0 - 2, 1, 2, 3, 3, 0, 1 - 3, 2, 1, 0, 0, 0, 0 - 4, 2, 2, 1, 5, 1, 0''') - self.arch2.close() - - def tearDown(self): - os.remove('arch1.csv') - os.remove('arch2.csv') - - def test_create_intcodes(self): - unq_specs = np.unique(self.spp_array1) - unq_ints = np.linspace(0, len(np.unique(self.spp_array1)) - 1,\ - num=len(np.unique(self.spp_array1))) - tot_int = create_intcodes(self.spp_array1, unq_specs, unq_ints) - self.assertTrue(len(tot_int) == 5) - self.assertTrue(np.unique(tot_int)[0] == .0) - self.assertTrue(np.all(np.equal(tot_int, np.array([.0,.0,.0,.0,.0])))) - unq_specs = np.unique(self.spp_array2) - unq_ints = np.linspace(0, len(np.unique(self.spp_array2)) - 1, \ - num=len(np.unique(self.spp_array2))) - tot_int = create_intcodes(self.spp_array2, unq_specs, unq_ints) - self.assertTrue(len(tot_int) == len(self.spp_array2)) - self.assertTrue(np.all(np.equal(np.unique(tot_int), - np.linspace(0,6,num=7)))) - self.assertRaises(AssertionError, create_intcodes, self.spp_array4, - unq_specs, unq_ints) - - def test_add_field(self): - data = csv2rec('arch1.csv') - data_added = add_field(data, [('test', np.int)]) - names = np.array(data_added.dtype.names) - self.assertTrue(sum(names == 'test') == 1) - - def test_merge_formatted(self): - data1 = csv2rec('arch1.csv') - data2 = csv2rec('arch2.csv') - dl = [data1, data2] - merged = merge_formatted(dl) - self.assertTrue(sum(merged['rew']) == 2) - self.assertTrue(sum(merged['column']) == 12) - - def test_add_data_fields(self): - data1 = csv2rec('arch1.csv') - data2 = csv2rec('arch2.csv') - dl = [data1, data2] - alt_data = add_data_fields(dl, {'year': (1998, 2002)}) - self.assertTrue(np.all(alt_data[0]['year'] == '1998')) - self.assertTrue(np.all(alt_data[1]['year'] == '2002')) - alt_data = add_data_fields(dl, {'year' : (1998, 2002), 'why': ('h', - 'a')}) - self.assertTrue(np.all(alt_data[0]['why'] == 'h')) - - def test_fractionate(self): - data1 = csv2rec('arch1.csv') - data2 = csv2rec('arch2.csv') - dl = [data1, data2] - fr = fractionate(dl, (10, 10), (5, 5), ['row', 'column']) - self.assertTrue(fr[0]['row'][3] == 5) - self.assertTrue(fr[1]['column'][2] == 0) - - def test_format_dense(self): - data1 = csv2rec('arch1.csv') - data2 = csv2rec('arch2.csv') - dl = [data1, data2] - form = format_dense(dl, 
3, (4,4)) - self.assertTrue(np.all(form[0]['count'][:4] == np.array([1,1,3,3]))) - self.assertTrue(np.all(form[1]['count'] == - np.array([1,1,3,3,1,1,5,1]))) - - - - - - - - diff --git a/utils/test_format_data.py b/utils/test_format_data.py deleted file mode 100644 index b21cc71..0000000 --- a/utils/test_format_data.py +++ /dev/null @@ -1,475 +0,0 @@ -#!/usr/bin/python - -'''Testing the classes in format_data.py''' - -import unittest -import numpy as np -import format_data as form -import os -import glob -import copy -gcwd = os.getcwd #get current directory -pd = os.path.dirname #get parent directory -chdir = os.chdir #change directories -jp = os.path.join - -class TestFormatData(unittest.TestCase): - '''Tests the classes within format_data.py''' - - def setUp(self): - - self.grid1 = open('grid1.csv', 'w') - self.grid1.write('''Harry-1+Joshua - 6+, hg-4+ty - 67,\nHarry-3+Joshua-1+y-34+ty - 87, hg-23''') - self.grid1.close() - - # Heavily nested names and blank cell - self.grid2 = open('grid2.csv', 'w') - self.grid2.write('''aaaa - 5&aaaa - 4 & aaaa - 3, aa - 2&a - 5, - aaa - 4& aaaa- 3& a - 1, ''') - self.grid2.close() - - # Grid to be cut - self.grid3 = open('grid3.csv', 'w') - self.grid3.write('''aaaa - 5*&aaaa - 4* & aa*aa - *3*$please, aa* -2*&a - 5will I be cut 7658?, - aa*a -* 4*& aa*aa- 3*& a* - 1*%maybe, **''') - self.grid3.close() - - self.dense1 = open('dense1.csv', 'w') - self.dense1.write('''column, row, fry, the, eggs, well, please - 0,0,1,2,3,4,5 - 0,1,0,0,,0,23 - 1,0,,,5,45,0 - 1,1,1,1,1,1,1''') - self.dense1.close() - - self.dense2 = open('dense2.csv', 'w') - self.dense2.write('''column, row, fry, the, eggs, well, please - 0,0,1,2,3,4,5 - 0,1,0,0,NA,0,23 - 1,0,NA,NA,5,45,0 - 1,1,1,1,1,1,1''') - self.dense2.close() - - self.dense3 = open('dense3.csv', 'w') - self.dense3.write('''column, row, fry, the, eggs, well, please, j - 0,0,1,2,3,4,5,2 - 0,1,0,0,NA,0,23,5 - 1,0,NA,NA,5,45,0,6 - 1,1,1,1,1,1,1,7''') - self.dense3.close() - - self.dense4 = open('dense4.csv', 'w') - self.dense4.write('''column, row, fry, the, eggs, well, please, j,h - 0,0,1,2,3,4,5,2,t - 0,1,0,0,0,0,23,5,u - 1,0,1,0,5,45,0,6,k - 1,1,1,1,1,1,1,7,m''') - self.dense4.close() - - self.trans1 = open('trans1.csv', 'w') - self.trans1.write( -'''spp, island, tree, b1, b2, b3, b4, b5, nm, fun -h,Marta,1,1,2,3,4,5,j,56 -t,Marta,2,1,1,1,1,0,k,78 -h,Garry,1,2,3,4,5,6,j,123 -t,Garry,2,0,1,2,0,5,u,456''') - self.trans1.close() - - self.col1 = open('col1.csv', 'w') - self.col1.write('''SPP, x, y, dBh1, dbH%2, john -l,1,1,34,38,g -y,2,1,100,10,g -h,1,2,1,1,g -y,2,2,300,2,f''') - self.col1.close() - - self.col2 = open('col2.csv', 'w') - self.col2.write('''sp+P, x, y, dbh1, dbh2, joH%n -l,1,,34,38,g -y,2,1,100,10,g -h,,2,1,1,NA -y,2,1,300,2,f''') - self.col2.close() - - - - def tearDown(self): - os.remove('grid1.csv') - os.remove('grid2.csv') - os.remove('grid3.csv') - os.remove('dense1.csv') - os.remove('dense2.csv') - os.remove('dense3.csv') - os.remove('dense4.csv') - os.remove('trans1.csv') - os.remove('col1.csv') - os.remove('col2.csv') - - def test_Grid_Data(self): - grid = form.Grid_Data('grid1.csv', spp_sep='+') - grid.find_unique_spp_in_grid(spacer='-', spp_sep='+') - - # Does it find the right species? 
- spp_list = np.array(['Harry', 'Joshua', 'hg', 'ty', 'y']) - unq_spp = grid.unq_spp_lists[0] - self.assertTrue(np.all(spp_list == unq_spp)) - - # If I don't truncate '+', it still finds the right species - grid = form.Grid_Data('grid1.csv') - grid.find_unique_spp_in_grid(spacer='-', spp_sep='+') - - spp_list = np.array(['Harry', 'Joshua', 'hg', 'ty', 'y']) - unq_spp = grid.unq_spp_lists[0] - self.assertTrue(np.all(spp_list == unq_spp)) - - # Test that the Dense plot is made correctly - grid = form.Grid_Data('grid1.csv', spp_sep='+') - grid.grid_to_dense(spacer='-', spp_sep='+') - columns = ('cell', 'row', 'column', 'Harry', 'Joshua', 'hg', 'ty', - 'y') - test_names = grid.Dense_Object.dense_data[0].dtype.names - self.assertTrue(np.all(test_names == columns)) - - # Test that values are correct - dense_obj = grid.Dense_Object - pred = np.array([0,0,4,23]) - test = dense_obj.dense_data[0]['hg'] - self.assertTrue(np.all(pred == test)) - pred = np.array([1,3,0,0]) - test = dense_obj.dense_data[0]['Harry'] - self.assertTrue(np.all(pred == test)) - pred = np.array([6,1,0,0]) - test = dense_obj.dense_data[0]['Joshua'] - self.assertTrue(np.all(pred == test)) - pred = np.array([0,34,0,0]) - test = dense_obj.dense_data[0]['y'] - self.assertTrue(np.all(pred == test)) - pred = np.array([0,87,67,0]) - test = dense_obj.dense_data[0]['ty'] - self.assertTrue(np.all(pred == test)) - - # Tested heavy nesting and empty cell - grid = form.Grid_Data('grid2.csv', 2) - grid.find_unique_spp_in_grid(spacer='-', spp_sep='&') - unq_spp = np.array(['a', 'aa', 'aaa', 'aaaa']) - pred = grid.unq_spp_lists[0] - self.assertTrue(np.all(unq_spp == pred)) - - grid.grid_to_dense(spacer='-', spp_sep='&') - dense_obj = grid.Dense_Object - pred = np.array([0,1,5, 0]) - test = dense_obj.dense_data[0]['a'] - self.assertTrue(np.all(pred == test)) - pred = np.array([0,0,2, 0]) - test = dense_obj.dense_data[0]['aa'] - self.assertTrue(np.all(pred == test)) - pred = np.array([0,4,0, 0]) - test = dense_obj.dense_data[0]['aaa'] - self.assertTrue(np.all(pred == test)) - pred = np.array([12,3,0, 0]) - test = dense_obj.dense_data[0]['aaaa'] - self.assertTrue(np.all(pred == test)) - - # Testing remove, replace, and truncation functions - grid = form.Grid_Data('grid3.csv', spp_sep='&') - grid.truncate_grid_cells(['$pl', 'will', '%may']) - grid.remove_and_replace('*', '') - - grid.find_unique_spp_in_grid(spacer='-', spp_sep='&') - unq_spp = np.array(['a', 'aa', 'aaa', 'aaaa']) - pred = grid.unq_spp_lists[0] - self.assertTrue(np.all(unq_spp == pred)) - - grid.grid_to_dense(spacer='-', spp_sep='&') - dense_obj = grid.Dense_Object - pred = np.array([0,1,5, 0]) - test = dense_obj.dense_data[0]['a'] - self.assertTrue(np.all(pred == test)) - pred = np.array([0,0,2, 0]) - test = dense_obj.dense_data[0]['aa'] - self.assertTrue(np.all(pred == test)) - pred = np.array([0,4,0, 0]) - test = dense_obj.dense_data[0]['aaa'] - self.assertTrue(np.all(pred == test)) - pred = np.array([12,3,0, 0]) - test = dense_obj.dense_data[0]['aaaa'] - self.assertTrue(np.all(pred == test)) - - # Testing reset to archival - grid.reset_grid_data() - temp_str = 'aaaa-5*&aaaa-4*&aa*aa-*3*$please' - data_str = grid.grid_data[0]['0'][0] - self.assertTrue(temp_str == data_str) - - - # Test that multiple data sets work - - grid = form.Grid_Data(glob.glob('grid*.csv'), archival=False) - - # reset_archival should fail in this case - self.assertRaises(ValueError, grid.reset_grid_data) - - # All the truncation should make the the two data sets equal - grid.truncate_grid_cells(['$pl', 
'will', '%may']) - grid.remove_and_replace('*', '') - for col in xrange(grid.cols[0]): - for row in xrange(len(grid.grid_data[0])): - self.assertTrue(grid.grid_data[1][col][row] ==\ - grid.grid_data[2][col][row]) - - - def test_Dense_Data(self): - - # Test that the expected values are read in - dense = form.Dense_Data('dense1.csv', replace=('', 0)) - spp_arr = np.array([1,0,0,1]) - read_in = dense.dense_data[0]['fry'] - self.assertTrue(np.all(spp_arr == read_in)) - spp_arr = np.array([2,0,0,1]) - read_in = dense.dense_data[0]['the'] - self.assertTrue(np.all(spp_arr == read_in)) - spp_arr = np.array([3,0,5,1]) - read_in = dense.dense_data[0]['eggs'] - self.assertTrue(np.all(spp_arr == read_in)) - spp_arr = np.array([4,0,45,1]) - read_in = dense.dense_data[0]['well'] - self.assertTrue(np.all(spp_arr == read_in)) - spp_arr = np.array([5,23,0,1]) - read_in = dense.dense_data[0]['please'] - self.assertTrue(np.all(spp_arr == read_in)) - - # NAs should all be turned to 0's - dense = form.Dense_Data('dense1.csv', replace=('NA', 0)) - spp_arr = np.array([1,0,0,1]) - read_in = dense.dense_data[0]['fry'] - self.assertTrue(np.all(spp_arr == read_in)) - spp_arr = np.array([2,0,0,1]) - read_in = dense.dense_data[0]['the'] - self.assertTrue(np.all(spp_arr == read_in)) - spp_arr = np.array([3,0,5,1]) - read_in = dense.dense_data[0]['eggs'] - self.assertTrue(np.all(spp_arr == read_in)) - spp_arr = np.array([4,0,45,1]) - read_in = dense.dense_data[0]['well'] - self.assertTrue(np.all(spp_arr == read_in)) - spp_arr = np.array([5,23,0,1]) - read_in = dense.dense_data[0]['please'] - self.assertTrue(np.all(spp_arr == read_in)) - - # Test dense_to_columnar - dense = form.Dense_Data(['dense2.csv', 'dense3.csv'], replace=('NA',0)) - dense.dense_to_columnar(2, (5,6)) - col = dense.Columnar_Object - col.merge_data() - unq_spp = np.unique(['eggs', 'fry', 'the', 'well', 'please', 'j']) - pred_unq_spp = np.unique(col.merged_data['spp']) - self.assertTrue(np.all(unq_spp == pred_unq_spp)) - count = [1,2,3,4,5] - self.assertTrue(np.all(count == col.merged_data['count'][:5])) - count = [1,1,1,1,1,7] - self.assertTrue(np.all(count == col.merged_data['count'][-6:])) - self.assertTrue(len(col.merged_data) == 30) - self.assertTrue(col.merged_data['count'][5] == 23) - self.assertTrue(col.merged_data['spp'][5] == 'please') - - # Test correct extension of num_spp - dense = form.Dense_Data(['dense2.csv', 'dense2.csv'], replace=('NA',0)) - self.assertRaises(TypeError, dense.dense_to_columnar, 2, (5,6,7)) - dense.dense_to_columnar(2, 5) - col = dense.Columnar_Object - count = np.array([1,2,3,4,5]) - self.assertTrue(np.all(col.columnar_data[0]['count'][:5] == count)) - self.assertTrue(np.all(col.columnar_data[1]['count'][:5] == count)) - - # Test trailing column after species - dense = form.Dense_Data(['dense4.csv']) - dense.dense_to_columnar(2, 5) - col = dense.Columnar_Object - comp = np.array([2,2,2,2,2,5,6,6,6,7,7,7,7,7]) - self.assertTrue(np.all(comp == col.columnar_data[0]['j'])) - comp = np.array(['t', 't', 't', 't', 't', 'u', 'k', 'k', 'k', 'm', 'm', - 'm', 'm', 'm']) - self.assertTrue(np.all(comp == col.columnar_data[0]['h'])) - - def test_Transect_Data(self): - - - # Already tested replace_vals test_Dense_Data - trans = form.Transect_Data('trans1.csv', replace=('0', 1)) - trans.transect_to_columnar(3, 5) - col = trans.Columnar_Object - count = np.array([1,2,3,4,5,1,1,1,1,1,2,3,4,5,6,1,1,2,1,5]) - self.assertTrue(np.all(count == col.columnar_data[0]['count'])) - - # Test that transect data reads in correctly and converts to 
columnar - trans = form.Transect_Data('trans1.csv') - trans.transect_to_columnar(3, 5) - col = trans.Columnar_Object - count = np.array([1,2,3,4,5,1,1,1,1,2,3,4,5,6,1,2,5]) - self.assertTrue(np.all(count == col.columnar_data[0]['count'])) - - # Test multiple datasets are converted correctly - trans = form.Transect_Data(['trans1.csv', 'trans1.csv']) - trans.transect_to_columnar(3,5) - col = trans.Columnar_Object - col.merge_data() - self.assertTrue(np.all(np.concatenate((count, count)) == - col.merged_data['count'])) - def test_Columnar_Data(self): - - # Testing missing values - col = form.Columnar_Data('col2.csv', missingd={'y' : '', 'x' : '', - 'john' : 'NA'}, delete_missing=True) - self.assertTrue(len(col.columnar_data[0]) == 2) - self.assertTrue(np.all(col.columnar_data[0]['spp'] == np.array(['y', - 'y']))) - self.assertTrue(np.all(col.columnar_data[0]['dbh1'] == np.array([100, - 300]))) - - # No missing values; Test subsetting - col = form.Columnar_Data('col1.csv') - col.subset_data({'JOHN' : ('!=', 'f')}) - self.assertTrue(np.all(col.columnar_data[0]['john'] == np.array(['g', - 'g', 'g']))) - # Test reset - col.reset_columnar_data() - check = np.array(['g','g','g','f']) - self.assertTrue(np.all(col.columnar_data[0]['john'] == check)) - - # Test splitting - col.split_up_data_by_field([('D&Bh1',), ('dbh2',)]) - self.assertTrue(len(col.columnar_data) == 2) - dbh1 = np.array([34,100,1,300]) - dbh2 = np.array([38,10,1,2]) - try: - col.columnar_data[0]['dbh2'] - except ValueError: - pass - - try: - col.columnar_data[1]['dbh1'] - except ValueError: - pass - - self.assertTrue(np.all(col.columnar_data[0]['dbh1'] == dbh1)) - self.assertTrue(np.all(col.columnar_data[1]['dbh2'] == dbh2)) - - col.reset_columnar_data() - - col.split_up_data_by_field([('spp', 'x'), ('y',)]) - self.assertTrue(len(col.columnar_data) == 2) - td1 = np.array(['spp', 'x', 'dbh1', 'dbh2', 'john']) - td2 = np.array(['y', 'dbh1', 'dbh2', 'john']) - d1 = np.array(col.columnar_data[0].dtype.names) - d2 = np.array(col.columnar_data[1].dtype.names) - self.assertTrue(np.all(d1 == td1)) - self.assertTrue(np.all(d2 == td2)) - - # Test change column names - col.reset_columnar_data() - col.split_up_data_by_field([('dbh1',), ('dbh2',)]) - self.assertRaises(ValueError,col.change_column_names, [('x', 'y')], - ['hello']) - - col.reset_columnar_data() - col.split_up_data_by_field([('dbh1',), ('dbh2',)]) - col.change_column_names([('dbh1', 'dbh2'), ('john',)], ['dbh', 'harry']) - nms = np.array(['spp', 'x', 'y', 'dbh', 'harry']) - dtnms1 = np.array(col.columnar_data[0].dtype.names) - dtnms2 = np.array(col.columnar_data[1].dtype.names) - self.assertTrue(np.all(nms == dtnms1)) - self.assertTrue(np.all(nms == dtnms2)) - - # Test if long names added - col.reset_columnar_data() - col.split_up_data_by_field([('dbh1',), ('dbh2',)]) - col.change_column_names([('dbh1', 'dbh2')], ['goofy_chew']) - nms = np.array(['spp', 'x', 'y', 'goofy_chew', 'john']) - dtnms1 = np.array(col.columnar_data[0].dtype.names) - dtnms2 = np.array(col.columnar_data[1].dtype.names) - self.assertTrue(np.all(nms == dtnms1)) - self.assertTrue(np.all(nms == dtnms2)) - - # Test adding fields to data - col.reset_columnar_data() - col.split_up_data_by_field([('dbh1',), ('dbh2',)]) - col.change_column_names([('dbh1', 'dbh2')], ['dbh']) - col.add_fields_to_data_list({'year' : (1998, 2001), 'body' : ('large', - 'small')}) - year1 = np.repeat('1998', 4) - year2 = np.repeat('2001', 4) - body1 = np.repeat('large', 4) - body2 = np.repeat('small', 4) - self.assertTrue(np.all(year1 == 
col.columnar_data[0]['year'])) - self.assertTrue(np.all(year2 == col.columnar_data[1]['year'])) - self.assertTrue(np.all(body1 == col.columnar_data[0]['body'])) - self.assertTrue(np.all(body2 == col.columnar_data[1]['body'])) - - # Test adding different dtypes - col.reset_columnar_data() - col.split_up_data_by_field([('dbh1',), ('dbh2',)]) - col.change_column_names([('dbh1', 'dbh2')], ['dbh']) - col.add_fields_to_data_list({'year' : (1998, 2001), 'body' : ('large', - 'small')}, descr={'year': np.int, - 'body': 'S20'}) - - year1 = np.repeat(1998, 4) - year2 = np.repeat(2001, 4) - body1 = np.repeat('large', 4) - body2 = np.repeat('small', 4) - self.assertTrue(np.all(year1 == col.columnar_data[0]['year'])) - self.assertTrue(np.all(year2 == col.columnar_data[1]['year'])) - self.assertTrue(np.all(body1 == col.columnar_data[0]['body'])) - self.assertTrue(np.all(body2 == col.columnar_data[1]['body'])) - - # Test remove columns - col = form.Columnar_Data(['col1.csv', 'col2.csv'], missingd={'y' : '', - 'x' : '', 'john' : 'NA'}, delete_missing=True) - self.assertTrue(len(col.columnar_data[0]) == 4) - self.assertTrue(len(col.columnar_data[1]) == 2) - col.remove_columns('john') - test_nm = set(['x','y', 'spp', 'dbh1', 'dbh2']) - self.assertTrue(test_nm == set(col.columnar_data[0].dtype.names)) - self.assertTrue(test_nm == set(col.columnar_data[1].dtype.names)) - - col.remove_columns(['x', 'y']) - test_nm = set(['spp', 'dbh1', 'dbh2']) - self.assertTrue(test_nm == set(col.columnar_data[0].dtype.names)) - self.assertTrue(test_nm == set(col.columnar_data[1].dtype.names)) - - # Try removing row that is not there, no error is thrown - col.remove_columns(['x']) - self.assertTrue(test_nm == set(col.columnar_data[0].dtype.names)) - self.assertTrue(test_nm == set(col.columnar_data[1].dtype.names)) - - # Fractionate is tested in test_form_func.py. Test if wid_len_old etc. 
- # parameters return expected results - col.reset_columnar_data() - temp_col = copy.deepcopy(col.columnar_data) - col.columnar_data = [temp_col[0]] - col.fractionate_data((1,1), (.5,.5), ('x', 'y')) - self.assertTrue(np.all(np.array([0,.5,0,.5]) == - col.columnar_data[0]['x'])) - col.columnar_data = [temp_col[1]] - col.fractionate_data((1,1), (.5,.5), ('x', 'y'), (2,2), (1,1), (1,1)) - self.assertTrue(np.all(np.array([.5,.5]) == - col.columnar_data[0]['x'])) - self.assertTrue(np.all(np.array([0,0]) == - col.columnar_data[0]['y'])) - - col.columnar_data = temp_col - - # Test merge data - col.merge_data() - self.assertTrue(len(col.merged_data) == 6) - spp = np.array(['l','y','h','y','y','y']) - self.assertTrue(np.all(col.merged_data['spp'] == spp)) - dbh2 = np.array([34,100,1,300,100,300]) - self.assertTrue(np.all(col.merged_data['dbh1'] == dbh2)) - - # Try to break merge data - col.columnar_data = [col.merged_data] - col.merge_data() - -if __name__ == '__main__': - unittest.main() diff --git a/utils/test_metadata_writer.py b/utils/test_metadata_writer.py deleted file mode 100644 index 593f5c7..0000000 --- a/utils/test_metadata_writer.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/python -#Testing metadata_writer.py - -import unittest -from metadata_writer import * -import numpy as np -import xml.etree.ElementTree as ET - -import os -gcwd = os.getcwd #get current directory -pd = os.path.dirname #get parent directory -chdir = os.chdir #change directories -jp = os.path.join - -class TestMetadataWriter(unittest.TestCase): - '''Tests the MetaWriter class''' - - def setUp(self): - - self.meta1 = open('meta1.csv', 'w') - self.meta1.write('''cell,row,column,spp,year - 0, 1, 2, 3, 4''') - self.meta1.close() - - def tearDown(self): - os.remove('meta1.csv') - - def test_metawriter(self): - mt = MetaWriter('meta1.csv') - att = [('row', {'maximum' : '2', 'minimum' : '0', 'precision' :\ - '0.1'}), ('column', {'maximum' : '45', 'minimum' :\ - '0', 'precision' : '1'})] - - # Check that all attributes are in xml tree - self.assertTrue(len(mt.attributeList.findall('./')) == 5) - - # Check that all types are ordinal by default - measure = mt.attributeList.findall('./attribute/measurementScale') - for i, m in enumerate(measure): - temp = m.findall('./') - self.assertTrue(temp[0].tag == 'ordinal') - - # Check that it adds correct attribute types - types = [('cell', {'cat' : True}), ('row', {'cat' : False}), ('column', {'cat' - : False}), ('spp' , {'cat' : True})] - - mt.add_attribute_types(types) - order = ['ordinal', 'interval', 'interval', 'ordinal', 'ordinal'] - for i, att in enumerate(mt.attributes): - temp = att.findall('./measurementScale/' + order[i]) - self.assertTrue(len(temp) == 1) - - # Check that it overwrites types if they are changed - types = [('cell', {'cat' : False}), ('row', {'cat' : True}), ('column', {'cat' - : True}), ('spp' , {'cat' : False})] - - mt.add_attribute_types(types) - - mt.add_attribute_types(types) - order = ['interval', 'ordinal', 'ordinal', 'interval', 'ordinal'] - for i, att in enumerate(mt.attributes): - temp = att.findall('./measurementScale/' + order[i]) - self.assertTrue(len(temp) == 1) - - # Check that max, min and precision are set correctly - - types = [('cell', {'cat' : True}), ('row', {'cat' : False}), ('column', {'cat' - : False}), ('spp' , {'cat' : True})] - - mt.add_attribute_types(types) - - att_list = [('row', {'minimum' : 0, 'maximum' : 400, 'precision' : 3, - 'random' : 'harry'}), ('column', {'maximum' : 5}), ('spp', {'precision' - : 4})] - - 
mt.add_attribute_traits(att_list) - - # spp should have no precision even though we tried to add it - have = mt.attributeList.findall(".//attributeName") - names = [nm.text for nm in have] - ind = names.index('spp') - maybe =\ - mt.attributes[ind].findall('./measurementScale/ordinal/precision') - self.assertTrue(len(maybe) == 0) - - # cell should have no precision - have = mt.attributeList.findall(".//attributeName") - names = [nm.text for nm in have] - ind = names.index('cell') - maybe =\ - mt.attributes[ind].findall('./measurementScale/ordinal/precision') - self.assertTrue(len(maybe) == 0) - - # Precision of row should be three - have = mt.attributeList.findall(".//attributeName") - names = [nm.text for nm in have] - ind = names.index('row') - maybe =\ - mt.attributes[ind].findall('./measurementScale/interval/precision') - self.assertTrue(maybe[0].text == "3") - - # Precision of column should be 0 - have = mt.attributeList.findall(".//attributeName") - names = [nm.text for nm in have] - ind = names.index('column') - maybe =\ - mt.attributes[ind].findall('./measurementScale/interval/precision') - self.assertTrue(maybe[0].text == "0") - - # Maximum is set right - have = mt.attributeList.findall(".//attributeName") - names = [nm.text for nm in have] - ind = names.index('column') - maybe =\ - mt.attributes[ind].findall('./measurementScale/interval/numericDomain/bounds/maximum') - self.assertTrue(maybe[0].text == "5") - - - -if __name__ == '__main__': - unittest.main() diff --git a/utils/test_workflow.py b/utils/test_workflow.py deleted file mode 100644 index 0aeb411..0000000 --- a/utils/test_workflow.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/python - -import os -import unittest -from macroeco.utils import workflow - - -# Cases for future testing: -# params file has one interactive run, user changes values. -# params file has plural interactive runs (complicated dialog ahoy). -# No params file. Dialog, write, reload. -# Params file doesn't match ask. Dialog, write, reload, check against ask. -## workflow.xml proper subset of ask -## Neither a proper subset of the other -# Types of param: string, int, float, lists of those; mixed-type list (ick). - - -class ParamfileTestCase(unittest.TestCase): - - def setUp(self): - self.cwd = os.getcwd() + '/' - - self.pf = open('parameters.xml', 'w') - self.pf.write(""" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - """) - self.pf.close() - - def tearDown(self): - pass - os.remove(workflow.paramfile) - - def test_emptyask(self): - pa = workflow.Parameters('nonexistantrun', None, {}, {}) - self.assertEqual(pa.params, {}) - self.assertEqual(pa.interactive, False) - - def test_NIrunExists(self): - req_params = {'size': 'descripsize', 'species': 'descripspp'} - pa = workflow.Parameters('RunExists', None, req_params, {}) - self.assertTrue(len(pa.params) == 1) - self.assertTrue(set(req_params.keys()).issubset(\ - set(pa.params['ParamfileTestCase'].keys()))) - self.assertTrue(pa.interactive == False) - - run = pa.params['ParamfileTestCase'] - self.assertTrue(run['size']*run['layers'][1] == 3*4.4) - - def test_MultipleNIRunsExist(self): - pa = workflow.Parameters('ManyNIRuns', None, {}, {}) - self.assertEqual(len(pa.params), 2) - self.assertEqual(pa.params['FirstCase']['size'], 4.4) - self.assertEqual(pa.params['FirstCase']['species'], 'E. 
coli') - self.assertEqual(pa.params['FirstCase']['layers'], [0,3,12]) - self.assertEqual(pa.params['SecondCase']['size'], 2.2) - self.assertEqual(pa.params['SecondCase']['species'], 'H. sapiens') - self.assertEqual(pa.params['SecondCase']['layers'], [5]) - - def test_UnnamedRunErrors(self): - pa = workflow.Parameters('Unnamed', None, {}, {}) - self.assertEqual(len(pa.params), 2) - self.assertEqual(pa.params['run1']['size'], 4.4) - self.assertEqual(pa.params['run1']['species'], 'E. coli') - self.assertEqual(pa.params['run1']['layers'], [0,3,12]) - self.assertEqual(pa.params['run2']['size'], 2.2) - self.assertEqual(pa.params['run2']['species'], 'H. sapiens') - self.assertEqual(pa.params['run2']['layers'], [5]) - - def test_InteractiveRun(self): - pa = workflow.Parameters('Interactive', None, {}, {}) - self.assertTrue(pa.interactive == True) - diff --git a/utils/workflow.py b/utils/workflow.py deleted file mode 100644 index 77fa462..0000000 --- a/utils/workflow.py +++ /dev/null @@ -1,547 +0,0 @@ -#!/usr/bin/python - -''' -Manages the details of a reproducible workflow within macroeco. Main Workflow -class is called with one argument, required_params, and the surrounding script -must be called with a single sys.argv with the output directory. - -Classes -------- -- `Workflow` -- tracks the analysis, data requested, and parameters; maps sites -- `Parameters` -- finds/asks for and stores run names and parameters -''' - -import xml.etree.ElementTree as etree -import sys, os, logging -import matplotlib.pyplot as plt -from macroeco.data import Metadata - - -paramfile = 'parameters.xml' # Parameter file found in output dir -logfile = 'logfile.txt' # Logfile to save in output dir - - -class Workflow: - ''' - Manages the details of a reproducible workflow with macroeco scripts. - - Arguments - --------- - required_params : dictionary - Parameters needed for analysis, in form of - 'parameter_name':'short_description'. All of these parameters must be - present in params file in output directory, or analysis will not run. - This argument is empty only when no data or parameters are required for - a script to run. - clog : bool - Whether to log to console in addition to file, False by default - short_output_name : bool - Whether to use the run-name alone to name output. False by default. 
- - Attributes - ---------- - script_name : string - Name of script originating the workflow - output_path : string - Path to output directory - interactive : bool - Whether the script can pause for user interaction - runs : dict - If parameters are needed, sets of parameter values are named runs - ''' - - def __init__(self, required_params={}, optional_params={}, - clog=False, svers=None, short_output_name=False): - - # Store script name from command line call - script_path, script_extension = os.path.splitext(sys.argv[0]) - self.script_name = os.path.split(script_path)[-1] - self.script_vers = svers - self.short_output_name = short_output_name - - # Store output directory path - contains params file, log, results - # TODO: Make more robust to non-absolute path entries - output_path = os.getcwd() - self.output_path = output_path - - # Prepare logger - logging.basicConfig(filename=logfile, # Add file logging - level=logging.INFO, format=('%(asctime)s | ' - '%(levelname)s | %(filename)s:%(lineno)d | ' - '%(message)s'), datefmt='%H:%M:%S') - - if clog: # Add console logging - console = logging.StreamHandler() - console.setLevel(logging.INFO) - format = logging.Formatter('%(levelname)-8s %(message)s') - console.setFormatter(format) - logging.getLogger('').addHandler(console) - - def excepthook(*args): # Catch errors to log - logging.getLogger().error('Analysis Stopped', exc_info=args) - else: - def excepthook(*args): # Catch errors to log + stderr - logging.getLogger().error('Analysis Stopped', exc_info=args) - sys.__excepthook__(*args) # Show err in console if clog False - - sys.excepthook = excepthook # Define error handler as above - - logging.captureWarnings(True) # Catch warnings - - logging.debug('Creating workflow object') - - # Get parameters from file, including data paths - assert type(required_params) == type({}), ('Required params must be a' - ' dict.') - self.parameters = Parameters(self.script_name, self.script_vers, - required_params, optional_params) - self.interactive = self.parameters.interactive - - - def single_datasets(self): - ''' - Generator that yields data files and descriptive parameters. - - Special parameter 'data_path' is a list of locations of data files to - use for analysis - if present, map of sites will be generated for each - run. - - Yields - ------ - data_path : string - Path to data to analyze, relative to current working directory - output_ID : string - Concatenates script, run, and dataset identifiers - run_params : dict - Dictionary of parameters for each script_name and run - ''' - - def clean_name(fp): # Extract file name from path - return os.path.splitext(os.path.split(fp)[-1])[0] - - # Run script on all runs (parameter sets), and data sets - for run_name in self.parameters.params.keys(): - # TODO: Check for output_ID conflicts (must be unique) - - # Check if data_paths in params. If not, add one empty data_path - # for the loop below. If so, make a map. 
- if len(self.parameters.data_path) == 0: - logging.debug(('No data paths given for run %s, no map of ' - 'sites created') % run_name) - self.parameters.data_path[run_name] = [''] - else: - make_map(self.parameters.data_path[run_name], run_name) - - # Loop through each dataset and yield values for dataset and run - for data_path in self.parameters.data_path[run_name]: - abs_data_path = os.path.abspath(os.path.join(self.output_path, - data_path)) - if self.short_output_name: - print('Using short output name in single_datasets:') - output_ID = run_name - print(output_ID) - else: - output_ID = '_'.join([self.script_name, - run_name, clean_name(data_path)]) - logging.info('Beginning %s' % output_ID) - yield (abs_data_path, output_ID, - self.parameters.params[run_name]) - - def all_datasets(self): - ''' Generator that yields a list of data files and descriptive - parameters for each run. - - Yields - ------ - data_path : list - A list of paths to data to analyze - output_ID : list - A list of IDs that concatenate script, run, and dataset. - run_params : dict - Dictionary of parameters for each script_name and run - - ''' - - def clean_name(fp): # Extract file name from path - return os.path.splitext(os.path.split(fp)[-1])[0] - - # Run script on all runs (parameter sets), and data sets - for run_name in self.parameters.params.keys(): - # TODO: Check for output_ID conflicts (must be unique) - - # Check if data_paths in params. If not, add one empty data_path - # for the loop below. If so, make a map. - if len(self.parameters.data_path) == 0: - logging.debug(('No data paths given for run %s, no map of ' - 'sites created') % run_name) - self.parameters.data_path[run_name] = [''] - else: - make_map(self.parameters.data_path[run_name], run_name) - - abs_data_paths = [os.path.abspath(os.path.join(self.output_path, - data_path)) for data_path in self.parameters. - data_path[run_name]] - if self.short_output_name: - print('Using short output name in all_datasets:') - output_IDs = ['_'.join([run_name, clean_name(data_path)]) for - data_path in self.parameters.data_path[run_name]] - print(output_IDs) - else: - output_IDs = ['_'.join([self.script_name, run_name, - clean_name(data_path)]) for data_path in - self.parameters.data_path[run_name]] - logging.info('Beginning %s script' % self.script_name) - yield (abs_data_paths, output_IDs, - self.parameters.params[run_name], run_name, - self.script_name) - - - - -class Parameters: - ''' - Load parameters from parameter file in current working directory - and make available as self.params. - Checks that all required_params are present and loaded. - - Arguments - --------- - script_name : string - Name of script originating the workflow - required_params : dictionary - Parameters needed for analysis, in form of - 'parameter_name':'short_description'. All of these parameters must be - present in params file in output directory for this script_name and - run, or analysis will not run. This argument is empty only when no data - or parameters are required for a script to run. - - Attributes - ---------- - script_name : string - Name of script originating the workflow - script_vers : string - Version of script originating the workflow - interactive : bool - Whether the script can pause for user interaction - params : dict - Dictionary of dictionaries, with each outer key a run name and each - outer value a dictionary of parameter names and values for each run. 
- data_path : dict - Dictonarity where keys are run names and values are lists of data paths - associated with each run. - - ''' - - def __init__(self, script_name, script_vers, required_params, - optional_params, output_path=False): - - # Store initial attributes - self.script_name = script_name - self.script_vers = script_vers - self.interactive = False - self.params = {} - self.data_path = {} - if not output_path: - output_path = os.getcwd() - - # Check that parameter file exists, if not use default values - try: - pf = open(paramfile, 'r') - pf.close() - except IOError: - logging.info(('No parameter file found at %s, proceeding without ' - 'parameters') % output_path) - self.params[''] = {} - self.data_path[''] = {} - self.interactive = False - return - - # Read parameter file - logging.info('Reading parameters from %s' % os.path.join(output_path, - paramfile)) - self.read_from_xml() - - # Check that all required parameters present in all runs - if not self.required_params_present(required_params): - raise IOError('Required parameters missing') - - # If optional params are missing, set to default - self.set_optional_params(optional_params, script_name) - - logging.info('Parameters: %s' % str(self.params)) - logging.info('Data: %s' % str(self.data_path)) - - # Evaluate param values into appropriate types - self.eval_params() - - - def read_from_xml(self): - ''' Read parameters from xml file into self.params dictionary. ''' - - # Define class for checking keys - class AllEntities: - def __getitem__(self, key): - return key - - # Declare parser object - # TODO: Without next line, works in iPython, console, not script ?? - parser = etree.XMLParser() - parser.parser.UseForeignDTD(True) - parser.entity = AllEntities() - - # Try to open paramfile from output_path - # TODO: Integration test - try: - pml = etree.parse(paramfile, parser=parser).getroot() - except etree.ParseError: - raise IOError('ParseError trying to read %s' % paramfile) - except: - raise - - # Create params dictionary - if len(pml) == 0: # Error if no analyses in param file - raise IOError('Parameter file %s contains no valid analyses' % - paramfile) - for analysis in pml: # Loop analyses looking for script_name - if analysis.get('script_name') == self.script_name: - - if 'version' in analysis.attrib: # Set version - vers = analysis.get('version') - if self.script_vers: # If got script_vers, check - if float(vers) != float(self.script_vers): - logging.warning(('Script version does not match ' - 'version in parameters. 
' - 'Continuing, but may fail.')) - - if 'interactive' in analysis.attrib: # Set interactive - ia = analysis.get('interactive') - if ia in ['T', 'True', 't', 'true']: - self.interactive = True - else: - self.interactive = False - else: - self.interactive = False - - if len(analysis) == 0: # Error if no runs in analysis - raise IOError(('Analysis found for this script, but no ' - 'valid runs found')) - - run_counter = 1 - for run in analysis.getchildren(): # Loop runs - run_name = run.get('name') - if run_name is None: - run_name = 'run' + str(run_counter) - run_counter += 1 - self.params[run_name] = {} - self.data_path[run_name] = [] - for elt in run.getchildren(): # Loop params in run - if elt.tag == 'param': - param = elt.get('name') - value = elt.get('value') - self.params[run_name][param] = value - if elt.tag == 'data': - data_type = elt.get('type') - data_location = elt.get('location') - if data_location == 'system': - # User responsible for sys paths, security, etc - prepend = '' - elif data_location == 'archival': - prepend = os.path.join('..','..', - 'archival') - else: - prepend = os.path.join('..','..','data', - 'formatted') - if data_type == '' or data_type == None: - logging.warning(('No data type specified,' - ' assuming .csv')) - data_type = 'csv' - # Remove any data extension - if data_type == 'csv': - - data_path = convert_filename(elt, prepend, - 'csv') - self.data_path[run_name].append(data_path) - - elif data_type == 'txt': - data_path = convert_filename(elt, prepend, - 'txt') - self.data_path[run_name].append(data_path) - - elif data_type == 'sql': - data_path = convert_filename(elt, prepend, - 'sql') - self.data_path[run_name].append(data_path) - elif data_type == 'db': - data_path = convert_filename(elt, prepend, - 'db') - self.data_path[run_name].append(data_path) - else: - logging.error('Data type {!s} not yet handled; ' - 'not using this data.'.format( - data_type)) - - def required_params_present(self, req_params): - ''' Check if any required parameters missing from any runs. ''' - - status = 1 - for run_name in self.params.keys(): - run_params = self.params[run_name] - if not set(req_params.keys()).issubset(set(run_params.keys())): - logging.error('In run {!s}, missing parameters {!s}'.format( - run_name, set(req_params.keys()).difference(set(run_params.keys())))) - status = 0 - return status - - def set_optional_params(self, opt_params, script_name): - ''' Set optional params with default values if params are missing''' - for run_name in self.params.keys(): - run_params = self.params[run_name] - for optpar in opt_params: - if not optpar in run_params: - logging.info("Default value for {!s} in {!s}: {!s}".format(optpar, - script_name, str(opt_params[optpar][1]))) - run_params[optpar] = opt_params[optpar][1] - - def eval_params(self): - ''' - Attempts to evaluate parameters to appropriate types. - - If eval() fails, parameter will stay a string, possibly leading to - cryptic errors later if there is a typo in a param value. - ''' - - for run_name in self.params.keys(): - for param_name in self.params[run_name].keys(): - try: - value = eval(self.params[run_name][param_name]) - self.params[run_name][param_name] = value - value_type = str(type(value)).split("'")[1] - logging.debug('In run %s, parameter %s evaluated to %s' % - (run_name, param_name, value_type)) - except: - logging.debug('In run %s, parameter %s left as string' % - (run_name, param_name)) - - -def make_map(data_paths, run_name, whole_globe=False): - ''' - Makes a map of all sites in run. 
- - Parameter - --------- - data_paths : list - Paths to data files (csv's). Data location will be extracted from - corresponding xml metadata file. - run_name : str - Name of run, used as name of map file. - whole_globe : bool - If True, map is entire globe. If False, map is "zoomed in" on data - locations. - - Returns - ------- - map_created : bool - True if file was created, False if a file already existed and none was - created. - - Notes - ----- - Map will be given the name of a run. If multiple runs have the same name, - only the map associated with the first run of that name will be saved. - - The label for each site will be the data file base name - (e.g., LBRI_2000.csv and LBRI.csv will be LBRI_2000 and LBRI respectively). - ''' - - # Check if Basemap present - if not, log and return - try: - from mpl_toolkits.basemap import Basemap - except: - logging.debug('Basemap module is not available, no map of data ' + - 'locations can be created') - return False - - # Set map_name - map_name = 'map_' + run_name + '.png' - - # TODO: Check if run_name is unique - # Check if map with this run_name already exists - if os.path.isfile(map_name): - logging.debug('Map with this run name already exists. New map ' + - 'overwriting old one.') - - # Get lat, long, and name of each data set - lats = [] - lons = [] - names = [] - - for path in data_paths: - temp = list(os.path.split(path)) - temp[1] = temp[1].split('.')[0] + '.xml' - x = os.path.join(temp[0], temp[1]) - - try: - meta = Metadata(x, {}) - bounds = meta.get_physical_coverage() - lats.append(bounds[0]) - lons.append(bounds[1]) - - fname, fext = os.path.splitext(os.path.split(path)[-1]) - names.append(fname) # First 4 letters of data set name - except: - logging.info('No location data found in %s, no map point ' - 'added.' % x) - - # If no valid location data, return without making a map - if len(names) == 0: - return False - - # Set up map - logging.debug('Creating map for run %s' % run_name) - if whole_globe: - m = Basemap(projection='cyl', resolution='i') - else: - # 10 degree buffer around min and max lat/long - m = Basemap(projection='cyl', lat_0=50, lon_0=-100, - llcrnrlon=min(lons)-10, llcrnrlat=min(lats)-10, - urcrnrlon=max(lons)+10, urcrnrlat=max(lats)+10, - resolution='l') - - # Draw features - m.bluemarble() - m.drawcoastlines() - m.drawcountries() - m.drawmapboundary() - - # Add sites - x, y = m(lons, lats) - m.plot(x, y, 'yo') - for n, xpt, ypt in zip(names,x,y): - if n == 'BCIS': ypt += 1 # Manual Cleanup for crowded areas - if n == 'SHER': ypt += 2 - plt.text(xpt+.5,ypt+.5,n,color='yellow') - - plt.savefig(map_name) - plt.close() - return True - - -def convert_filename(elt, prepend, ext): - '''Parses xml tree to return filename - - Parameters - ---------- - elt : Etree - XML tree object - prepend : str - String to be appended - ext : str - File type, i.e. csv, txt, sql, db - ''' - - file_name = elt.find('file').text.split('.')[0] - - directory = elt.find('directory').text - data_file = os.path.extsep.join((file_name, ext)) - data_path = os.path.join(prepend, directory, data_file) - return data_path
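
Note on the removed workflow module: the deleted utils/workflow.py above drives analyses from a parameters.xml file in which each analysis element (matched by script_name) holds one or more run elements, and each run's param name/value attributes are read into a per-run dictionary and then evaluated into Python types (see Parameters.read_from_xml and test_workflow.py in the hunks above). Below is a minimal, self-contained sketch of that parsing step for reference only; the XML snippet, the 'example_script' name, and the use of ast.literal_eval in place of the original eval() are illustrative assumptions, not code from the removed module.

# Sketch (assumption-laden, not part of the diff): parse an
# <analysis>/<run>/<param> layout like the one the removed
# Parameters.read_from_xml consumed, returning {run_name: {param: value}}.
import xml.etree.ElementTree as ET
from ast import literal_eval

PARAMS_XML = """
<macroeco>
  <analysis script_name="example_script" interactive="F">
    <run name="FirstCase">
      <param name="size" value="4.4"/>
      <param name="species" value="'E. coli'"/>
      <param name="layers" value="[0, 3, 12]"/>
    </run>
    <run name="SecondCase">
      <param name="size" value="2.2"/>
      <param name="species" value="'H. sapiens'"/>
      <param name="layers" value="[5]"/>
    </run>
  </analysis>
</macroeco>
"""

def read_params(xml_text, script_name):
    """Return a dict of per-run parameter dicts for the matching analysis."""
    root = ET.fromstring(xml_text)
    params = {}
    for analysis in root:
        if analysis.get('script_name') != script_name:
            continue
        for i, run in enumerate(analysis, start=1):
            # Unnamed runs fall back to run1, run2, ... as in the tests above.
            run_name = run.get('name') or 'run%d' % i
            params[run_name] = {}
            for elt in run:
                if elt.tag == 'param':
                    value = elt.get('value')
                    try:
                        # The removed code used eval(); literal_eval is a
                        # safer stand-in that handles the same literal values.
                        value = literal_eval(value)
                    except (ValueError, SyntaxError):
                        pass  # leave as string, as the original did
                    params[run_name][elt.get('name')] = value
    return params

if __name__ == '__main__':
    runs = read_params(PARAMS_XML, 'example_script')
    assert runs['FirstCase']['layers'] == [0, 3, 12]
    assert runs['SecondCase']['species'] == 'H. sapiens'
    print(runs)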