diff --git a/pyleoclim/core/psds.py b/pyleoclim/core/psds.py index e7bcafaf..c572bedd 100644 --- a/pyleoclim/core/psds.py +++ b/pyleoclim/core/psds.py @@ -179,7 +179,7 @@ def signif_test(self, method='ar1sim', number=None, seed=None, qs=[0.95], Number of surrogate series to generate for significance testing. The default is None. - method : str; {'ar1asym','ar1sim'} + method : str; {'ar1asym','ar1sim','uar1'} Method to generate surrogates. AR1sim uses simulated timeseries with similar persistence. AR1asymp represents the closed form solution. The default is AR1sim @@ -280,10 +280,10 @@ def signif_test(self, method='ar1sim', number=None, seed=None, qs=[0.95], if self.spec_method == 'lomb_scargle' and method == 'ar1asym': raise ValueError('Asymptotic solution is not supported for the Lomb-Scargle method') - if method not in ['ar1sim', 'ar1asym']: + if method not in ['ar1sim', 'uar1','ar1asym']: raise ValueError("The available methods are 'ar1sim' and 'ar1asym'") - if method in ['ar1sim', 'ar1old']: # fix later in Series.surrogates() + if method in ['ar1sim', 'uar1']: signif_scals = None if scalogram: try: @@ -760,7 +760,8 @@ def plot(self, in_loglog=True, in_period=True, label=None, xlabel=None, ylabel=' # plot significance levels if self.signif_qs is not None: signif_method_label = { - 'ar1sim': 'AR(1) simulations', + 'ar1sim': 'AR(1) simulations (MoM)', + 'uar1': 'AR(1) simulations (MLE)', 'ar1asym': 'AR(1) asymptotic solution' } nqs = np.size(self.signif_qs.psd_list) diff --git a/pyleoclim/core/scalograms.py b/pyleoclim/core/scalograms.py index 08787d71..0661ad34 100644 --- a/pyleoclim/core/scalograms.py +++ b/pyleoclim/core/scalograms.py @@ -413,7 +413,7 @@ def signif_test(self, method='ar1sim', number=None, seed=None, qs=[0.95], Parameters ---------- - method : {'ar1asym', 'ar1sim'} + method : {'ar1asym', 'ar1sim','uar1'} Method to use to generate the surrogates. ar1sim uses simulated timeseries with similar persistence. ar1asym represents the theoretical, closed-form solution. The default is ar1sim @@ -485,10 +485,11 @@ def signif_test(self, method='ar1sim', number=None, seed=None, qs=[0.95], if self.wave_method == 'wwz' and method == 'ar1asym': raise ValueError('Asymptotic solution is not supported for the wwz method') - if method not in ['ar1sim', 'ar1asym']: - raise ValueError("The available methods are ar1sim'and 'ar1asym'") + acceptable_methods = ['ar1sim', 'ar1asym', 'uar1'] + if method not in acceptable_methods: + raise ValueError(f"The available methods are: {acceptable_methods}") - if method == 'ar1sim': + if method in ['ar1sim','uar1'] : if hasattr(self,'signif_scals'): signif_scals = self.signif_scals diff --git a/pyleoclim/core/series.py b/pyleoclim/core/series.py index aef54763..3ddd3953 100644 --- a/pyleoclim/core/series.py +++ b/pyleoclim/core/series.py @@ -105,14 +105,14 @@ class Series: source of the dataset. If it came from a LiPD file, this could be the datasetID property archiveType : string - climate archive, one of 'Borehole', 'Coral', 'FluvialSediment', 'GlacierIce', 'GroundIce', 'LakeSediment', 'MarineSediment', 'Midden', 'MolluskShell', 'Peat', 'Sclerosponge', 'Shoreline', 'Speleothem', 'TerrestrialSediment', 'Wood' + climate archive, one of 'Borehole', 'Coral', 'FluvialSediment', 'GlacierIce', 'GroundIce', 'LakeSediment', 'MarineSediment', 'Midden', 'MolluskShell', 'Peat', 'Sclerosponge', 'Shoreline', 'Speleothem', 'TerrestrialSediment', 'Wood' Reference: https://lipdverse.org/vocabulary/archivetype/ - + control_archiveType : [True, False] - Whether to standardize the name of the archiveType agains the vocabulary from: https://lipdverse.org/vocabulary/paleodata_proxy/. - If set to True, will only allow for these terms and automatically convert known synonyms to the standardized name. Only standardized variable names will be automatically assigned a color scheme. - Default is False. - + Whether to standardize the name of the archiveType agains the vocabulary from: https://lipdverse.org/vocabulary/paleodata_proxy/. + If set to True, will only allow for these terms and automatically convert known synonyms to the standardized name. Only standardized variable names will be automatically assigned a color scheme. + Default is False. + dropna : bool Whether to drop NaNs from the series to prevent downstream functions from choking on them defaults to True @@ -128,10 +128,10 @@ class Series: set to True to remove the NaNs and make time axis strictly prograde with duplicated timestamps reduced by averaging the values Default is None (marked for deprecation) - auto_time_params : bool, - If True, uses tsbase.disambiguate_time_metadata to ensure that time_name and time_unit are usable by Pyleoclim. This may override the provided metadata. + auto_time_params : bool, + If True, uses tsbase.disambiguate_time_metadata to ensure that time_name and time_unit are usable by Pyleoclim. This may override the provided metadata. If False, the provided time_name and time_unit are used. This may break some functionalities (e.g. common_time and convert_time_unit), so use at your own risk. - If not provided, code will set to True for internal consistency. + If not provided, code will set to True for internal consistency. Examples -------- @@ -265,16 +265,16 @@ def __init__(self, time, value, time_unit=None, time_name=None, self.control_archiveType = control_archiveType if archiveType is not None: #Deal with archiveType - + #Get the possible list of archiveTyp - - + + if control_archiveType == True: - + res = lipdutils.get_archive_type() std_var = list(res.keys()) std_var_lower = [key.lower() for key in res.keys()] - + data = [] for key, values in res.items(): if values: # Check if the list is not empty @@ -288,10 +288,10 @@ def __init__(self, time, value, time_unit=None, time_name=None, synonym = df[df['Value'].isin([archiveType.lower().replace(" ", "")])]['Key'] if synonym.empty == True: synonym = None - else: + else: synonym = synonym.to_string(header=False, index=False) - - + + if archiveType.lower().replace(" ", "") in std_var_lower: index = std_var_lower.index(archiveType.lower().replace(" ", "")) self.archiveType = std_var[index] @@ -561,7 +561,7 @@ def from_json(cls, path): b = jsonutils.iterate_through_dict(a, 'Series') return cls(**b) - + def pandas_method(self, method): ser = self.to_pandas() result = method(ser) @@ -3239,19 +3239,19 @@ def wavelet_coherence(self, target_series, method='cwt', settings=None, As with wavelet analysis, both CWT and WWZ admit optional arguments through `settings`. For instance, one can adjust the resolution of the time axis on which coherence is evaluated: - + .. jupyter-execute:: coh_wwz = ts_air.wavelet_coherence(ts_nino, method = 'wwz', settings = {'ntau':20}) coh_wwz.plot() - + The frequency (scale) axis can also be customized, e.g. to focus on scales from 1 to 20y, with 24 scales: - + .. jupyter-execute:: - + coh = ts_air.wavelet_coherence(ts_nino, freq_kwargs={'fmin':1/20,'fmax':1,'nf':24}) coh.plot() - + Significance is assessed similarly to PSD or Scalogram objects: .. jupyter-execute:: @@ -3270,8 +3270,8 @@ def wavelet_coherence(self, target_series, method='cwt', settings=None, cwt_sig.dashboard() Note: this design balances many considerations, and is not easily customizable. - - + + ''' if not verbose: warnings.simplefilter('ignore') @@ -3290,7 +3290,7 @@ def wavelet_coherence(self, target_series, method='cwt', settings=None, args = {} args['wwz'] = {'freq': freq, 'verbose': verbose} args['cwt'] = {'freq': freq} - + # put on same time axes if necessary if method == 'cwt' and not np.array_equal(self.time, target_series.time): @@ -3326,7 +3326,7 @@ def wavelet_coherence(self, target_series, method='cwt', settings=None, tau = np.linspace(lb, ub, ntau) settings.update({'tau': tau}) - + args[method].update(settings) # Apply WTC method @@ -3358,8 +3358,11 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = The significance of the correlation is assessed using one of the following methods: 1) 'ttest': T-test adjusted for effective sample size. - 2) 'isopersistent': AR(1) modeling of x and y. - 3) 'isospectral': phase randomization of original inputs. (default) + 2) 'ar1sim': AR(1) modeling of x and y. + 3) 'phaseran': phase randomization of original inputs. (default) + 4) 'built-in': uses built-in method + + Note: Up to version v0.14.0. ar1sim was called "isopersistent", phaseran was called "isospectral" The T-test is a parametric test, hence computationally cheap, but can only be performed in ideal circumstances. The others are non-parametric, but their computational requirements scale with the number of simulations. @@ -3371,27 +3374,25 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = target_series : Series A pyleoclim Series object - + alpha : float The significance level (default: 0.05) - - statistic : str + + statistic : str statistic being evaluated. Can use any of the SciPy-supported ones: https://docs.scipy.org/doc/scipy/reference/stats.html#association-correlation-tests Currently supported: ['pearsonr','spearmanr','pointbiserialr','kendalltau','weightedtau'] - Default: 'pearsonr'. - + Default: 'pearsonr'. + method : str, {'ttest','built-in','ar1sim','phaseran'} method for significance testing. Default is 'phaseran' 'ttest' implements the T-test with degrees of freedom adjusted for autocorrelation, as done in [1] - 'built-in' uses the p-value that ships with the statistic. - The old options 'isopersistent' and 'isospectral' still work, but trigger a deprecation warning. - Note that 'weightedtau' does not have a known distribution, so the 'built-in' method returns an error in that case. - + 'built-in' uses the p-value that ships with the SciPy function. + The old options 'isopersistent' and 'isospectral' still work, but trigger a deprecation warning. + Note that 'weightedtau' does not have a known distribution, so the 'built-in' method returns an error in that case. + timespan : tuple The time interval over which to perform the calculation - - settings : dict Parameters for the correlation function, including: @@ -3408,7 +3409,7 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = seed : float or int random seed for isopersistent and isospectral methods - + mute_pbar : bool, optional Mute the progress bar. The default is False. @@ -3432,13 +3433,13 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = -------- pyleoclim.utils.correlation.corr_sig : Correlation function (marked for deprecation) - + pyleoclim.utils.correlation.association : SciPy measures of association between variables - + pyleoclim.series.surrogates : parametric and non-parametric surrogates of any Series object - + pyleoclim.multipleseries.common_time : Aligning time axes - + References ---------- [1] Hu, J., J. Emile-Geay, and J. Partin (2017), Correlation-based interpretations of paleoclimate data – where statistics meet past climates, Earth and Planetary Science Letters, 459, 362–371, doi:10.1016/j.epsl.2016.11.048. @@ -3458,7 +3459,7 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = # set an arbitrary random seed to fix the result corr_res = ts_nino.correlation(ts_air, settings={'nsim': 20}, seed=2333) print(corr_res) - + # changing the statistic corr_res = ts_nino.correlation(ts_air, statistic='kendalltau') print(corr_res) @@ -3482,7 +3483,7 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = warnings.warn("isopersistent is deprecated and was replaced by 'ar1sim'", DeprecationWarning, stacklevel=2) method = 'ar1sim' - + settings = {} if settings is None else settings.copy() corr_args = {'alpha': alpha, 'method': method} corr_args.update(settings) @@ -3514,19 +3515,19 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = res = corrutils.association(ts0.value,ts1.value,statistic) stat = res[0] pval = res.pvalue if len(res) > 1 else np.nan - signif = pval <= alpha - elif method in supported_surrogates: + signif = pval <= alpha + elif method in supported_surrogates: number = corr_args['nsim'] if 'nsim' in corr_args.keys() else 1000 seed = corr_args['seed'] if 'seed' in corr_args.keys() else None - method = corr_args['method'] if 'method' in corr_args.keys() else None + #method = corr_args['method'] if 'method' in corr_args.keys() else None surr_settings = corr_args['surr_settings'] if 'surr_settings' in corr_args.keys() else None - + # compute correlation statistic stat = corrutils.association(ts0.value,ts1.value,statistic)[0] - - ts0_surr = ts0.surrogates(number=number, seed=seed, + + ts0_surr = ts0.surrogates(number=number, seed=seed, method=method, settings=surr_settings) - ts1_surr = ts1.surrogates(number=number, seed=seed, + ts1_surr = ts1.surrogates(number=number, seed=seed, method=method, settings=surr_settings) stat_surr = np.empty((number)) for i in tqdm(range(number), desc='Evaluating association on surrogate pairs', total=number, disable=mute_pbar): @@ -3535,7 +3536,7 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = # obtain p-value pval = np.sum(np.abs(stat_surr) >= np.abs(stat)) / number # establish significance - signif = pval <= alpha + signif = pval <= alpha else: raise ValueError(f'Unknown method: {method}. Look up documentation for a wiser choice.') @@ -3641,23 +3642,30 @@ def causality(self, target_series, method='liang', timespan=None, settings=None, causal_res = spec_func[method](value1, value2, **args[method]) return causal_res - def surrogates(self, method='ar1sim', number=1, length=None, seed=None, settings=None): + def surrogates(self, method='ar1sim', number=1, time_pattern='match', + length=None, seed=None, settings=None): ''' Generate surrogates of the Series object according to "method" - + For now, assumes uniform spacing and increasing time axis Parameters ---------- - method : {ar1sim, phaseran} + method : {ar1sim, phaseran, uar1} The method used to generate surrogates of the timeseries - - Note that phaseran assumes an odd number of samples. If the series + + Note that phaseran assumes an odd number of samples. If the series has even length, the last point is dropped to satisfy this requirement number : int The number of surrogates to generate + time_pattern : str {match, even, random} + The pattern used to generate the surrogate time axes + 'match' uses the same pattern as the original Series + 'even' uses an evenly-spaced time with spacing delta_t specified in settings (will return error if not specified) + 'random' uses random_time_index() with specified distribution and parameters (default: 'exponential' with parameter 1) + length : int Length of the series @@ -3675,52 +3683,92 @@ def surrogates(self, method='ar1sim', number=1, length=None, seed=None, settings -------- pyleoclim.utils.tsmodel.ar1_sim : AR(1) simulator - pyleoclim.utils.tsutils.phaseran : phase randomization + pyleoclim.utils.tsmodel.uar1_sim : maximum likelihood AR(1) simulator + pyleoclim.utils.tsutils.phaseran2 : phase randomization + pyleoclim.utils.tsutils.random_time_index : random time index vector according to a specific probability model ''' settings = {} if settings is None else settings.copy() - args = {} - time = self.time - args['ar1sim'] = {'t': time} - args['phaseran'] = {} - args[method].update(settings) if seed is not None: np.random.seed(seed) + if length is None: + n = len(self.value) + else: + n = length + + # generate time axes according to provided pattern + if time_pattern == "match": + times = np.tile(self.time, (number, 1)).T + elif time_pattern == "even": + if "time_increment" not in settings: + warnings.warn("'time_increment' not found in the dictionary, default set to 1.",stacklevel=2) + time_increment = 1 + else: + time_increment = settings["time_increment"] + + t = np.cumsum([time_increment]*n) + times = np.tile(t, (number, 1)).T + elif time_pattern == "random": + times = np.zeros((n, number)) + for i in range(number): + times[:, i] = tsmodel.random_time_index(n = n, **settings) # TODO: check that this does not break when unexpected keywords are passed in `settings` + else: + raise ValueError(f"Unknown time pattern: {time_pattern}") + + # apply surrogate method if method == 'ar1sim': - surr_res = tsmodel.ar1_sim(self.value, number, **args[method]) + if time_pattern != 'match': + raise ValueError('Only a matching time pattern is supported with this method') + else: + y_surr = tsmodel.ar1_sim(self.value, number, self.time) # CHECK: how does this handle the new time? + elif method == 'phaseran': - if self.is_evenly_spaced(): - surr_res = tsutils.phaseran2(self.value, number, **args[method]) + if self.is_evenly_spaced() and time_pattern != "random": + y_surr = tsutils.phaseran2(self.value, number) else: raise ValueError("Phase-randomization presently requires evenly-spaced series.") - # elif method == 'uar1': - # # TODO : implement Lionel's ML method + elif method == 'uar1': + # estimate theta with MLE + theta_hat = tsmodel.uar1_fit(self.value, self.time) + # generate surrogates + y_surr = np.empty_like(times) + for j in range(number): + y_surr[:,j] = tsmodel.uar1_sim(t = times[:,j], + tau_0=theta_hat[0],sigma_2_0=theta_hat[1]) + # elif method == 'power-law': # # TODO : implement Stochastic # elif method == 'fBm': # # TODO : implement Stochastic - - if len(np.shape(surr_res)) == 1: - surr_res = surr_res[:, np.newaxis] + + # THIS SHOULD BE UNNECESSARY + # # reshape + # if len(np.shape(y_surr)) == 1: + # y_surr = y_surr[:, np.newaxis] + + # if len(np.shape(times)) == 1: + # times = times[:, np.newaxis] + + # wrap it all up with a bow s_list = [] - for i, s in enumerate(surr_res.T): - s_tmp = Series(time=time, value=s, # will need reformation after uar1 pull - time_name=self.time_name, - time_unit=self.time_unit, - value_name=self.value_name, - value_unit=self.value_unit, + for i, (t, y) in enumerate(zip(times.T,y_surr.T)): + ts = Series(time=t, value=y, + time_name=self.time_name, + time_unit=self.time_unit, + value_name=self.value_name, + value_unit=self.value_unit, label = str(self.label or '') + " surr #" + str(i+1), verbose=False, auto_time_params=True) - s_list.append(s_tmp) + s_list.append(ts) - surr = SurrogateSeries(series_list=s_list, + surr = SurrogateSeries(series_list=s_list, label = self.label, - surrogate_method=method, - surrogate_args=args[method]) + surrogate_method=method, + surrogate_args=settings) return surr @@ -4173,7 +4221,7 @@ def resolution(self): See Also -------- - + pyleoclim.core.resolutions.Resolution Examples diff --git a/pyleoclim/core/surrogateseries.py b/pyleoclim/core/surrogateseries.py index 20102ecb..1fe4c32c 100644 --- a/pyleoclim/core/surrogateseries.py +++ b/pyleoclim/core/surrogateseries.py @@ -6,7 +6,7 @@ from ..core.multipleseries import MultipleSeries -supported_surrogates = frozenset(['ar1sim','phaseran']) # broadcast all supported surrogates as global variable, for exception handling +supported_surrogates = frozenset(['ar1sim','phaseran', 'uar1']) # broadcast all supported surrogates as global variable, for exception handling class SurrogateSeries(MultipleSeries): ''' Object containing surrogate timeseries, usually obtained through recursive modeling (e.g., AR(1)) @@ -24,8 +24,10 @@ def __init__(self, series_list, label, surrogate_method=None, surrogate_args=Non self.label = str(label or "series") + " surrogates [AR(1)]" elif surrogate_method == 'phaseran': self.label = str(label or "series") + " surrogates [phase-randomized]" + elif surrogate_method == 'uar1': + self.label = str(label or "series") + " surrogates [uar1]" else: - raise ValueError('Surrogate method should either be "ar1sim" or "phaseran"') + raise ValueError('Surrogate method should either be "ar1sim", "phaseran" or "uar1"') diff --git a/pyleoclim/tests/test_core_PSD.py b/pyleoclim/tests/test_core_PSD.py index eab313c4..8cc4cd64 100644 --- a/pyleoclim/tests/test_core_PSD.py +++ b/pyleoclim/tests/test_core_PSD.py @@ -12,7 +12,6 @@ 4. after `pip install pytest-xdist`, one may execute "pytest -n 4" to test in parallel with number of workers specified by `-n` 5. for more details, see https://docs.pytest.org/en/stable/usage.html ''' -import numpy as np import pytest import pyleoclim as pyleo @@ -20,17 +19,7 @@ def gen_ts(model='colored_noise',alpha=1, nt=100, f0=None, m=None, seed=None): 'wrapper for gen_ts in pyleoclim' t,v = pyleo.utils.gen_ts(model=model,alpha=alpha, nt=nt, f0=f0, m=m, seed=seed) - ts=pyleo.Series(t,v) - return ts - -# a collection of useful functions - -def gen_normal(loc=0, scale=1, nt=100): - ''' Generate random data with a Gaussian distribution - ''' - t = np.arange(nt) - v = np.random.normal(loc=loc, scale=scale, size=nt) - ts = pyleo.Series(t,v) + ts=pyleo.Series(t,v, auto_time_params=True,verbose=False) return ts @@ -50,12 +39,13 @@ def test_plot_t0(self): class TestUiPsdSignifTest: ''' Tests for PSD.signif_test() ''' - - def test_signif_test_t0(self): + @pytest.mark.parametrize('method',['ar1sim','uar1','ar1asym']) + def test_signif_test_t0(self,method): ''' Test PSD.signif_test() with default parameters ''' ts = gen_ts(nt=500) psd = ts.spectral(method='mtm') - psd_signif = psd.signif_test(number=10) + psd_signif = psd.signif_test(number=10,method=method) fig, ax = psd_signif.plot() - pyleo.closefig(fig) \ No newline at end of file + pyleo.closefig(fig) + \ No newline at end of file diff --git a/pyleoclim/tests/test_core_Scalogram.py b/pyleoclim/tests/test_core_Scalogram.py index 041483ce..32eb3327 100644 --- a/pyleoclim/tests/test_core_Scalogram.py +++ b/pyleoclim/tests/test_core_Scalogram.py @@ -20,8 +20,7 @@ def gen_ts(model='colored_noise',alpha=1, nt=100, f0=None, m=None, seed=None): 'wrapper for gen_ts in pyleoclim' t,v = pyleo.utils.gen_ts(model=model,alpha=alpha, nt=nt, f0=f0, m=m, seed=seed) - ts=pyleo.Series(t,v) - + ts=pyleo.Series(t,v, auto_time_params=True,verbose=False) return ts # Tests below @@ -31,19 +30,20 @@ class TestUiScalogramSignifTest: @pytest.mark.parametrize('wave_method',['wwz','cwt']) def test_signif_test_t0(self, wave_method): - ''' Test scalogram.signif_test() with default parameters + ''' Test scalogram.signif_test() with differennt methods ''' - ts = gen_ts(model='colored_noise',nt=500) + ts = gen_ts(model='colored_noise') scal = ts.wavelet(method=wave_method) scal_signif = scal.signif_test(number=5, qs = [0.8, 0.9, .95]) fig,ax = scal_signif.plot(signif_thresh=0.99) pyleo.closefig(fig) - - @pytest.mark.parametrize('ar1_method',['ar1asym', 'ar1sim']) - def test_signif_test_t1(self,ar1_method): - ''' Test scalogram.signif_test() with default parameters + @pytest.mark.parametrize('method',['ar1sim','uar1','ar1asym']) + def test_signif_test_t1(self,method): + ''' Test scalogram.signif_test() with various surrogates ''' - ts = gen_ts(model='colored_noise',nt=500) + ts = gen_ts(model='colored_noise') scal = ts.wavelet(method='cwt') - scal_signif = scal.signif_test(method=ar1_method,number=1) + scal_signif = scal.signif_test(method=method,number=2) + fig,ax = scal_signif.plot(signif_thresh=0.99) + pyleo.closefig(fig) diff --git a/pyleoclim/tests/test_core_Series.py b/pyleoclim/tests/test_core_Series.py index 7b5a374c..a3c858b7 100644 --- a/pyleoclim/tests/test_core_Series.py +++ b/pyleoclim/tests/test_core_Series.py @@ -28,10 +28,12 @@ #from urllib.request import urlopen import pyleoclim as pyleo -from pyleoclim.utils.tsmodel import ar1_fit +import pyleoclim.utils.tsmodel as tsmodel import pyleoclim.utils.tsbase as tsbase from statsmodels.tsa.arima_process import arma_generate_sample +from scipy.stats import expon + # a collection of useful functions @@ -538,7 +540,7 @@ def test_invalid(self): class TestUISeriesSurrogates: ''' Test Series.surrogates() ''' - def test_surrogates_t0(self, eps=0.2): + def test_surrogates_t0(self, p=2, eps=0.2): ''' Generate AR(1) surrogates based on a AR(1) series with certain parameters, and then evaluate and assert the parameters of the surrogates are correct ''' @@ -549,10 +551,53 @@ def test_surrogates_t0(self, eps=0.2): ar1 = arma_generate_sample(ar, ma, nsample=n, scale=1) ts = pyleo.Series(time=np.arange(1000), value=ar1) - ts_surrs = ts.surrogates(number=1) + ts_surrs = ts.surrogates(number=p) for ts_surr in ts_surrs.series_list: - g_surr = ar1_fit(ts_surr.value) + g_surr = tsmodel.ar1_fit(ts_surr.value) assert np.abs(g_surr-g) < eps + + @pytest.mark.parametrize('number',[1,5]) + def test_surrogates_uar1_match(self, number): + ts = gen_ts(nt=550, alpha=1.0) + # generate surrogates + surr = ts.surrogates(method = 'uar1', number = number, time_pattern ="match") + for i in range(number): + assert(np.allclose(surr.series_list[i].time, ts.time)) + + def test_surrogates_uar1_even(self, p=5): + ts = gen_ts(nt=550, alpha=1.0) + time_incr = np.median(np.diff(ts.time)) + # generate surrogates + surr = ts.surrogates(method = 'uar1', number = p, time_pattern ="even", settings={"time_increment" :time_incr}) + for i in range(p): + assert(np.allclose(tsmodel.inverse_cumsum(surr.series_list[i].time),time_incr)) + + def test_surrogates_uar1_random(self, p=5, tol = 0.5): + tau = 2 + sigma_2 = 1 + n = 500 + # generate time index + t = np.arange(1,(n+1)) + # create time series + ys = tsmodel.uar1_sim(t, tau_0=tau, sigma_2_0=sigma_2) + ts = pyleo.Series(time = t, value=ys, auto_time_params=True,verbose=False) + # generate surrogates default is exponential with parameter value 1 + surr = ts.surrogates(method = 'uar1', number = p, time_pattern ="random") + #surr = ts.surrogates(method = 'uar1', number = p, time_pattern ="uneven",settings={"delta_t_dist" :"poisson","param":[1]} ) + + for i in range(p): + delta_t = tsmodel.inverse_cumsum(surr.series_list[i].time) + # Compute the empirical cumulative distribution function (CDF) of the generated data + empirical_cdf, bins = np.histogram(delta_t, bins=100, density=True) + empirical_cdf = np.cumsum(empirical_cdf) * np.diff(bins) + # Compute the theoretical CDF of the Exponential distribution + theoretical_cdf = expon.cdf(bins[1:], scale=1) + # Trim theoretical_cdf to match the size of empirical_cdf + theoretical_cdf = theoretical_cdf[:len(empirical_cdf)] + # Compute the L2 norm (Euclidean distance) between empirical and theoretical CDFs + l2_norm = np.linalg.norm(empirical_cdf - theoretical_cdf) + assert(l2_norm= 0) assert np.square(miss_ssa.RCseries.value - soi.value).mean() < 0.3 - - - # def test_ssa_t5(self): - # '''Test Series.ssa() on Allen&Smith dataset - # ''' - # df = pd.read_csv('https://raw.githubusercontent.com/LinkedEarth/Pyleoclim_util/Development/example_data/mratest.txt',delim_whitespace=True,names=['Total','Signal','Noise']) - # mra = pyleo.Series(time=df.index, value=df['Total'], value_name='Allen&Smith test data', time_name='Time', time_unit='yr') - # mraSsa = mra.ssa(nMC=10) - # fig, ax = mraSsa.screeplot() - # pyleo.closefig(fig) - class TestUISeriesPlot: '''Test for Series.plot() diff --git a/pyleoclim/tests/test_utils_tsmodel.py b/pyleoclim/tests/test_utils_tsmodel.py index 5342ac9a..68eb64b1 100644 --- a/pyleoclim/tests/test_utils_tsmodel.py +++ b/pyleoclim/tests/test_utils_tsmodel.py @@ -6,64 +6,69 @@ @author: julieneg """ + import pytest import numpy as np from pyleoclim.utils import tsmodel +#from scipy.stats import poisson @pytest.mark.parametrize('model', ["exponential", "poisson"]) -def test_time_increments_0(model): +def test_time_index_0(model): ''' Generate time increments with 1-parameter models ''' - delta_t = tsmodel.time_increments(n=20, param=1, delta_t_dist = model) + delta_t = tsmodel.random_time_index(n=20, param=[1], delta_t_dist = model) assert all(np.cumsum(delta_t)>0) -def test_time_increments_1(): +def test_time_index_1(): ''' Generate time increments with Pareto ''' - delta_t = tsmodel.time_increments(n=20, param=[4.2,2.5], delta_t_dist = "pareto") + delta_t = tsmodel.random_time_index(n=20, param=[4.2,2.5], delta_t_dist = "pareto") assert all(np.cumsum(delta_t)>0) -def test_time_increments_2(): +def test_time_index_2(): ''' Generate time increments with random choice ''' - delta_t = tsmodel.time_increments(n=20, delta_t_dist = "random_choice", + delta_t = tsmodel.random_time_index(n=20, delta_t_dist = "random_choice", param=[[1,2],[.95,.05]] ) assert all(np.cumsum(delta_t)>0) + + + -@pytest.mark.parametrize('evenly_spaced', [True, False]) -def test_ar1fit_ml(evenly_spaced): +@pytest.mark.parametrize(('p', 'evenly_spaced'), [(1, True), (10, True), (1, False), (10, False)]) +def test_uar1_fit(p, evenly_spaced): ''' - Tests whether this method works well on an AR(1) process with known parameters + Tests whether this method works well on an AR(1) process with known parameters and evenly spaced time points ''' # define tolerance tol = .4 tau = 2 sigma_2 = 1 + n = 500 - # create p=50 time series - y_sim, t_sim = tsmodel.ar1_sim_geneva(n=200, tau_0=tau, sigma_2_0=sigma_2, - evenly_spaced=evenly_spaced, p = 10) - + if evenly_spaced: + t_arr = np.tile(range(1,n), (p, 1)).T + else: + t_arr = np.zeros((n, p)) # Initialize matrix to store time increments + for j in range(p): + # Generate random time increment + t_arr[:, j] = tsmodel.random_time_index(n=n, param=[1]) + # Create an empty matrix to store estimated parameters - theta_hat_matrix = np.empty((y_sim.shape[1], 2)) - + theta_hat_matrix = np.empty((p, 2)) # estimate parameters for each time series - for j in range(y_sim.shape[1]): - theta_hat_matrix[j,:] = tsmodel.ar1_fit_ml(y_sim[:, j], t_sim[:, j]) - + for j in range(p): + ys = tsmodel.uar1_sim(t = t_arr[:, j], tau_0 = tau, sigma_2_0=sigma_2) + theta_hat_matrix[j,:] = tsmodel.uar1_fit(ys, t_arr[:, j]) # compute mean of estimated param for each simulate ts theta_hat_bar = np.mean(theta_hat_matrix, axis=0) - + # test that - assert np.abs(theta_hat_bar[0]-tau) < tol assert np.abs(theta_hat_bar[1]-sigma_2) < tol - - - \ No newline at end of file diff --git a/pyleoclim/utils/correlation.py b/pyleoclim/utils/correlation.py index 1217d0af..cdf045f4 100644 --- a/pyleoclim/utils/correlation.py +++ b/pyleoclim/utils/correlation.py @@ -711,7 +711,7 @@ def association(y1, y2, statistic='pearsonr',settings=None): func = getattr(stats, statistic) res = func(y1,y2,**args) else: - raise ValueError(f'Wrong statistic: {statistic}; acceptble choices are {acceptable_methods}') + raise ValueError(f'Wrong statistic: {statistic}; acceptable choices are {acceptable_methods}') return res diff --git a/pyleoclim/utils/tsmodel.py b/pyleoclim/utils/tsmodel.py index e3f28a3f..d003483d 100644 --- a/pyleoclim/utils/tsmodel.py +++ b/pyleoclim/utils/tsmodel.py @@ -19,15 +19,17 @@ __all__ = [ 'ar1_fit', - 'ar1_fit_ml', 'ar1_sim', - 'ar1_sim_geneva', + 'uar1_fit', + 'uar1_sim', 'colored_noise', 'colored_noise_2regimes', 'gen_ar1_evenly', 'gen_ts', 'tau_estimation', - 'parametric_surrogates' + 'parametric_surrogates', + 'random_time_index', + 'inverse_cumsum' ] @@ -650,7 +652,7 @@ def n_ll_unevenly_spaced_ar1(theta, y, t): nll = np.log(2*np.pi) + np.log(sigma_2) + term_5 + term_4 return(nll) -def ar1_fit_ml(y, t): +def uar1_fit(y, t): ''' Maximum Likelihood Estimation of parameters tau_0 and sigma_2_0 @@ -667,6 +669,7 @@ def ar1_fit_ml(y, t): None. ''' + # obtain initial value for tau_0 tau_initial_value = tau_estimation(y= y, t = t) # obtain initial value for sifma_2_0 @@ -678,103 +681,64 @@ def ar1_fit_ml(y, t): return theta_hat -def ar1_sim_geneva(n, tau_0=5, sigma_2_0=2, seed=123, p=1, evenly_spaced = False, - delta_t_dist = "exponential", param = 1): +def uar1_sim(t, tau_0=5, sigma_2_0=2): - """ - Generate a time series of length n from an autoregressive process of order 1 with evenly/unevenly spaced time points. - - Parameters - ---------- - n : integer - The length of the time series - - tau_0 : float - Time decay parameter of the AR(1) model ($\phi = e^{-\tau}$) - - sigma_2_0 : float - Variance of the innovations - - seed : integer - Random seed for reproducible results. - - p : integer - Parameter specifying the number of time series to generate - - evenly_spaced : boolean - if True, delta_t (spacing between time points) is a vector of 1, - if False, delta_t is generated from various distribution (exponential, pareto, poisson and random choice). - - delta_t_dist : str - the distribution that generates the delta_t - possible choices include 'exponential', 'poisson', 'pareto', or 'random_choice' - - param : distributional parameter(s) - - Returns - ------- - A tuple of 2 arrays - y_sim : n x p NumPy array - matrix of simulated AR(1) vectors - t_sim : n x p NumPy array - matrix of corresponding time axes + """ + Generate a time series of length n from an autoregressive process of order 1 with evenly/unevenly spaced time points. + + Parameters + ---------- + t : array + Time axis - See also - -------- - - pyleoclim.utils.tsmodel.ar1_fit_ml : Maximumum likelihood estimate of AR(1) parameters + tau_0 : float + Time decay parameter of the AR(1) model ($\phi = e^{-\tau}$) + + sigma_2_0 : float + Variance of the innovations - pyleoclim.utils.tsmodel.time_increments : Generate time increment vector according to a specific probability model + Returns + ------- + ys : n + matrix of simulated AR(1) vector - """ + + See also + -------- - # declare two array to save the values and the time index - y_sim = np.empty(shape=(n, p)) - t_sim = np.empty(shape=(n, p)) - - # generate p time series - for j in np.arange(p): - if evenly_spaced: - delta_t = [1]*n # for now we assume delta_t = 1 if evenly sampled, potentially to improve with a parameter that specify the time spacing - else: - delta_t = time_increments(n, param, delta_t_dist = "exponential", seed = seed+j) - - # obtain the 0-based time index from the delta_t distribution - t = np.cumsum(delta_t)-1 + pyleoclim.utils.tsmodel.uar1_fit : Maximumum likelihood estimate of AR(1) parameters + pyleoclim.utils.tsmodel.random_time_index : Generate time increment vector according to a specific probability model - # create empty vector - y = np.empty(n) - - # generate unevenly spaced AR(1) - np.random.seed(seed+j) - z = np.random.normal(loc=0, scale=1, size=n) - y[0] = z[0] - for i in range(1,n): - delta_i = t[i] - t[i-1] - phi_i = np.exp(-delta_i / tau_0) - sigma_2_i = sigma_2_0 * (1-pow(phi_i, 2)) - sigma_i = np.sqrt(sigma_2_i) - y[i] = phi_i * y[i-1] + sigma_i * z[i] - t_sim[:, j] = t - y_sim[:, j] = y - return y_sim, t_sim - - -def time_increments(n, param, delta_t_dist = "exponential", seed = 12345): + """ + n = len(t) + ys = np.zeros(n) # create empty vector + # generate unevenly spaced AR(1) + z = np.random.normal(loc=0, scale=1, size=n) + ys[0] = z[0] + for i in range(n-1): + delta_i = t[i+1] - t[i] + phi_i = np.exp(-delta_i / tau_0) + sigma_2_i = sigma_2_0 * (1-pow(phi_i, 2)) + sigma_i = np.sqrt(sigma_2_i) + ys[i+1] = phi_i * ys[i] + sigma_i * z[i] + + return ys + +def inverse_cumsum(arr): + return np.diff(np.concatenate(([0], arr))) + +def random_time_index(n, delta_t_dist = "exponential", param = [1]): ''' - Generate a time increment vector according to a specific probability model + Generate a random time index vector according to a specific probability model Parameters ---------- n: integer The length of the time series - seed: integer - Random seed for reproducible results. - delta_t_dist: str - the probability distribution of delta_t - possible choices include 'exponential', 'poisson', 'pareto', or 'random_choice' + the probability distribution of the random time increments. + possible choices include 'exponential', 'poisson', 'pareto', or 'random_choice'. if 'exponential', `param` is expected to be a single scale parameter (traditionally denoted \lambda) if 'poisson', `param` is expected to be a single parameter (rate) @@ -785,24 +749,31 @@ def time_increments(n, param, delta_t_dist = "exponential", seed = 12345): prob_random_choice: probabilities associated with each entry value_random_choice (e.g. [.95,.05]) (These two arrays must be of the same size) + Returns: ------- - delta_t : 1D array of time increments, length n + t : 1D array of random time index obtained by taking the cumulative sum of the sampled random time increments, length n ''' # check for a valid distribution valid_distributions = ["exponential", "poisson", "pareto", "random_choice"] if delta_t_dist not in valid_distributions: - raise ValueError("delta_t_dist must be one of: 'exponential', 'poisson', 'pareto', or 'random_choice'.") + raise ValueError("delta_t_dist must be one of: 'exponential', 'poisson', 'pareto', 'random_choice'.") - np.random.seed(seed) + param = np.array(param) # coerce array type if delta_t_dist == "exponential": + # make sure that param is of len 1 + if len(param) != 1: + raise ValueError('The Exponential law takes a single scale parameter.') delta_t = np.random.exponential(scale = param, size=n) + elif delta_t_dist == "poisson": + if len(param) != 1: + raise ValueError('The Poisson law takes a single parameter.') delta_t = np.random.poisson(lam = param, size = n) + 1 elif delta_t_dist == "pareto": if len(param) != 2: @@ -813,8 +784,11 @@ def time_increments(n, param, delta_t_dist = "exponential", seed = 12345): if len(param)<2 or len(param[0]) != len(param[1]): raise ValueError("value_random_choice and prob_random_choice must have the same size.") delta_t = np.random.choice(param[0], size=n, p=param[1]) - - return delta_t + return np.cumsum(delta_t) + + + + # def fBMsim(N=128, H=0.25): # '''Simple method to generate fractional Brownian Motion