better bootstrapping #285

Merged: 34 commits from AS_fix_bootstrapping into master, Feb 4, 2020

Commits (34):
fd68215  bootstrap.stats parallelized properly (Dec 29, 2019)
9da77a2  bootstrap_compute properly parallelized (Dec 29, 2019)
21f4199  CL PR_template (Dec 29, 2019)
28bc13a  dask demo (Jan 1, 2020)
fd29223  dask notebook macbook timings (Jan 2, 2020)
9c74bea  dask nb timings mistral (Jan 2, 2020)
0690392  tried to fix my_quantile (Jan 2, 2020)
45db8e9  my_quantile q=list (Jan 4, 2020)
71859fe  comments (Jan 4, 2020)
b21bcfb  Merge branch 'master' into AS_fix_bootstrapping (aaronspring, Jan 5, 2020)
1daf22f  Merge branch 'master' into AS_fix_bootstrapping (bradyrx, Jan 12, 2020)
ad55f63  minor changes requested (Jan 14, 2020)
cd89817  no .load() in bootstrap (Jan 14, 2020)
840597f  Merge branch 'master' into AS_fix_bootstrapping (aaronspring, Jan 14, 2020)
b006b25  contrib asv added (Jan 14, 2020)
62521ec  new hindcast benchmarking (Jan 14, 2020)
e0cb8b6  rm commented out _load_mem (Jan 14, 2020)
96fb4eb  persist smp_hind speedup? (Jan 20, 2020)
95150b9  not persist, fixed benchmarks (Jan 24, 2020)
c11dd69  Merge branch 'master' into AS_fix_bootstrapping (aaronspring, Jan 24, 2020)
c25cca2  bf ds.get_axis_num (Jan 24, 2020)
d96172d  Merge branch 'master' into AS_fix_bootstrapping (aaronspring, Jan 30, 2020)
22fee69  PR template and contrib asv (Jan 30, 2020)
ec14d50  req changes (Jan 30, 2020)
0f87ff0  dask examples docs (Jan 30, 2020)
fcd493f  Update test_stats.py (aaronspring, Jan 30, 2020)
6bcc96a  formatting requested (Feb 3, 2020)
c790afa  alphabetical order (Feb 3, 2020)
2f2f5a1  require pandas 0.25.3 (Feb 3, 2020)
0fbb8b1  lint (Feb 3, 2020)
f295aaf  resolved comments (Feb 3, 2020)
5a27833  benchmarks prob dim=member (Feb 3, 2020)
ba85852  get_chunksize takes xr.ds (Feb 4, 2020)
abc8e3a  warning caption MB added (Feb 4, 2020)

Changes from 1 commit: 6bcc96ab66ac6faf25905dc81eb242bd83229d91, "formatting requested" (AS committed Feb 3, 2020)

8 changes: 8 additions & 0 deletions asv_bench/benchmarks/__init__.py
@@ -1,6 +1,7 @@
# https://github.com/pydata/xarray/blob/master/asv_bench/benchmarks/__init__.py
import itertools

import dask
import numpy as np

_counter = itertools.count()
@@ -47,3 +48,10 @@ def randint(low, high=None, size=None, frac_minus=None, seed=0):
    x.flat[inds] = -1

    return x


def ensure_loaded(res):
    """Compute no lazy results."""
    if dask.is_dask_collection(res):
        res = res.compute()
    return res
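
The `ensure_loaded` helper added above forces evaluation of lazy dask results so that the asv timing and peak-memory measurements cover the actual computation rather than just graph construction. A minimal sketch of its behaviour, assuming `asv_bench.benchmarks` is importable and using an illustrative dask-backed array that is not part of this diff:

import dask.array as da
import xarray as xr

from asv_bench.benchmarks import ensure_loaded  # helper defined in the __init__.py change above

# a small dask-backed DataArray stands in for a lazy benchmark result
lazy = xr.DataArray(da.random.random((100, 100), chunks=(50, 50)), dims=("x", "y"))
loaded = ensure_loaded(lazy)  # dask collection, so .compute() is triggered
eager = ensure_loaded(xr.DataArray([1.0, 2.0, 3.0]))  # non-dask input is returned unchanged
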
126 changes: 68 additions & 58 deletions asv_bench/benchmarks/benchmarks_hindcast.py
@@ -1,43 +1,33 @@
# Write the benchmarking functions here.
# See "Writing benchmarks" in the asv docs for more information.


import dask
import numpy as np
import xarray as xr

from climpred.bootstrap import bootstrap_hindcast
# from climpred.constants import HINDCAST_COMPARISONS
from climpred.prediction import compute_hindcast

from . import parameterized, randn, requires_dask
from . import ensure_loaded, parameterized, randn, requires_dask

HINDCAST_COMPARISONS = ['m2r'] # e2r and probabilistic dont match
# only take subselection of all possible metrics
METRICS = ['rmse', 'pearson_r', 'crpss']
# only take comparisons compatible with probabilistic metrics
HINDCAST_COMPARISONS = ['m2o']

bootstrap = 8


def _ensure_loaded(res):
"""Compute no lazy results."""
if dask.is_dask_collection(res):
res = res.compute()
return res
BOOTSTRAP = 8


class Generate:
"""
Generate random hind, control to be benckmarked.
Generate input data for benchmark.
"""

timeout = 600
repeat = (2, 5, 20)

def make_hind_ref(self):
"""hind and ref mimik hindcast experiment"""
def make_hind_obs(self):
"""Generates initialized hindcast, uninitialized historical and observational
data, mimicking a hindcast experiment."""
self.hind = xr.Dataset()
self.reference = xr.Dataset()
self.hist = xr.Dataset()
self.observations = xr.Dataset()
self.uninit = xr.Dataset()

self.nmember = 3
self.nlead = 3
@@ -47,26 +37,26 @@ def make_hind_ref(self):
self.init_end = 2000
self.ninit = self.init_end - self.init_start

frac_nan = 0.0
FRAC_NAN = 0.0

inits = np.arange(self.init_start, self.init_end)
leads = np.arange(1, 1 + self.nlead)
members = np.arange(1, 1 + self.nmember)

lons = xr.DataArray(
np.linspace(0, 360, self.nx),
np.linspace(0.5, 359.5, self.nx),
dims=('lon',),
attrs={'units': 'degrees east', 'long_name': 'longitude'},
)
lats = xr.DataArray(
np.linspace(-90, 90, self.ny),
np.linspace(-89.5, 89.5, self.ny),
dims=('lat',),
attrs={'units': 'degrees north', 'long_name': 'latitude'},
)
self.hind['tos'] = xr.DataArray(
self.hind['var'] = xr.DataArray(
randn(
(self.nmember, self.ninit, self.nlead, self.nx, self.ny),
frac_nan=frac_nan,
frac_nan=FRAC_NAN,
),
coords={
'member': members,
@@ -76,91 +66,111 @@ def make_hind_ref(self):
'lead': leads,
},
dims=('member', 'init', 'lead', 'lon', 'lat'),
name='tos',
name='var',
encoding=None,
attrs={'units': 'foo units', 'description': 'a description'},
attrs={'units': 'var units', 'description': 'a description'},
)
self.reference['tos'] = xr.DataArray(
randn((self.ninit, self.nx, self.ny), frac_nan=frac_nan),
self.observations['var'] = xr.DataArray(
randn((self.ninit, self.nx, self.ny), frac_nan=FRAC_NAN),
coords={'lon': lons, 'lat': lats, 'time': inits},
dims=('time', 'lon', 'lat'),
name='tos',
name='var',
encoding=None,
attrs={'units': 'foo units', 'description': 'a description'},
attrs={'units': 'var units', 'description': 'a description'},
)

self.hist['tos'] = xr.DataArray(
randn((self.ninit, self.nx, self.ny, self.nmember), frac_nan=frac_nan),
coords={'lon': lons, 'lat': lats, 'time': inits, 'member': members},
self.uninit['var'] = xr.DataArray(
randn(
(self.ninit, self.nx, self.ny, self.nmember), frac_nan=FRAC_NAN
),
coords={
'lon': lons,
'lat': lats,
'time': inits,
'member': members,
},
dims=('time', 'lon', 'lat', 'member'),
name='tos',
name='var',
encoding=None,
attrs={'units': 'foo units', 'description': 'a description'},
attrs={'units': 'var units', 'description': 'a description'},
)

self.hind.attrs = {'history': 'created for xarray benchmarking'}


class Compute(Generate):
"""
A few examples that benchmark climpred compute_hindcast.
Benchmark time and peak memory of `compute_hindcast` and `bootstrap_hindcast`.
"""

def setup(self, *args, **kwargs):
self.make_hind_ref()
self.make_hind_obs()

@parameterized(['metric', 'comparison'], (METRICS, HINDCAST_COMPARISONS))
def time_compute_hindcast(self, metric, comparison):
"""Take time for compute_hindcast."""
_ensure_loaded(
"""Take time for `compute_hindcast`."""
ensure_loaded(
compute_hindcast(
self.hind, self.reference, metric=metric, comparison=comparison
self.hind,
self.observations,
metric=metric,
comparison=comparison,
)
)

@parameterized(['metric', 'comparison'], (METRICS, HINDCAST_COMPARISONS))
def peakmem_compute_hindcast(self, metric, comparison):
"""Take memory peak for compute_hindcast for all comparisons."""
_ensure_loaded(
"""Take memory peak for `compute_hindcast`."""
ensure_loaded(
compute_hindcast(
self.hind, self.reference, metric=metric, comparison=comparison
self.hind,
self.observations,
metric=metric,
comparison=comparison,
)
)

@parameterized(['metric', 'comparison'], (METRICS, HINDCAST_COMPARISONS))
def time_bootstrap_hindcast(self, metric, comparison):
"""Take time for bootstrap_hindcast for one metric."""
_ensure_loaded(
"""Take time for `bootstrap_hindcast`."""
ensure_loaded(
bootstrap_hindcast(
self.hind,
self.reference,
self.hist,
self.uninit,
self.observations,
metric=metric,
comparison=comparison,
bootstrap=bootstrap,
bootstrap=BOOTSTRAP,
)
)

@parameterized(['metric', 'comparison'], (METRICS, HINDCAST_COMPARISONS))
def peakmem_bootstrap_hindcast(self, metric, comparison):
"""Take memory peak for bootstrap_hindcast."""
_ensure_loaded(
"""Take memory peak for `bootstrap_hindcast`."""
ensure_loaded(
bootstrap_hindcast(
self.hind,
self.reference,
self.hist,
self.uninit,
self.observations,
metric=metric,
comparison=comparison,
bootstrap=bootstrap,
bootstrap=BOOTSTRAP,
)
)


class ComputeDask(Compute):
def setup(self, *args, **kwargs):
"""Benchmark time and peak memory of `compute_hindcast` and
`bootstrap_hindcast`. This executes the same tests as `Compute` but on chunked
data."""
requires_dask()
# magic taken from
# https://github.com/pydata/xarray/blob/stable/asv_bench/benchmarks/rolling.py
super().setup(**kwargs)
# chunk along a spatial dimension to enable embarrassingly parallel computation
self.hind = self.hind['tos'].chunk({'lon': self.nx // bootstrap})
self.reference = self.reference['tos'].chunk({'lon': self.nx // bootstrap})
self.hist = self.hist['tos'].chunk({'lon': self.nx // bootstrap})
self.hind = self.hind['var'].chunk({'lon': self.nx // BOOTSTRAP})
self.observations = self.observations['var'].chunk(
{'lon': self.nx // BOOTSTRAP}
)
self.uninit = self.uninit['var'].chunk({'lon': self.nx // BOOTSTRAP})
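
The `ComputeDask` class reruns the same benchmarks on chunked inputs; splitting along the spatial `lon` dimension produces independent blocks that dask can evaluate in parallel. A standalone sketch of this chunking pattern (the array sizes and the toy computation are illustrative, not taken from the benchmarks; dask must be installed):

import numpy as np
import xarray as xr

nx, ny, nblocks = 64, 64, 8
data = xr.DataArray(
    np.random.rand(12, nx, ny),
    dims=("time", "lon", "lat"),
    name="var",
)
# chunking only 'lon' yields nblocks independent spatial pieces,
# which dask evaluates in an embarrassingly parallel fashion
chunked = data.chunk({"lon": nx // nblocks})
print(chunked.chunks)  # ((12,), (8, 8, 8, 8, 8, 8, 8, 8), (64,))
result = (chunked ** 2).mean("time").compute()  # each lon block is computed independently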