From dfdeef79d82e81357a276120a7d34738db460e48 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Thu, 3 Oct 2019 21:42:50 -0700 Subject: [PATCH 1/6] Explicitly keep track of indexes with merging (#3234) * Explicitly keep track of indexes in merge.py * Typing fixes * More tying fixes * more typing fixes * fixup --- xarray/core/alignment.py | 26 ++- xarray/core/computation.py | 39 ++-- xarray/core/coordinates.py | 88 +++++--- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 55 +++-- xarray/core/merge.py | 422 +++++++++++++++++++++---------------- 6 files changed, 352 insertions(+), 280 deletions(-) diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 4529fa509d9..3bc60db0a0b 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -268,7 +268,7 @@ def align( all_indexes[dim].append(index) if join == "override": - objects = _override_indexes(list(objects), all_indexes, exclude) + objects = _override_indexes(objects, all_indexes, exclude) # We don't reindex over dimensions with all equal indexes for two reasons: # - It's faster for the usual case (already aligned objects). @@ -365,26 +365,27 @@ def is_alignable(obj): targets = [] no_key = object() not_replaced = object() - for n, variables in enumerate(objects): + for position, variables in enumerate(objects): if is_alignable(variables): - positions.append(n) + positions.append(position) keys.append(no_key) targets.append(variables) out.append(not_replaced) elif is_dict_like(variables): + current_out = OrderedDict() for k, v in variables.items(): - if is_alignable(v) and k not in indexes: - # Skip variables in indexes for alignment, because these - # should to be overwritten instead: - # https://github.com/pydata/xarray/issues/725 - positions.append(n) + if is_alignable(v): + positions.append(position) keys.append(k) targets.append(v) - out.append(OrderedDict(variables)) + current_out[k] = not_replaced + else: + current_out[k] = v + out.append(current_out) elif raise_on_invalid: raise ValueError( "object to align is neither an xarray.Dataset, " - "an xarray.DataArray nor a dictionary: %r" % variables + "an xarray.DataArray nor a dictionary: {!r}".format(variables) ) else: out.append(variables) @@ -405,7 +406,10 @@ def is_alignable(obj): out[position][key] = aligned_obj # something went wrong: we should have replaced all sentinel values - assert all(arg is not not_replaced for arg in out) + for arg in out: + assert arg is not not_replaced + if is_dict_like(arg): + assert all(value is not not_replaced for value in arg.values()) return out diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 0d08234c474..4b9428847f4 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -24,12 +24,13 @@ from . import duck_array_ops, utils from .alignment import deep_align -from .merge import expand_and_merge_variables +from .merge import merge_coordinates_without_align from .pycompat import dask_array_type from .utils import is_dict_like from .variable import Variable if TYPE_CHECKING: + from .coordinates import Coordinates # noqa from .dataset import Dataset _DEFAULT_FROZEN_SET = frozenset() # type: frozenset @@ -152,17 +153,16 @@ def result_name(objects: list) -> Any: return name -def _get_coord_variables(args): - input_coords = [] +def _get_coords_list(args) -> List["Coordinates"]: + coords_list = [] for arg in args: try: coords = arg.coords except AttributeError: pass # skip this argument else: - coord_vars = getattr(coords, "variables", coords) - input_coords.append(coord_vars) - return input_coords + coords_list.append(coords) + return coords_list def build_output_coords( @@ -185,32 +185,29 @@ def build_output_coords( ------- OrderedDict of Variable objects with merged coordinates. """ - input_coords = _get_coord_variables(args) + coords_list = _get_coords_list(args) - if exclude_dims: - input_coords = [ - OrderedDict( - (k, v) for k, v in coord_vars.items() if exclude_dims.isdisjoint(v.dims) - ) - for coord_vars in input_coords - ] - - if len(input_coords) == 1: + if len(coords_list) == 1 and not exclude_dims: # we can skip the expensive merge - unpacked_input_coords, = input_coords - merged = OrderedDict(unpacked_input_coords) + unpacked_coords, = coords_list + merged_vars = OrderedDict(unpacked_coords.variables) else: - merged = expand_and_merge_variables(input_coords) + # TODO: save these merged indexes, instead of re-computing them later + merged_vars, unused_indexes = merge_coordinates_without_align( + coords_list, exclude_dims=exclude_dims + ) output_coords = [] for output_dims in signature.output_core_dims: dropped_dims = signature.all_input_core_dims - set(output_dims) if dropped_dims: filtered = OrderedDict( - (k, v) for k, v in merged.items() if dropped_dims.isdisjoint(v.dims) + (k, v) + for k, v in merged_vars.items() + if dropped_dims.isdisjoint(v.dims) ) else: - filtered = merged + filtered = merged_vars output_coords.append(filtered) return output_coords diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index ddea5739fff..430e507396b 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -6,8 +6,8 @@ Hashable, Iterator, Mapping, - Sequence, Set, + Sequence, Tuple, Union, cast, @@ -17,11 +17,7 @@ from . import formatting, indexing from .indexes import Indexes -from .merge import ( - expand_and_merge_variables, - merge_coords, - merge_coords_for_inplace_math, -) +from .merge import merge_coords, merge_coordinates_without_align from .utils import Frozen, ReprObject, either_dict_or_kwargs from .variable import Variable @@ -34,7 +30,7 @@ _THIS_ARRAY = ReprObject("") -class AbstractCoordinates(Mapping[Hashable, "DataArray"]): +class Coordinates(Mapping[Hashable, "DataArray"]): __slots__ = () def __getitem__(self, key: Hashable) -> "DataArray": @@ -57,10 +53,10 @@ def indexes(self) -> Indexes: @property def variables(self): - raise NotImplementedError() + raise NotImplementedError - def _update_coords(self, coords): - raise NotImplementedError() + def _update_coords(self, coords, indexes): + raise NotImplementedError def __iter__(self) -> Iterator["Hashable"]: # needs to be in the same order as the dataset variables @@ -116,19 +112,19 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index: def update(self, other: Mapping[Hashable, Any]) -> None: other_vars = getattr(other, "variables", other) - coords = merge_coords( + coords, indexes = merge_coords( [self.variables, other_vars], priority_arg=1, indexes=self.indexes ) - self._update_coords(coords) + self._update_coords(coords, indexes) def _merge_raw(self, other): """For use with binary arithmetic.""" if other is None: variables = OrderedDict(self.variables) + indexes = OrderedDict(self.indexes) else: - # don't align because we already called xarray.align - variables = expand_and_merge_variables([self.variables, other.variables]) - return variables + variables, indexes = merge_coordinates_without_align([self, other]) + return variables, indexes @contextmanager def _merge_inplace(self, other): @@ -136,18 +132,18 @@ def _merge_inplace(self, other): if other is None: yield else: - # don't include indexes in priority_vars, because we didn't align - # first - priority_vars = OrderedDict( - kv for kv in self.variables.items() if kv[0] not in self.dims - ) - variables = merge_coords_for_inplace_math( - [self.variables, other.variables], priority_vars=priority_vars + # don't include indexes in prioritized, because we didn't align + # first and we want indexes to be checked + prioritized = { + k: (v, None) for k, v in self.variables.items() if k not in self.indexes + } + variables, indexes = merge_coordinates_without_align( + [self, other], prioritized ) yield - self._update_coords(variables) + self._update_coords(variables, indexes) - def merge(self, other: "AbstractCoordinates") -> "Dataset": + def merge(self, other: "Coordinates") -> "Dataset": """Merge two sets of coordinates to create a new Dataset The method implements the logic used for joining coordinates in the @@ -173,13 +169,19 @@ def merge(self, other: "AbstractCoordinates") -> "Dataset": if other is None: return self.to_dataset() - else: - other_vars = getattr(other, "variables", other) - coords = expand_and_merge_variables([self.variables, other_vars]) - return Dataset._from_vars_and_coord_names(coords, set(coords)) + + if not isinstance(other, Coordinates): + other = Dataset(coords=other).coords + + coords, indexes = merge_coordinates_without_align([self, other]) + coord_names = set(coords) + merged = Dataset._construct_direct( + variables=coords, coord_names=coord_names, indexes=indexes + ) + return merged -class DatasetCoordinates(AbstractCoordinates): +class DatasetCoordinates(Coordinates): """Dictionary like container for Dataset coordinates. Essentially an immutable OrderedDict with keys given by the array's @@ -218,7 +220,11 @@ def to_dataset(self) -> "Dataset": """ return self._data._copy_listed(self._names) - def _update_coords(self, coords: Mapping[Hashable, Any]) -> None: + def _update_coords( + self, + coords: "OrderedDict[Hashable, Variable]", + indexes: Mapping[Hashable, pd.Index], + ) -> None: from .dataset import calculate_dimensions variables = self._data._variables.copy() @@ -234,7 +240,12 @@ def _update_coords(self, coords: Mapping[Hashable, Any]) -> None: self._data._variables = variables self._data._coord_names.update(new_coord_names) self._data._dims = dims - self._data._indexes = None + + # TODO(shoyer): once ._indexes is always populated by a dict, modify + # it to update inplace instead. + original_indexes = OrderedDict(self._data.indexes) + original_indexes.update(indexes) + self._data._indexes = original_indexes def __delitem__(self, key: Hashable) -> None: if key in self: @@ -251,7 +262,7 @@ def _ipython_key_completions_(self): ] -class DataArrayCoordinates(AbstractCoordinates): +class DataArrayCoordinates(Coordinates): """Dictionary like container for DataArray coordinates. Essentially an OrderedDict with keys given by the array's @@ -274,7 +285,11 @@ def _names(self) -> Set[Hashable]: def __getitem__(self, key: Hashable) -> "DataArray": return self._data._getitem_coord(key) - def _update_coords(self, coords) -> None: + def _update_coords( + self, + coords: "OrderedDict[Hashable, Variable]", + indexes: Mapping[Hashable, pd.Index], + ) -> None: from .dataset import calculate_dimensions coords_plus_data = coords.copy() @@ -285,7 +300,12 @@ def _update_coords(self, coords) -> None: "cannot add coordinates with new dimensions to " "a DataArray" ) self._data._coords = coords - self._data._indexes = None + + # TODO(shoyer): once ._indexes is always populated by a dict, modify + # it to update inplace instead. + original_indexes = OrderedDict(self._data.indexes) + original_indexes.update(indexes) + self._data._indexes = original_indexes @property def variables(self): diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 68bfe301bfc..7ad6f3cbae8 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2519,7 +2519,7 @@ def func(self, other): if not reflexive else f(other_variable, self.variable) ) - coords = self.coords._merge_raw(other_coords) + coords, indexes = self.coords._merge_raw(other_coords) name = self._result_name(other) return self._replace(variable, coords, name) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 9a1339cf528..03276d61cf0 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -65,7 +65,7 @@ dataset_merge_method, dataset_update_method, merge_data_and_coords, - merge_variables, + merge_coordinates_without_align, ) from .options import OPTIONS, _get_keep_attrs from .pycompat import dask_array_type @@ -85,7 +85,7 @@ if TYPE_CHECKING: from ..backends import AbstractDataStore, ZarrStore from .dataarray import DataArray - from .merge import DatasetLike + from .merge import CoercibleMapping try: from dask.delayed import Delayed @@ -508,10 +508,9 @@ def __init__( data_vars = {} if coords is None: coords = {} - self._set_init_vars_and_dims(data_vars, coords, compat) # TODO(shoyer): expose indexes as a public argument in __init__ - self._indexes = None # type: Optional[OrderedDict[Any, pd.Index]] + self._set_init_vars_and_dims(data_vars, coords, compat) if attrs is not None: self._attrs = OrderedDict(attrs) @@ -531,13 +530,14 @@ def _set_init_vars_and_dims(self, data_vars, coords, compat): if isinstance(coords, Dataset): coords = coords.variables - variables, coord_names, dims = merge_data_and_coords( + variables, coord_names, dims, indexes = merge_data_and_coords( data_vars, coords, compat=compat ) self._variables = variables self._coord_names = coord_names self._dims = dims + self._indexes = indexes @classmethod def load_store(cls, store, decoder=None) -> "Dataset": @@ -838,7 +838,7 @@ def _construct_direct( cls, variables, coord_names, - dims, + dims=None, attrs=None, indexes=None, encoding=None, @@ -847,6 +847,8 @@ def _construct_direct( """Shortcut around __init__ for internal use when we want to skip costly validation """ + if dims is None: + dims = calculate_dimensions(variables) obj = object.__new__(cls) obj._variables = variables obj._coord_names = coord_names @@ -862,8 +864,7 @@ def _construct_direct( @classmethod def _from_vars_and_coord_names(cls, variables, coord_names, attrs=None): - dims = calculate_dimensions(variables) - return cls._construct_direct(variables, coord_names, dims, attrs) + return cls._construct_direct(variables, coord_names, attrs=attrs) # TODO(shoyer): renable type checking on this signature when pytype has a # good way to handle defaulting arguments to a sentinel value: @@ -1268,6 +1269,8 @@ def __delitem__(self, key: Hashable) -> None: """ del self._variables[key] self._coord_names.discard(key) + if key in self.indexes: + del self._indexes[key] self._dims = calculate_dimensions(self._variables) # mutable objects should not be hashable @@ -1807,20 +1810,16 @@ def _validate_indexers( return indexers_list def _get_indexers_coords_and_indexes(self, indexers): - """ Extract coordinates from indexers. - Returns an OrderedDict mapping from coordinate name to the - coordinate variable. + """Extract coordinates and indexes from indexers. Only coordinate with a name different from any of self.variables will be attached. """ from .dataarray import DataArray - coord_list = [] - indexes = OrderedDict() + coords_list = [] for k, v in indexers.items(): if isinstance(v, DataArray): - v_coords = v.coords if v.dtype.kind == "b": if v.ndim != 1: # we only support 1-d boolean array raise ValueError( @@ -1831,14 +1830,14 @@ def _get_indexers_coords_and_indexes(self, indexers): # Make sure in case of boolean DataArray, its # coordinate also should be indexed. v_coords = v[v.values.nonzero()[0]].coords - - coord_list.append({d: v_coords[d].variable for d in v.coords}) - indexes.update(v.indexes) + else: + v_coords = v.coords + coords_list.append(v_coords) # we don't need to call align() explicitly or check indexes for # alignment, because merge_variables already checks for exact alignment # between dimension coordinates - coords = merge_variables(coord_list) + coords, indexes = merge_coordinates_without_align(coords_list) assert_coordinate_consistent(self, coords) # silently drop the conflicted variables. @@ -2644,12 +2643,14 @@ def _rename_vars(self, name_dict, dims_dict): def _rename_dims(self, name_dict): return {name_dict.get(k, k): v for k, v in self.dims.items()} - def _rename_indexes(self, name_dict): + def _rename_indexes(self, name_dict, dims_set): if self._indexes is None: return None indexes = OrderedDict() for k, v in self.indexes.items(): new_name = name_dict.get(k, k) + if new_name not in dims_set: + continue if isinstance(v, pd.MultiIndex): new_names = [name_dict.get(k, k) for k in v.names] index = pd.MultiIndex( @@ -2667,7 +2668,7 @@ def _rename_indexes(self, name_dict): def _rename_all(self, name_dict, dims_dict): variables, coord_names = self._rename_vars(name_dict, dims_dict) dims = self._rename_dims(dims_dict) - indexes = self._rename_indexes(name_dict) + indexes = self._rename_indexes(name_dict, dims.keys()) return variables, coord_names, dims, indexes def rename( @@ -3448,7 +3449,7 @@ def unstack(self, dim: Union[Hashable, Iterable[Hashable]] = None) -> "Dataset": result = result._unstack_once(dim) return result - def update(self, other: "DatasetLike", inplace: bool = None) -> "Dataset": + def update(self, other: "CoercibleMapping", inplace: bool = None) -> "Dataset": """Update this dataset's variables with those from another dataset. Parameters @@ -3475,13 +3476,12 @@ def update(self, other: "DatasetLike", inplace: bool = None) -> "Dataset": dataset. """ _check_inplace(inplace) - variables, coord_names, dims = dataset_update_method(self, other) - - return self._replace_vars_and_dims(variables, coord_names, dims, inplace=True) + merge_result = dataset_update_method(self, other) + return self._replace(inplace=True, **merge_result._asdict()) def merge( self, - other: "DatasetLike", + other: "CoercibleMapping", inplace: bool = None, overwrite_vars: Union[Hashable, Iterable[Hashable]] = frozenset(), compat: str = "no_conflicts", @@ -3536,7 +3536,7 @@ def merge( If any variables conflict (see ``compat``). """ _check_inplace(inplace) - variables, coord_names, dims = dataset_merge_method( + merge_result = dataset_merge_method( self, other, overwrite_vars=overwrite_vars, @@ -3544,8 +3544,7 @@ def merge( join=join, fill_value=fill_value, ) - - return self._replace_vars_and_dims(variables, coord_names, dims) + return self._replace(**merge_result._asdict()) def _assert_all_in_dataset( self, names: Iterable[Hashable], virtual_okay: bool = False diff --git a/xarray/core/merge.py b/xarray/core/merge.py index ceeb7db09f1..8159e8ebcf8 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -1,13 +1,15 @@ from collections import OrderedDict from typing import ( TYPE_CHECKING, + AbstractSet, Any, Dict, Hashable, Iterable, List, Mapping, - MutableMapping, + NamedTuple, + Optional, Sequence, Set, Tuple, @@ -18,21 +20,26 @@ from . import dtypes, pdcompat from .alignment import deep_align -from .utils import Frozen +from .utils import Frozen, dict_equiv from .variable import Variable, as_variable, assert_unique_multiindex_level_names if TYPE_CHECKING: + from .coordinates import Coordinates from .dataarray import DataArray from .dataset import Dataset - DatasetLikeValue = Union[ - DataArray, Variable, Tuple[Hashable, Any], Tuple[Sequence[Hashable], Any] + DimsLike = Union[Hashable, Sequence[Hashable]] + ArrayLike = Any + VariableLike = Union[ + ArrayLike, + Tuple[DimsLike, ArrayLike], + Tuple[DimsLike, ArrayLike, Mapping], + Tuple[DimsLike, ArrayLike, Mapping, Mapping], ] - DatasetLike = Union[Dataset, Mapping[Hashable, DatasetLikeValue]] - """Any object type that can be used on the rhs of Dataset.update, - Dataset.merge, etc. - """ - MutableDatasetLike = Union[Dataset, MutableMapping[Hashable, DatasetLikeValue]] + XarrayValue = Union[DataArray, Variable, VariableLike] + DatasetLike = Union[Dataset, Mapping[Hashable, XarrayValue]] + CoercibleValue = Union[XarrayValue, pd.Series, pd.DataFrame] + CoercibleMapping = Union[Dataset, Mapping[Hashable, CoercibleValue]] PANDAS_TYPES = (pd.Series, pd.DataFrame, pdcompat.Panel) @@ -71,8 +78,12 @@ class MergeError(ValueError): # TODO: move this to an xarray.exceptions module? -def unique_variable(name, variables, compat="broadcast_equals", equals=None): - # type: (Any, List[Variable], str, bool) -> Variable +def unique_variable( + name: Hashable, + variables: List[Variable], + compat: str = "broadcast_equals", + equals: bool = None, +) -> Variable: """Return the unique variable from a list of variables or raise MergeError. Parameters @@ -121,8 +132,8 @@ def unique_variable(name, variables, compat="broadcast_equals", equals=None): if not equals: raise MergeError( - "conflicting values for variable %r on objects to be combined. You can skip this check by specifying compat='override'." - % (name) + "conflicting values for variable {!r} on objects to be combined. " + "You can skip this check by specifying compat='override'.".format(name) ) if combine_method: @@ -137,138 +148,188 @@ def _assert_compat_valid(compat): raise ValueError("compat=%r invalid: must be %s" % (compat, set(_VALID_COMPAT))) -class OrderedDefaultDict(OrderedDict): - # minimal version of an ordered defaultdict - # beware: does not pickle or copy properly - def __init__(self, default_factory): - self.default_factory = default_factory - super().__init__() - - def __missing__(self, key): - self[key] = default = self.default_factory() - return default +MergeElement = Tuple[Variable, Optional[pd.Index]] -def merge_variables( - list_of_variables_dicts: List[Mapping[Any, Variable]], - priority_vars: Mapping[Any, Variable] = None, +def merge_collected( + grouped: "OrderedDict[Hashable, List[MergeElement]]", + prioritized: Mapping[Hashable, MergeElement] = None, compat: str = "minimal", -) -> "OrderedDict[Any, Variable]": +) -> Tuple["OrderedDict[Hashable, Variable]", "OrderedDict[Hashable, pd.Index]"]: """Merge dicts of variables, while resolving conflicts appropriately. Parameters ---------- - lists_of_variables_dicts : list of mappings with Variable values - List of mappings for which each value is a xarray.Variable object. - priority_vars : mapping with Variable or None values, optional - If provided, variables are always taken from this dict in preference to - the input variable dictionaries, without checking for conflicts. - compat : {'identical', 'equals', 'broadcast_equals', 'minimal', 'no_conflicts', 'override'}, optional + Type of equality check to use when checking for conflicts. Returns ------- - OrderedDict with keys taken by the union of keys on list_of_variable_dicts, + OrderedDict with keys taken by the union of keys on list_of_mappings, and Variable values corresponding to those that should be found on the merged result. """ - if priority_vars is None: - priority_vars = {} + if prioritized is None: + prioritized = {} _assert_compat_valid(compat) - dim_compat = min(compat, "equals", key=_VALID_COMPAT.get) - - lookup = OrderedDefaultDict(list) - for variables in list_of_variables_dicts: - for name, var in variables.items(): - lookup[name].append(var) - - # n.b. it's important to fill up merged in the original order in which - # variables appear - merged = OrderedDict() # type: OrderedDict[Any, Variable] - - for name, var_list in lookup.items(): - if name in priority_vars: - # one of these arguments (e.g., the first for in-place arithmetic - # or the second for Dataset.update) takes priority - merged[name] = priority_vars[name] + + merged_vars = OrderedDict() # type: OrderedDict[Any, Variable] + merged_indexes = OrderedDict() # type: OrderedDict[Any, pd.Index] + + for name, elements_list in grouped.items(): + if name in prioritized: + variable, index = prioritized[name] + merged_vars[name] = variable + if index is not None: + merged_indexes[name] = index else: - dim_variables = [var for var in var_list if (name,) == var.dims] - if dim_variables: - # if there are dimension coordinates, these must be equal (or - # identical), and they take priority over non-dimension - # coordinates - merged[name] = unique_variable(name, dim_variables, dim_compat) + indexed_elements = [ + (variable, index) + for variable, index in elements_list + if index is not None + ] + + if indexed_elements: + # TODO(shoyer): consider adjusting this logic. Are we really + # OK throwing away variable without an index in favor of + # indexed variables, without even checking if values match? + variable, index = indexed_elements[0] + for _, other_index in indexed_elements[1:]: + if not index.equals(other_index): + raise MergeError( + "conflicting values for index %r on objects to be " + "combined:\nfirst value: %r\nsecond value: %r" + % (name, index, other_index) + ) + if compat == "identical": + for other_variable, _ in indexed_elements[1:]: + if not dict_equiv(variable.attrs, other_variable.attrs): + raise MergeError( + "conflicting attribute values on combined " + "variable %r:\nfirst value: %r\nsecond value: %r" + % (name, variable.attrs, other_variable.attrs) + ) + merged_vars[name] = variable + merged_indexes[name] = index else: + variables = [variable for variable, _ in elements_list] try: - merged[name] = unique_variable(name, var_list, compat) + merged_vars[name] = unique_variable(name, variables, compat) except MergeError: if compat != "minimal": # we need more than "minimal" compatibility (for which # we drop conflicting coordinates) raise - return merged + return merged_vars, merged_indexes -def expand_variable_dicts( - list_of_variable_dicts: "List[Union[Dataset, OrderedDict]]", -) -> "List[Mapping[Any, Variable]]": - """Given a list of dicts with xarray object values, expand the values. +def collect_variables_and_indexes( + list_of_mappings: "List[DatasetLike]", +) -> "OrderedDict[Hashable, List[MergeElement]]": + """Collect variables and indexes from list of mappings of xarray objects. - Parameters - ---------- - list_of_variable_dicts : list of dict or Dataset objects - Each value for the mappings must be of the following types: - - an xarray.Variable - - a tuple `(dims, data[, attrs[, encoding]])` that can be converted in - an xarray.Variable - - or an xarray.DataArray - - Returns - ------- - A list of ordered dictionaries corresponding to inputs, or coordinates from - an input's values. The values of each ordered dictionary are all - xarray.Variable objects. + Mappings must either be Dataset objects, or have values of one of the + following types: + - an xarray.Variable + - a tuple `(dims, data[, attrs[, encoding]])` that can be converted in + an xarray.Variable + - or an xarray.DataArray """ from .dataarray import DataArray from .dataset import Dataset - var_dicts = [] + grouped = ( + OrderedDict() + ) # type: OrderedDict[Hashable, List[Tuple[Variable, pd.Index]]] - for variables in list_of_variable_dicts: - if isinstance(variables, Dataset): - var_dicts.append(variables.variables) - continue + def append(name, variable, index): + values = grouped.setdefault(name, []) + values.append((variable, index)) - # append coords to var_dicts before appending sanitized_vars, - # because we want coords to appear first - sanitized_vars = OrderedDict() # type: OrderedDict[Any, Variable] + def append_all(variables, indexes): + for name, variable in variables.items(): + append(name, variable, indexes.get(name)) - for name, var in variables.items(): - if isinstance(var, DataArray): - # use private API for speed - coords = var._coords.copy() + for mapping in list_of_mappings: + if isinstance(mapping, Dataset): + append_all(mapping.variables, mapping.indexes) + continue + + for name, variable in mapping.items(): + if isinstance(variable, DataArray): + coords = variable._coords.copy() # use private API for speed + indexes = OrderedDict(variable.indexes) # explicitly overwritten variables should take precedence coords.pop(name, None) - var_dicts.append(coords) - - var = as_variable(var, name=name) - sanitized_vars[name] = var + indexes.pop(name, None) + append_all(coords, indexes) - var_dicts.append(sanitized_vars) + variable = as_variable(variable, name=name) + if variable.dims == (name,): + variable = variable.to_index_variable() + index = variable.to_index() + else: + index = None + append(name, variable, index) + + return grouped + + +def collect_from_coordinates( + list_of_coords: "List[Coordinates]" +) -> "OrderedDict[Hashable, List[MergeElement]]": + """Collect variables and indexes to be merged from Coordinate objects.""" + grouped = ( + OrderedDict() + ) # type: OrderedDict[Hashable, List[Tuple[Variable, pd.Index]]] + + for coords in list_of_coords: + variables = coords.variables + indexes = coords.indexes + for name, variable in variables.items(): + value = grouped.setdefault(name, []) + value.append((variable, indexes.get(name))) + return grouped + + +def merge_coordinates_without_align( + objects: "List[Coordinates]", + prioritized: Mapping[Hashable, MergeElement] = None, + exclude_dims: AbstractSet = frozenset(), +) -> Tuple["OrderedDict[Hashable, Variable]", "OrderedDict[Hashable, pd.Index]"]: + """Merge variables/indexes from coordinates without automatic alignments. + + This function is used for merging coordinate from pre-existing xarray + objects. + """ + collected = collect_from_coordinates(objects) + + if exclude_dims: + filtered = OrderedDict() # type: OrderedDict[Hashable, List[MergeElement]] + for name, elements in collected.items(): + new_elements = [ + (variable, index) + for variable, index in elements + if exclude_dims.isdisjoint(variable.dims) + ] + if new_elements: + filtered[name] = new_elements + else: + filtered = collected - return var_dicts + return merge_collected(filtered, prioritized) def determine_coords( - list_of_variable_dicts: Iterable["DatasetLike"] + list_of_mappings: Iterable["DatasetLike"] ) -> Tuple[Set[Hashable], Set[Hashable]]: """Given a list of dicts with xarray object values, identify coordinates. Parameters ---------- - list_of_variable_dicts : list of dict or Dataset objects + list_of_mappings : list of dict or Dataset objects Of the same form as the arguments to expand_variable_dicts. Returns @@ -284,12 +345,12 @@ def determine_coords( coord_names = set() # type: set noncoord_names = set() # type: set - for variables in list_of_variable_dicts: - if isinstance(variables, Dataset): - coord_names.update(variables.coords) - noncoord_names.update(variables.data_vars) + for mapping in list_of_mappings: + if isinstance(mapping, Dataset): + coord_names.update(mapping.coords) + noncoord_names.update(mapping.data_vars) else: - for name, var in variables.items(): + for name, var in mapping.items(): if isinstance(var, DataArray): coords = set(var._coords) # use private API for speed # explicitly overwritten variables should take precedence @@ -299,7 +360,7 @@ def determine_coords( return coord_names, noncoord_names -def coerce_pandas_values(objects: Iterable["DatasetLike"]) -> List["DatasetLike"]: +def coerce_pandas_values(objects: Iterable["CoercibleMapping"]) -> List["DatasetLike"]: """Convert pandas values found in a list of labeled objects. Parameters @@ -332,18 +393,9 @@ def coerce_pandas_values(objects: Iterable["DatasetLike"]) -> List["DatasetLike" return out -def merge_coords_for_inplace_math(objs, priority_vars=None): - """Merge coordinate variables without worrying about alignment. - - This function is used for merging variables in coordinates.py. - """ - expanded = expand_variable_dicts(objs) - variables = merge_variables(expanded, priority_vars) - assert_unique_multiindex_level_names(variables) - return variables - - -def _get_priority_vars(objects, priority_arg, compat="equals"): +def _get_priority_vars_and_indexes( + objects: List["DatasetLike"], priority_arg: Optional[int], compat: str = "equals" +) -> "OrderedDict[Hashable, MergeElement]": """Extract the priority variable from a list of mappings. We need this method because in some cases the priority argument itself @@ -361,36 +413,27 @@ def _get_priority_vars(objects, priority_arg, compat="equals"): Returns ------- - None, if priority_arg is None, or an OrderedDict with Variable objects as - values indicating priority variables. + An OrderedDict of variables and associated indexes (if any) to prioritize. """ if priority_arg is None: - priority_vars = {} - else: - expanded = expand_variable_dicts([objects[priority_arg]]) - priority_vars = merge_variables(expanded, compat=compat) - return priority_vars - + return OrderedDict() -def expand_and_merge_variables(objs, priority_arg=None): - """Merge coordinate variables without worrying about alignment. - - This function is used for merging variables in computation.py. - """ - expanded = expand_variable_dicts(objs) - priority_vars = _get_priority_vars(objs, priority_arg) - variables = merge_variables(expanded, priority_vars) - return variables + collected = collect_variables_and_indexes([objects[priority_arg]]) + variables, indexes = merge_collected(collected, compat=compat) + grouped = OrderedDict() # type: OrderedDict[Hashable, MergeElement] + for name, variable in variables.items(): + grouped[name] = (variable, indexes.get(name)) + return grouped def merge_coords( - objs, - compat="minimal", - join="outer", - priority_arg=None, - indexes=None, - fill_value=dtypes.NA, -): + objects: Iterable["CoercibleMapping"], + compat: str = "minimal", + join: str = "outer", + priority_arg: Optional[int] = None, + indexes: Optional[Mapping[Hashable, pd.Index]] = None, + fill_value: object = dtypes.NA, +) -> Tuple["OrderedDict[Hashable, Variable]", "OrderedDict[Hashable, pd.Index]"]: """Merge coordinate variables. See merge_core below for argument descriptions. This works similarly to @@ -398,29 +441,28 @@ def merge_coords( coordinates or not. """ _assert_compat_valid(compat) - coerced = coerce_pandas_values(objs) + coerced = coerce_pandas_values(objects) aligned = deep_align( coerced, join=join, copy=False, indexes=indexes, fill_value=fill_value ) - expanded = expand_variable_dicts(aligned) - priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat) - variables = merge_variables(expanded, priority_vars, compat=compat) + collected = collect_variables_and_indexes(aligned) + prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat) + variables, out_indexes = merge_collected(collected, prioritized, compat=compat) assert_unique_multiindex_level_names(variables) - - return variables + return variables, out_indexes def merge_data_and_coords(data, coords, compat="broadcast_equals", join="outer"): """Used in Dataset.__init__.""" - objs = [data, coords] + objects = [data, coords] explicit_coords = coords.keys() - indexes = dict(extract_indexes(coords)) + indexes = dict(_extract_indexes_from_coords(coords)) return merge_core( - objs, compat, join, explicit_coords=explicit_coords, indexes=indexes + objects, compat, join, explicit_coords=explicit_coords, indexes=indexes ) -def extract_indexes(coords): +def _extract_indexes_from_coords(coords): """Yields the name & index of valid indexes from a mapping of coords""" for name, variable in coords.items(): variable = as_variable(variable, name=name) @@ -443,31 +485,42 @@ def assert_valid_explicit_coords(variables, dims, explicit_coords): ) +_MergeResult = NamedTuple( + "_MergeResult", + [ + ("variables", "OrderedDict[Hashable, Variable]"), + ("coord_names", Set[Hashable]), + ("dims", Dict[Hashable, int]), + ("indexes", "OrderedDict[Hashable, pd.Index]"), + ], +) + + def merge_core( - objs, - compat="broadcast_equals", - join="outer", - priority_arg=None, - explicit_coords=None, - indexes=None, - fill_value=dtypes.NA, -) -> Tuple["OrderedDict[Hashable, Variable]", Set[Hashable], Dict[Hashable, int]]: + objects: Iterable["CoercibleMapping"], + compat: str = "broadcast_equals", + join: str = "outer", + priority_arg: Optional[int] = None, + explicit_coords: Optional[Sequence] = None, + indexes: Optional[Mapping[Hashable, pd.Index]] = None, + fill_value: object = dtypes.NA, +) -> _MergeResult: """Core logic for merging labeled objects. This is not public API. Parameters ---------- - objs : list of mappings + objects : list of mappings All values must be convertable to labeled arrays. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional Compatibility checks to use when merging variables. join : {'outer', 'inner', 'left', 'right'}, optional How to combine objects with different indexes. priority_arg : integer, optional - Optional argument in `objs` that takes precedence over the others. + Optional argument in `objects` that takes precedence over the others. explicit_coords : set, optional - An explicit list of variables from `objs` that are coordinates. + An explicit list of variables from `objects` that are coordinates. indexes : dict, optional Dictionary with values given by pandas.Index objects. fill_value : scalar, optional @@ -490,28 +543,25 @@ def merge_core( _assert_compat_valid(compat) - coerced = coerce_pandas_values(objs) + coerced = coerce_pandas_values(objects) aligned = deep_align( coerced, join=join, copy=False, indexes=indexes, fill_value=fill_value ) - expanded = expand_variable_dicts(aligned) + collected = collect_variables_and_indexes(aligned) - coord_names, noncoord_names = determine_coords(coerced) - - priority_vars = _get_priority_vars(aligned, priority_arg, compat=compat) - variables = merge_variables(expanded, priority_vars, compat=compat) + prioritized = _get_priority_vars_and_indexes(aligned, priority_arg, compat=compat) + variables, out_indexes = merge_collected(collected, prioritized, compat=compat) assert_unique_multiindex_level_names(variables) dims = calculate_dimensions(variables) + coord_names, noncoord_names = determine_coords(coerced) if explicit_coords is not None: assert_valid_explicit_coords(variables, dims, explicit_coords) coord_names.update(explicit_coords) - for dim, size in dims.items(): if dim in variables: coord_names.add(dim) - ambiguous_coords = coord_names.intersection(noncoord_names) if ambiguous_coords: raise MergeError( @@ -519,10 +569,15 @@ def merge_core( "coordinates or not in the merged result: %s" % ambiguous_coords ) - return variables, coord_names, dims + return _MergeResult(variables, coord_names, dims, out_indexes) -def merge(objects, compat="no_conflicts", join="outer", fill_value=dtypes.NA): +def merge( + objects: Iterable[Union["DataArray", "CoercibleMapping"]], + compat: str = "no_conflicts", + join: str = "outer", + fill_value: object = dtypes.NA, +) -> "Dataset": """Merge any number of xarray objects into a single Dataset as variables. Parameters @@ -724,7 +779,7 @@ def merge(objects, compat="no_conflicts", join="outer", fill_value=dtypes.NA): dict_like_objects = list() for obj in objects: - if not (isinstance(obj, (DataArray, Dataset, dict))): + if not isinstance(obj, (DataArray, Dataset, dict)): raise TypeError( "objects must be an iterable containing only " "Dataset(s), DataArray(s), and dictionaries." @@ -733,26 +788,21 @@ def merge(objects, compat="no_conflicts", join="outer", fill_value=dtypes.NA): obj = obj.to_dataset() if isinstance(obj, DataArray) else obj dict_like_objects.append(obj) - variables, coord_names, dims = merge_core( - dict_like_objects, compat, join, fill_value=fill_value - ) - # TODO: don't always recompute indexes - merged = Dataset._construct_direct(variables, coord_names, dims, indexes=None) - + merge_result = merge_core(dict_like_objects, compat, join, fill_value=fill_value) + merged = Dataset._construct_direct(**merge_result._asdict()) return merged def dataset_merge_method( dataset: "Dataset", - other: "DatasetLike", + other: "CoercibleMapping", overwrite_vars: Union[Hashable, Iterable[Hashable]], compat: str, join: str, fill_value: Any, -) -> Tuple["OrderedDict[Hashable, Variable]", Set[Hashable], Dict[Hashable, int]]: +) -> _MergeResult: """Guts of the Dataset.merge method. """ - # we are locked into supporting overwrite_vars for the Dataset.merge # method due for backwards compatibility # TODO: consider deprecating it? @@ -769,8 +819,10 @@ def dataset_merge_method( objs = [dataset, other] priority_arg = 1 else: - other_overwrite = OrderedDict() # type: MutableDatasetLike - other_no_overwrite = OrderedDict() # type: MutableDatasetLike + other_overwrite = OrderedDict() # type: OrderedDict[Hashable, CoercibleValue] + other_no_overwrite = ( + OrderedDict() + ) # type: OrderedDict[Hashable, CoercibleValue] for k, v in other.items(): if k in overwrite_vars: other_overwrite[k] = v @@ -785,8 +837,8 @@ def dataset_merge_method( def dataset_update_method( - dataset: "Dataset", other: "DatasetLike" -) -> Tuple["OrderedDict[Hashable, Variable]", Set[Hashable], Dict[Hashable, int]]: + dataset: "Dataset", other: "CoercibleMapping" +) -> _MergeResult: """Guts of the Dataset.update method. This drops a duplicated coordinates from `other` if `other` is not an From 283b4feba601e9838ed538106b2091d0b8264c77 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 4 Oct 2019 17:04:36 +0000 Subject: [PATCH 2/6] Docs/more fixes (#2934) * Move netcdf to beginning of io.rst * Better indexing example. * Start de-emphasizing pandas * misc. * compute, load, persist docstrings + text. * split-apply-combine. * np.newaxis. * misc. * some dask stuff. * Little more dask. * undo index.rst changes. * link to dask docs on chunks * Fix io.rst. * small changes. * rollingupdate. * joe's review --- doc/computation.rst | 4 +- doc/dask.rst | 65 +++++++++++---- doc/faq.rst | 46 +++++++---- doc/index.rst | 2 +- doc/io.rst | 178 +++++++++++++++++++++++------------------ doc/quick-overview.rst | 16 ++-- doc/why-xarray.rst | 17 ++-- xarray/core/dataset.py | 19 ++--- 8 files changed, 212 insertions(+), 135 deletions(-) diff --git a/doc/computation.rst b/doc/computation.rst index 3d10774bcac..ae5f4bc5c66 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -179,7 +179,9 @@ a value when aggregating: r = arr.rolling(y=3, center=True, min_periods=2) r.mean() -Note that rolling window aggregations are faster when bottleneck_ is installed. +.. tip:: + + Note that rolling window aggregations are faster and use less memory when bottleneck_ is installed. This only applies to numpy-backed xarray objects. .. _bottleneck: https://github.com/kwgoodman/bottleneck/ diff --git a/doc/dask.rst b/doc/dask.rst index 19cbc11292c..5bdbf779463 100644 --- a/doc/dask.rst +++ b/doc/dask.rst @@ -5,13 +5,14 @@ Parallel computing with Dask xarray integrates with `Dask `__ to support parallel computations and streaming computation on datasets that don't fit into memory. - Currently, Dask is an entirely optional feature for xarray. However, the benefits of using Dask are sufficiently strong that Dask may become a required dependency in a future version of xarray. For a full example of how to use xarray's Dask integration, read the -`blog post introducing xarray and Dask`_. +`blog post introducing xarray and Dask`_. More up-to-date examples +may be found at the `Pangeo project's use-cases `_ +and at the `Dask examples website `_. .. _blog post introducing xarray and Dask: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ @@ -37,13 +38,14 @@ which allows Dask to take full advantage of multiple processors available on most modern computers. For more details on Dask, read `its documentation `__. +Note that xarray only makes use of ``dask.array`` and ``dask.delayed``. .. _dask.io: Reading and writing data ------------------------ -The usual way to create a dataset filled with Dask arrays is to load the +The usual way to create a ``Dataset`` filled with Dask arrays is to load the data from a netCDF file or files. You can do this by supplying a ``chunks`` argument to :py:func:`~xarray.open_dataset` or using the :py:func:`~xarray.open_mfdataset` function. @@ -71,8 +73,8 @@ argument to :py:func:`~xarray.open_dataset` or using the In this example ``latitude`` and ``longitude`` do not appear in the ``chunks`` dict, so only one chunk will be used along those dimensions. It is also -entirely equivalent to opening a dataset using ``open_dataset`` and then -chunking the data using the ``chunk`` method, e.g., +entirely equivalent to opening a dataset using :py:meth:`~xarray.open_dataset` +and then chunking the data using the ``chunk`` method, e.g., ``xr.open_dataset('example-data.nc').chunk({'time': 10})``. To open multiple files simultaneously in parallel using Dask delayed, @@ -80,13 +82,14 @@ use :py:func:`~xarray.open_mfdataset`:: xr.open_mfdataset('my/files/*.nc', parallel=True) -This function will automatically concatenate and merge dataset into one in +This function will automatically concatenate and merge datasets into one in the simple cases that it understands (see :py:func:`~xarray.auto_combine` -for the full disclaimer). By default, :py:func:`~xarray.open_mfdataset` will chunk each +for the full disclaimer). By default, :py:meth:`~xarray.open_mfdataset` will chunk each netCDF file into a single Dask array; again, supply the ``chunks`` argument to control the size of the resulting Dask arrays. In more complex cases, you can -open each file individually using ``open_dataset`` and merge the result, as -described in :ref:`combining data`. +open each file individually using :py:meth:`~xarray.open_dataset` and merge the result, as +described in :ref:`combining data`. Passing the keyword argument ``parallel=True`` to :py:meth:`~xarray.open_mfdataset` will speed up the reading of large multi-file datasets by +executing those read tasks in parallel using ``dask.delayed``. You'll notice that printing a dataset still shows a preview of array values, even if they are actually Dask arrays. We can do this quickly with Dask because @@ -106,7 +109,7 @@ usual way. ds.to_netcdf('manipulated-example-data.nc') By setting the ``compute`` argument to ``False``, :py:meth:`~xarray.Dataset.to_netcdf` -will return a Dask delayed object that can be computed later. +will return a ``dask.delayed`` object that can be computed later. .. ipython:: python @@ -153,8 +156,14 @@ explicit conversion step. One notable exception is indexing operations: to enable label based indexing, xarray will automatically load coordinate labels into memory. +.. tip:: + + By default, dask uses its multi-threaded scheduler, which distributes work across + multiple cores and allows for processing some datasets that do not fit into memory. + For running across a cluster, `setup the distributed scheduler `_. + The easiest way to convert an xarray data structure from lazy Dask arrays into -eager, in-memory NumPy arrays is to use the :py:meth:`~xarray.Dataset.load` method: +*eager*, in-memory NumPy arrays is to use the :py:meth:`~xarray.Dataset.load` method: .. ipython:: python @@ -191,11 +200,20 @@ Dask arrays using the :py:meth:`~xarray.Dataset.persist` method: ds = ds.persist() -This is particularly useful when using a distributed cluster because the data -will be loaded into distributed memory across your machines and be much faster -to use than reading repeatedly from disk. Warning that on a single machine -this operation will try to load all of your data into memory. You should make -sure that your dataset is not larger than available memory. +:py:meth:`~xarray.Dataset.persist` is particularly useful when using a +distributed cluster because the data will be loaded into distributed memory +across your machines and be much faster to use than reading repeatedly from +disk. + +.. warning:: + + On a single machine :py:meth:`~xarray.Dataset.persist` will try to load all of + your data into memory. You should make sure that your dataset is not larger than + available memory. + +.. note:: + For more on the differences between :py:meth:`~xarray.Dataset.persist` and + :py:meth:`~xarray.Dataset.compute` see this `Stack Overflow answer `_ and the `Dask documentation `_. For performance you may wish to consider chunk sizes. The correct choice of chunk size depends both on your data and on the operations you want to perform. @@ -381,6 +399,11 @@ one million elements (e.g., a 1000x1000 matrix). With large arrays (10+ GB), the cost of queueing up Dask operations can be noticeable, and you may need even larger chunksizes. +.. tip:: + + Check out the dask documentation on `chunks `_. + + Optimization Tips ----------------- @@ -390,4 +413,12 @@ With analysis pipelines involving both spatial subsetting and temporal resamplin 2. Save intermediate results to disk as a netCDF files (using ``to_netcdf()``) and then load them again with ``open_dataset()`` for further computations. For example, if subtracting temporal mean from a dataset, save the temporal mean to disk before subtracting. Again, in theory, Dask should be able to do the computation in a streaming fashion, but in practice this is a fail case for the Dask scheduler, because it tries to keep every chunk of an array that it computes in memory. (See `Dask issue #874 `_) -3. Specify smaller chunks across space when using ``open_mfdataset()`` (e.g., ``chunks={'latitude': 10, 'longitude': 10}``). This makes spatial subsetting easier, because there's no risk you will load chunks of data referring to different chunks (probably not necessary if you follow suggestion 1). +3. Specify smaller chunks across space when using :py:meth:`~xarray.open_mfdataset` (e.g., ``chunks={'latitude': 10, 'longitude': 10}``). This makes spatial subsetting easier, because there's no risk you will load chunks of data referring to different chunks (probably not necessary if you follow suggestion 1). + +4. Using the h5netcdf package by passing ``engine='h5netcdf'`` to :py:meth:`~xarray.open_mfdataset` + can be quicker than the default ``engine='netcdf4'`` that uses the netCDF4 package. + +5. Some dask-specific tips may be found `here `_. + +6. The dask `diagnostics `_ can be + useful in identifying performance bottlenecks. diff --git a/doc/faq.rst b/doc/faq.rst index 22a4f6cf095..28a1f7395c3 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -11,6 +11,38 @@ Frequently Asked Questions import xarray as xr np.random.seed(123456) + +Your documentation keeps mentioning pandas. What is pandas? +----------------------------------------------------------- + +pandas_ is a very popular data analysis package in Python +with wide usage in many fields. Our API is heavily inspired by pandas — +this is why there are so many references to pandas. + +.. _pandas: https://pandas.pydata.org + + +Do I need to know pandas to use xarray? +--------------------------------------- + +No! Our API is heavily inspired by pandas so while knowing pandas will let you +become productive more quickly, knowledge of pandas is not necessary to use xarray. + + +Should I use xarray instead of pandas? +-------------------------------------- + +It's not an either/or choice! xarray provides robust support for converting +back and forth between the tabular data-structures of pandas and its own +multi-dimensional data-structures. + +That said, you should only bother with xarray if some aspect of data is +fundamentally multi-dimensional. If your data is unstructured or +one-dimensional, pandas is usually the right choice: it has better performance +for common operations such as ``groupby`` and you'll find far more usage +examples online. + + Why is pandas not enough? ------------------------- @@ -56,20 +88,6 @@ of the "time" dimension. You never need to reshape arrays (e.g., with ``np.newaxis``) to align them for arithmetic operations in xarray. -Should I use xarray instead of pandas? --------------------------------------- - -It's not an either/or choice! xarray provides robust support for converting -back and forth between the tabular data-structures of pandas and its own -multi-dimensional data-structures. - -That said, you should only bother with xarray if some aspect of data is -fundamentally multi-dimensional. If your data is unstructured or -one-dimensional, pandas is usually the right choice: it has better performance -for common operations such as ``groupby`` and you'll find far more usage -examples online. - - Why don't aggregations return Python scalars? --------------------------------------------- diff --git a/doc/index.rst b/doc/index.rst index e5bd03801ff..972eb0a732e 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -11,7 +11,7 @@ intuitive, more concise, and less error-prone developer experience. The package includes a large and growing library of domain-agnostic functions for advanced analytics and visualization with these data structures. -Xarray was inspired by and borrows heavily from pandas_, the popular data +Xarray is inspired by and borrows heavily from pandas_, the popular data analysis package focused on labelled tabular data. It is particularly tailored to working with netCDF_ files, which were the source of xarray's data model, and integrates tightly with dask_ for parallel diff --git a/doc/io.rst b/doc/io.rst index 0943b598a7f..7f0c2333ce5 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -15,82 +15,6 @@ format (recommended). import xarray as xr np.random.seed(123456) -.. _io.pickle: - -Pickle ------- - -The simplest way to serialize an xarray object is to use Python's built-in pickle -module: - -.. ipython:: python - - import pickle - - ds = xr.Dataset({'foo': (('x', 'y'), np.random.rand(4, 5))}, - coords={'x': [10, 20, 30, 40], - 'y': pd.date_range('2000-01-01', periods=5), - 'z': ('x', list('abcd'))}) - - # use the highest protocol (-1) because it is way faster than the default - # text based pickle format - pkl = pickle.dumps(ds, protocol=-1) - - pickle.loads(pkl) - -Pickling is important because it doesn't require any external libraries -and lets you use xarray objects with Python modules like -:py:mod:`multiprocessing` or :ref:`Dask `. However, pickling is -**not recommended for long-term storage**. - -Restoring a pickle requires that the internal structure of the types for the -pickled data remain unchanged. Because the internal design of xarray is still -being refined, we make no guarantees (at this point) that objects pickled with -this version of xarray will work in future versions. - -.. note:: - - When pickling an object opened from a NetCDF file, the pickle file will - contain a reference to the file on disk. If you want to store the actual - array values, load it into memory first with :py:meth:`~xarray.Dataset.load` - or :py:meth:`~xarray.Dataset.compute`. - -.. _dictionary io: - -Dictionary ----------- - -We can convert a ``Dataset`` (or a ``DataArray``) to a dict using -:py:meth:`~xarray.Dataset.to_dict`: - -.. ipython:: python - - d = ds.to_dict() - d - -We can create a new xarray object from a dict using -:py:meth:`~xarray.Dataset.from_dict`: - -.. ipython:: python - - ds_dict = xr.Dataset.from_dict(d) - ds_dict - -Dictionary support allows for flexible use of xarray objects. It doesn't -require external libraries and dicts can easily be pickled, or converted to -json, or geojson. All the values are converted to lists, so dicts might -be quite large. - -To export just the dataset schema, without the data itself, use the -``data=False`` option: - -.. ipython:: python - - ds.to_dict(data=False) - -This can be useful for generating indices of dataset contents to expose to -search indices or other automated data discovery tools. - .. _io.netcdf: netCDF @@ -127,12 +51,25 @@ We can save a Dataset to disk using the .. ipython:: python + ds = xr.Dataset({'foo': (('x', 'y'), np.random.rand(4, 5))}, + coords={'x': [10, 20, 30, 40], + 'y': pd.date_range('2000-01-01', periods=5), + 'z': ('x', list('abcd'))}) + ds.to_netcdf('saved_on_disk.nc') By default, the file is saved as netCDF4 (assuming netCDF4-Python is installed). You can control the format and engine used to write the file with the ``format`` and ``engine`` arguments. +.. tip:: + + Using the `h5netcdf `_ package + by passing ``engine='h5netcdf'`` to :py:meth:`~xarray.open_dataset` can + sometimes be quicker than the default ``engine='netcdf4'`` that uses the + `netCDF4 `_ package. + + We can load netCDF files to create a new Dataset using :py:func:`~xarray.open_dataset`: @@ -149,7 +86,15 @@ convert the ``DataArray`` to a ``Dataset`` before saving, and then convert back when loading, ensuring that the ``DataArray`` that is loaded is always exactly the same as the one that was saved. -Data is always loaded lazily from netCDF files. You can manipulate, slice and subset +A dataset can also be loaded or written to a specific group within a netCDF +file. To load from a group, pass a ``group`` keyword argument to the +``open_dataset`` function. The group can be specified as a path-like +string, e.g., to access subgroup 'bar' within group 'foo' pass +'/foo/bar' as the ``group`` argument. When writing multiple groups in one file, +pass ``mode='a'`` to ``to_netcdf`` to ensure that each call does not delete the +file. + +Data is *always* loaded lazily from netCDF files. You can manipulate, slice and subset Dataset and DataArray objects, and no array values are loaded into memory until you try to perform some sort of actual computation. For an example of how these lazy arrays work, see the OPeNDAP section below. @@ -251,8 +196,6 @@ will remove encoding information. :suppress: ds_disk.close() - import os - os.remove('saved_on_disk.nc') .. _combining multiple files: @@ -681,6 +624,83 @@ that require NASA's URS authentication:: __ http://docs.python-requests.org __ http://pydap.readthedocs.io/en/latest/client.html#authentication +.. _io.pickle: + +Pickle +------ + +The simplest way to serialize an xarray object is to use Python's built-in pickle +module: + +.. ipython:: python + + import pickle + + # use the highest protocol (-1) because it is way faster than the default + # text based pickle format + pkl = pickle.dumps(ds, protocol=-1) + + pickle.loads(pkl) + +Pickling is important because it doesn't require any external libraries +and lets you use xarray objects with Python modules like +:py:mod:`multiprocessing` or :ref:`Dask `. However, pickling is +**not recommended for long-term storage**. + +Restoring a pickle requires that the internal structure of the types for the +pickled data remain unchanged. Because the internal design of xarray is still +being refined, we make no guarantees (at this point) that objects pickled with +this version of xarray will work in future versions. + +.. note:: + + When pickling an object opened from a NetCDF file, the pickle file will + contain a reference to the file on disk. If you want to store the actual + array values, load it into memory first with :py:meth:`~xarray.Dataset.load` + or :py:meth:`~xarray.Dataset.compute`. + +.. _dictionary io: + +Dictionary +---------- + +We can convert a ``Dataset`` (or a ``DataArray``) to a dict using +:py:meth:`~xarray.Dataset.to_dict`: + +.. ipython:: python + + d = ds.to_dict() + d + +We can create a new xarray object from a dict using +:py:meth:`~xarray.Dataset.from_dict`: + +.. ipython:: python + + ds_dict = xr.Dataset.from_dict(d) + ds_dict + +Dictionary support allows for flexible use of xarray objects. It doesn't +require external libraries and dicts can easily be pickled, or converted to +json, or geojson. All the values are converted to lists, so dicts might +be quite large. + +To export just the dataset schema, without the data itself, use the +``data=False`` option: + +.. ipython:: python + + ds.to_dict(data=False) + +This can be useful for generating indices of dataset contents to expose to +search indices or other automated data discovery tools. + +.. ipython:: python + :suppress: + + import os + os.remove('saved_on_disk.nc') + .. _io.rasterio: Rasterio diff --git a/doc/quick-overview.rst b/doc/quick-overview.rst index 1224f59515b..7d84199323d 100644 --- a/doc/quick-overview.rst +++ b/doc/quick-overview.rst @@ -48,21 +48,21 @@ Here are the key properties for a ``DataArray``: Indexing -------- -xarray supports four kind of indexing. Since we have assigned coordinate labels to the x dimension we can use label-based indexing along that dimension just like pandas. The four examples below all yield the same result but at varying levels of convenience and intuitiveness. +xarray supports four kind of indexing. Since we have assigned coordinate labels to the x dimension we can use label-based indexing along that dimension just like pandas. The four examples below all yield the same result (the value at `x=10`) but at varying levels of convenience and intuitiveness. .. ipython:: python # positional and by integer label, like numpy - data[[0, 1]] + data[0, :] - # positional and by coordinate label, like pandas - data.loc[10:20] + # loc or "location": positional and coordinate label, like pandas + data.loc[10] - # by dimension name and integer label - data.isel(x=slice(2)) + # isel or "integer select": by dimension name and integer label + data.isel(x=0) - # by dimension name and coordinate label - data.sel(x=[10, 20]) + # sel or "select": by dimension name and coordinate label + data.sel(x=10) Unlike positional indexing, label-based indexing frees us from having to know how our array is organized. All we need to know are the dimension name and the label we wish to index i.e. ``data.sel(x=10)`` works regardless of whether ``x`` is the first or second dimension of the array and regardless of whether ``10`` is the first or second element of ``x``. We have already told xarray that x is the first dimension when we created ``data``: xarray keeps track of this so we don't have to. For more, see :ref:`indexing`. diff --git a/doc/why-xarray.rst b/doc/why-xarray.rst index 25d558d99d5..be8284d88c2 100644 --- a/doc/why-xarray.rst +++ b/doc/why-xarray.rst @@ -1,6 +1,10 @@ Overview: Why xarray? ===================== +Xarray introduces labels in the form of dimensions, coordinates and attributes on top of +raw NumPy-like multidimensional arrays, which allows for a more intuitive, more concise, +and less error-prone developer experience. + What labels enable ------------------ @@ -18,13 +22,14 @@ Xarray doesn't just keep track of labels on arrays -- it uses them to provide a powerful and concise interface. For example: - Apply operations over dimensions by name: ``x.sum('time')``. -- Select values by label instead of integer location: +- Select values by label (or logical location) instead of integer location: ``x.loc['2014-01-01']`` or ``x.sel(time='2014-01-01')``. - Mathematical operations (e.g., ``x - y``) vectorize across multiple dimensions (array broadcasting) based on dimension names, not shape. -- Flexible split-apply-combine operations with groupby: +- Easily use the `split-apply-combine `_ + paradigm with ``groupby``: ``x.groupby('time.dayofyear').mean()``. -- Database like alignment based on coordinate labels that smoothly +- Database-like alignment based on coordinate labels that smoothly handles missing values: ``x, y = xr.align(x, y, join='outer')``. - Keep track of arbitrary metadata in the form of a Python dictionary: ``x.attrs``. @@ -33,8 +38,8 @@ The N-dimensional nature of xarray's data structures makes it suitable for deali with multi-dimensional scientific data, and its use of dimension names instead of axis labels (``dim='time'`` instead of ``axis=0``) makes such arrays much more manageable than the raw numpy ndarray: with xarray, you don't -need to keep track of the order of arrays dimensions or insert dummy dimensions -(e.g., ``np.newaxis``) to align arrays. +need to keep track of the order of an array's dimensions or insert dummy dimensions of +size 1 to align arrays (e.g., using ``np.newaxis``). The immediate payoff of using xarray is that you'll write less code. The long-term payoff is that you'll understand what you were thinking when you come @@ -44,7 +49,7 @@ Core data structures -------------------- xarray has two core data structures, which build upon and extend the core -strengths of NumPy_ and pandas_. Both are fundamentally N-dimensional: +strengths of NumPy_ and pandas_. Both data structures are fundamentally N-dimensional: - :py:class:`~xarray.DataArray` is our implementation of a labeled, N-dimensional array. It is an N-D generalization of a :py:class:`pandas.Series`. The name diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 03276d61cf0..71025cb3040 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -614,8 +614,9 @@ def sizes(self) -> Mapping[Hashable, int]: return self.dims def load(self, **kwargs) -> "Dataset": - """Manually trigger loading of this dataset's data from disk or a - remote source into memory and return this dataset. + """Manually trigger loading and/or computation of this dataset's data + from disk or a remote source into memory and return this dataset. + Unlike compute, the original dataset is modified and returned. Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or @@ -771,9 +772,9 @@ def _dask_postpersist(dsk, info, *args): return Dataset._construct_direct(variables, *args) def compute(self, **kwargs) -> "Dataset": - """Manually trigger loading of this dataset's data from disk or a - remote source into memory and return a new dataset. The original is - left unaltered. + """Manually trigger loading and/or computation of this dataset's data + from disk or a remote source into memory and return a new dataset. + Unlike load, the original dataset is left unaltered. Normally, it should not be necessary to call this method in user code, because all xarray functions should either work on deferred data or @@ -816,10 +817,10 @@ def persist(self, **kwargs) -> "Dataset": """ Trigger computation, keeping data as dask arrays This operation can be used to trigger computation on underlying dask - arrays, similar to ``.compute()``. However this operation keeps the - data as dask arrays. This is particularly useful when using the - dask.distributed scheduler and you want to load a large amount of data - into distributed memory. + arrays, similar to ``.compute()`` or ``.load()``. However this + operation keeps the data as dask arrays. This is particularly useful + when using the dask.distributed scheduler and you want to load a large + amount of data into distributed memory. Parameters ---------- From 4254b4af33843f711459e5242018cd1d678ad3a0 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Fri, 4 Oct 2019 23:17:56 +0100 Subject: [PATCH 3/6] Lint (#3373) * raise exception instance, not class * isort * isort * Bump mypy version --- .pre-commit-config.yaml | 2 +- asv_bench/benchmarks/__init__.py | 2 +- asv_bench/benchmarks/dataset_io.py | 2 +- ci/requirements/py36-min-all-deps.yml | 2 +- ci/requirements/py36.yml | 2 +- ci/requirements/py37-windows.yml | 2 +- ci/requirements/py37.yml | 2 +- properties/test_encode_decode.py | 2 +- xarray/backends/common.py | 12 ++++++------ xarray/backends/file_manager.py | 6 +++--- xarray/coding/cftime_offsets.py | 2 +- xarray/coding/variables.py | 4 ++-- xarray/core/common.py | 2 +- xarray/core/concat.py | 2 +- xarray/core/coordinates.py | 8 ++++---- xarray/core/dataset.py | 4 ++-- xarray/tests/test_backends.py | 2 +- xarray/tests/test_concat.py | 1 + 18 files changed, 30 insertions(+), 29 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a9fb3d699ff..502120cd5dc 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.720 # Must match ci/requirements/*.yml + rev: v0.730 # Must match ci/requirements/*.yml hooks: - id: mypy # run these occasionally, ref discussion https://github.com/pydata/xarray/pull/3194 diff --git a/asv_bench/benchmarks/__init__.py b/asv_bench/benchmarks/__init__.py index 1ffd3afa4ae..b0adb2feafd 100644 --- a/asv_bench/benchmarks/__init__.py +++ b/asv_bench/benchmarks/__init__.py @@ -18,7 +18,7 @@ def requires_dask(): try: import dask # noqa: F401 except ImportError: - raise NotImplementedError + raise NotImplementedError() def randn(shape, frac_nan=None, chunks=None, seed=0): diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py index c1567d0b513..d1ffbc34706 100644 --- a/asv_bench/benchmarks/dataset_io.py +++ b/asv_bench/benchmarks/dataset_io.py @@ -458,7 +458,7 @@ def setup(self): try: import distributed except ImportError: - raise NotImplementedError + raise NotImplementedError() self.client = distributed.Client() self.write = create_delayed_write() diff --git a/ci/requirements/py36-min-all-deps.yml b/ci/requirements/py36-min-all-deps.yml index 1829f2a11e3..affbf8637fd 100644 --- a/ci/requirements/py36-min-all-deps.yml +++ b/ci/requirements/py36-min-all-deps.yml @@ -20,7 +20,7 @@ dependencies: - iris=2.2.0 - lxml=4.4.1 # optional dep of pydap - matplotlib=3.1.1 - - mypy==0.720 # Must match .pre-commit-config.yaml + - mypy==0.730 # Must match .pre-commit-config.yaml - nc-time-axis=1.2.0 - netcdf4=1.5.1.2 - numba=0.45.1 diff --git a/ci/requirements/py36.yml b/ci/requirements/py36.yml index 187a9c79fbf..bdb649f6f1b 100644 --- a/ci/requirements/py36.yml +++ b/ci/requirements/py36.yml @@ -20,7 +20,7 @@ dependencies: - iris>=1.10 - lxml # optional dep of pydap - matplotlib - - mypy==0.720 # Must match .pre-commit-config.yaml + - mypy==0.730 # Must match .pre-commit-config.yaml - nc-time-axis - netcdf4 - numba diff --git a/ci/requirements/py37-windows.yml b/ci/requirements/py37-windows.yml index 24297327393..79b54030bc6 100644 --- a/ci/requirements/py37-windows.yml +++ b/ci/requirements/py37-windows.yml @@ -20,7 +20,7 @@ dependencies: - iris>=1.10 - lxml # optional dep of pydap - matplotlib - - mypy==0.720 # Must match .pre-commit-config.yaml + - mypy==0.730 # Must match .pre-commit-config.yaml - nc-time-axis - netcdf4 - numba diff --git a/ci/requirements/py37.yml b/ci/requirements/py37.yml index 5a328c64cf9..a4fe2d82a6f 100644 --- a/ci/requirements/py37.yml +++ b/ci/requirements/py37.yml @@ -20,7 +20,7 @@ dependencies: - iris>=1.10 - lxml # optional dep of pydap - matplotlib - - mypy==0.720 # Must match .pre-commit-config.yaml + - mypy==0.730 # Must match .pre-commit-config.yaml - nc-time-axis - netcdf4 - numba diff --git a/properties/test_encode_decode.py b/properties/test_encode_decode.py index c7839608981..011e7a922d1 100644 --- a/properties/test_encode_decode.py +++ b/properties/test_encode_decode.py @@ -4,7 +4,7 @@ These ones pass, just as you'd hope! """ -import pytest +import pytest # isort:skip pytest.importorskip("hypothesis") diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 455b77907f9..72c7c5a517f 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -88,13 +88,13 @@ def __len__(self): return len(self.variables) def get_dimensions(self): # pragma: no cover - raise NotImplementedError + raise NotImplementedError() def get_attrs(self): # pragma: no cover - raise NotImplementedError + raise NotImplementedError() def get_variables(self): # pragma: no cover - raise NotImplementedError + raise NotImplementedError() def get_encoding(self): return {} @@ -247,13 +247,13 @@ def encode_attribute(self, a): return a def set_dimension(self, d, l): # pragma: no cover - raise NotImplementedError + raise NotImplementedError() def set_attribute(self, k, v): # pragma: no cover - raise NotImplementedError + raise NotImplementedError() def set_variable(self, k, v): # pragma: no cover - raise NotImplementedError + raise NotImplementedError() def store_dataset(self, dataset): """ diff --git a/xarray/backends/file_manager.py b/xarray/backends/file_manager.py index dfd38ff9f48..eac28852281 100644 --- a/xarray/backends/file_manager.py +++ b/xarray/backends/file_manager.py @@ -28,7 +28,7 @@ class FileManager: def acquire(self, needs_lock=True): """Acquire the file object from this manager.""" - raise NotImplementedError + raise NotImplementedError() def acquire_context(self, needs_lock=True): """Context manager for acquiring a file. Yields a file object. @@ -37,11 +37,11 @@ def acquire_context(self, needs_lock=True): (i.e., removes it from any cache) if an exception is raised from the context. It *does not* automatically close the file. """ - raise NotImplementedError + raise NotImplementedError() def close(self, needs_lock=True): """Close the file object associated with this manager, if needed.""" - raise NotImplementedError + raise NotImplementedError() class CachingFileManager(FileManager): diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 223eff571ae..d7841fd43f8 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -181,7 +181,7 @@ def _get_day_of_month(other, day_option): elif day_option is None: # Note: unlike `_shift_month`, _get_day_of_month does not # allow day_option = None - raise NotImplementedError + raise NotImplementedError() else: raise ValueError(day_option) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index f54ae7867d8..7adaca4e9bc 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -36,12 +36,12 @@ class VariableCoder: def encode(self, variable, name=None): # pragma: no cover # type: (Variable, Any) -> Variable """Convert an encoded variable to a decoded variable.""" - raise NotImplementedError + raise NotImplementedError() def decode(self, variable, name=None): # pragma: no cover # type: (Variable, Any) -> Variable """Convert an decoded variable to a encoded variable.""" - raise NotImplementedError + raise NotImplementedError() class _ElementwiseFunctionArray(indexing.ExplicitlyIndexedNDArrayMixin): diff --git a/xarray/core/common.py b/xarray/core/common.py index a8fac245c02..bf15e9907c4 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1216,7 +1216,7 @@ def __exit__(self, exc_type, exc_value, traceback) -> None: def __getitem__(self, value): # implementations of this class should implement this method - raise NotImplementedError + raise NotImplementedError() def full_like(other, fill_value, dtype: DTypeLike = None): diff --git a/xarray/core/concat.py b/xarray/core/concat.py index e68c247d880..5c9beda3f74 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -4,7 +4,7 @@ from . import dtypes, utils from .alignment import align -from .merge import unique_variable, _VALID_COMPAT +from .merge import _VALID_COMPAT, unique_variable from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 430e507396b..ce17973866e 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -6,8 +6,8 @@ Hashable, Iterator, Mapping, - Set, Sequence, + Set, Tuple, Union, cast, @@ -17,7 +17,7 @@ from . import formatting, indexing from .indexes import Indexes -from .merge import merge_coords, merge_coordinates_without_align +from .merge import merge_coordinates_without_align, merge_coords from .utils import Frozen, ReprObject, either_dict_or_kwargs from .variable import Variable @@ -53,10 +53,10 @@ def indexes(self) -> Indexes: @property def variables(self): - raise NotImplementedError + raise NotImplementedError() def _update_coords(self, coords, indexes): - raise NotImplementedError + raise NotImplementedError() def __iter__(self) -> Iterator["Hashable"]: # needs to be in the same order as the dataset variables diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 71025cb3040..d394e05b07a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -64,8 +64,8 @@ from .merge import ( dataset_merge_method, dataset_update_method, - merge_data_and_coords, merge_coordinates_without_align, + merge_data_and_coords, ) from .options import OPTIONS, _get_keep_attrs from .pycompat import dask_array_type @@ -76,9 +76,9 @@ decode_numpy_dict_values, either_dict_or_kwargs, hashable, - maybe_wrap_array, is_dict_like, is_list_like, + maybe_wrap_array, ) from .variable import IndexVariable, Variable, as_variable, broadcast_variables diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 87958824c7b..4645b4db796 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -237,7 +237,7 @@ class DatasetIOBase: file_format = None # type: Optional[str] def create_store(self): - raise NotImplementedError + raise NotImplementedError() @contextlib.contextmanager def roundtrip( diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 00428f70966..d2635e4451a 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -6,6 +6,7 @@ from xarray import DataArray, Dataset, Variable, concat from xarray.core import dtypes, merge + from . import ( InaccessibleArray, assert_array_equal, From 3e2a754ddaca501cba973270b2864c8897199d1f Mon Sep 17 00:00:00 2001 From: "Alan D. Snow" Date: Tue, 8 Oct 2019 09:36:52 -0500 Subject: [PATCH 4/6] added geocube and rioxarray to related projects (#3383) --- doc/related-projects.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/related-projects.rst b/doc/related-projects.rst index 647db5fd8e4..fd77ce56a0a 100644 --- a/doc/related-projects.rst +++ b/doc/related-projects.rst @@ -12,6 +12,7 @@ Geosciences - `aospy `_: Automated analysis and management of gridded climate data. - `climpred `_: Analysis of ensemble forecast models for climate prediction. +- `geocube `_: Tool to convert geopandas vector data into rasterized xarray data. - `infinite-diff `_: xarray-based finite-differencing, focused on gridded climate/meterology data - `marc_analysis `_: Analysis package for CESM/MARC experiments and output. - `MetPy `_: A collection of tools in Python for reading, visualizing, and performing calculations with weather data. @@ -25,6 +26,7 @@ Geosciences accessing data stored in GAMS Data eXchange (GDX) files. Also uses a custom subclass. - `Regionmask `_: plotting and creation of masks of spatial regions +- `rioxarray `_: geospatial xarray extension powered by rasterio - `salem `_: Adds geolocalised subsetting, masking, and plotting operations to xarray's data structures via accessors. - `SatPy `_ : Library for reading and manipulating meteorological remote sensing data and writing it to various image and data file formats. - `Spyfit `_: FTIR spectroscopy of the atmosphere From 6fb272c0fde4bfaca9b6322b18ac2cf962e26ee3 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Tue, 8 Oct 2019 22:23:46 +0100 Subject: [PATCH 5/6] Rolling minimum dependency versions policy (#3358) * - Downgrade numpy to 1.14, pandas to 0.20, scipy to 0.19 (24 months old) - Downgrade dask to 1.1 (6 months old) - Don't pin patch versions * Apply rolling policy (see #3222) * Automated tool to verify the minimum versions * Drop Python 3.5 * lint * Trivial cosmetic * Cosmetic * (temp) debug CI failure * Parallelize versions check script * Remove hacks for legacy dask * Documentation * Assorted cleanup * Assorted cleanup * Fix regression * Cleanup * type annotations upgraded to Python 3.6 * count_not_none backport * pd.Index.equals on legacy pandas returned False when comparing vs. a ndarray * Documentation * pathlib cleanup * Slide deprecations from 0.14 to 0.15 * More cleanups * More cleanups * Fix min_deps_check * Fix min_deps_check * Set policy of 12 months for pandas and scipy * Cleanup * Cleanup * Sphinx fix * Overhaul readthedocs environment * Fix test crash * Fix test crash * Prune readthedocs environment * Cleanup * Hack around versioneer bug on readthedocs CI * Code review * Prevent random timeouts in the readthedocs CI * What's New polish * Merge from Master * Trivial cosmetic * Reimplement pandas.core.common.count_not_none --- azure-pipelines.yml | 22 +- ci/min_deps_check.py | 187 ++++++++++++++ ci/requirements/doc.yml | 21 ++ ci/requirements/py35-bare-minimum.yml | 15 -- ci/requirements/py36-bare-minimum.yml | 11 + ci/requirements/py36-min-all-deps.yml | 61 ++--- ci/requirements/py36-min-nep18.yml | 12 +- ci/requirements/py36.yml | 13 +- ci/requirements/py37-windows.yml | 13 +- ci/requirements/py37.yml | 15 +- doc/conf.py | 2 +- doc/contributing.rst | 2 +- doc/environment.yml | 28 --- doc/groupby.rst | 4 +- doc/installing.rst | 74 ++++-- doc/io.rst | 1 + doc/pandas.rst | 3 +- doc/plotting.rst | 1 + doc/whats-new.rst | 40 ++- readthedocs.yml | 4 +- setup.py | 5 +- xarray/backends/api.py | 6 +- xarray/backends/file_manager.py | 2 +- xarray/backends/locks.py | 29 +-- xarray/backends/netCDF4_.py | 12 - xarray/backends/rasterio_.py | 13 +- xarray/backends/scipy_.py | 14 -- xarray/backends/zarr.py | 20 -- xarray/coding/cftime_offsets.py | 24 +- xarray/coding/cftimeindex.py | 5 - xarray/coding/times.py | 7 +- xarray/coding/variables.py | 18 +- xarray/core/alignment.py | 2 +- xarray/core/combine.py | 4 +- xarray/core/common.py | 33 +-- xarray/core/computation.py | 40 +-- xarray/core/dask_array_compat.py | 173 ------------- xarray/core/dask_array_ops.py | 34 +-- xarray/core/dataarray.py | 27 +- xarray/core/dataset.py | 78 ++---- xarray/core/duck_array_ops.py | 14 +- xarray/core/formatting.py | 7 +- xarray/core/indexing.py | 29 +-- xarray/core/merge.py | 4 +- xarray/core/missing.py | 8 +- xarray/core/npcompat.py | 284 ---------------------- xarray/core/pdcompat.py | 81 +----- xarray/core/rolling.py | 21 -- xarray/core/rolling_exp.py | 7 +- xarray/core/utils.py | 2 +- xarray/plot/utils.py | 18 +- xarray/testing.py | 6 +- xarray/tests/__init__.py | 31 +-- xarray/tests/test_accessor_str.py | 2 +- xarray/tests/test_backends.py | 33 +-- xarray/tests/test_cftimeindex.py | 122 +++++----- xarray/tests/test_cftimeindex_resample.py | 1 - xarray/tests/test_coding_times.py | 14 +- xarray/tests/test_combine.py | 2 +- xarray/tests/test_computation.py | 9 +- xarray/tests/test_dask.py | 33 +-- xarray/tests/test_dataarray.py | 11 +- xarray/tests/test_dataset.py | 39 +-- xarray/tests/test_distributed.py | 4 +- xarray/tests/test_duck_array_ops.py | 12 +- xarray/tests/test_indexing.py | 3 +- xarray/tests/test_plot.py | 11 +- xarray/tests/test_ufuncs.py | 15 +- xarray/tests/test_utils.py | 8 +- xarray/tests/test_variable.py | 8 - 70 files changed, 633 insertions(+), 1281 deletions(-) create mode 100755 ci/min_deps_check.py create mode 100644 ci/requirements/doc.yml delete mode 100644 ci/requirements/py35-bare-minimum.yml create mode 100644 ci/requirements/py36-bare-minimum.yml delete mode 100644 doc/environment.yml delete mode 100644 xarray/core/dask_array_compat.py diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d023aa317c7..c7f9de73cf4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -8,8 +8,8 @@ jobs: - job: Linux strategy: matrix: - py35-bare-minimum: - conda_env: py35-bare-minimum + py36-bare-minimum: + conda_env: py36-bare-minimum py36-min-all-deps: conda_env: py36-min-all-deps py36-min-nep18: @@ -82,13 +82,29 @@ jobs: mypy . displayName: mypy type checks +- job: MinimumVersionsPolicy + pool: + vmImage: 'ubuntu-16.04' + steps: + - template: ci/azure/add-conda-to-path.yml + - bash: | + conda install -y pyyaml + python ci/min_deps_check.py ci/requirements/py36-bare-minimum.yml + python ci/min_deps_check.py ci/requirements/py36-min-all-deps.yml + displayName: minimum versions policy + - job: Docs pool: vmImage: 'ubuntu-16.04' steps: - template: ci/azure/install.yml parameters: - env_file: doc/environment.yml + env_file: ci/requirements/doc.yml + - bash: | + source activate xarray-tests + # Replicate the exact environment created by the readthedocs CI + conda install --yes --quiet -c pkgs/main mock pillow sphinx sphinx_rtd_theme + displayName: Replicate readthedocs CI environment - bash: | source activate xarray-tests cd doc diff --git a/ci/min_deps_check.py b/ci/min_deps_check.py new file mode 100755 index 00000000000..3bdd48ca76d --- /dev/null +++ b/ci/min_deps_check.py @@ -0,0 +1,187 @@ +"""Fetch from conda database all available versions of the xarray dependencies and their +publication date. Compare it against requirements/py36-min-all-deps.yml to verify the +policy on obsolete dependencies is being followed. Print a pretty report :) +""" +import subprocess +import sys +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime, timedelta +from typing import Dict, Iterator, Tuple + +import yaml + +IGNORE_DEPS = { + "black", + "coveralls", + "flake8", + "hypothesis", + "mypy", + "pip", + "pytest", + "pytest-cov", + "pytest-env", +} + +POLICY_MONTHS = {"python": 42, "numpy": 24, "pandas": 12, "scipy": 12} +POLICY_MONTHS_DEFAULT = 6 + +has_errors = False + + +def error(msg: str) -> None: + global has_errors + has_errors = True + print("ERROR:", msg) + + +def parse_requirements(fname) -> Iterator[Tuple[str, int, int]]: + """Load requirements/py36-min-all-deps.yml + + Yield (package name, major version, minor version) + """ + global has_errors + + with open(fname) as fh: + contents = yaml.safe_load(fh) + for row in contents["dependencies"]: + if isinstance(row, dict) and list(row) == ["pip"]: + continue + pkg, eq, version = row.partition("=") + if pkg.rstrip("<>") in IGNORE_DEPS: + continue + if pkg.endswith("<") or pkg.endswith(">") or eq != "=": + error("package should be pinned with exact version: " + row) + continue + try: + major, minor = version.split(".") + except ValueError: + error("expected major.minor (without patch): " + row) + continue + try: + yield pkg, int(major), int(minor) + except ValueError: + error("failed to parse version: " + row) + + +def query_conda(pkg: str) -> Dict[Tuple[int, int], datetime]: + """Query the conda repository for a specific package + + Return map of {(major version, minor version): publication date} + """ + stdout = subprocess.check_output( + ["conda", "search", pkg, "--info", "-c", "defaults", "-c", "conda-forge"] + ) + out = {} # type: Dict[Tuple[int, int], datetime] + major = None + minor = None + + for row in stdout.decode("utf-8").splitlines(): + label, _, value = row.partition(":") + label = label.strip() + if label == "file name": + value = value.strip()[len(pkg) :] + major, minor = value.split("-")[1].split(".")[:2] + major = int(major) + minor = int(minor) + if label == "timestamp": + assert major is not None + assert minor is not None + ts = datetime.strptime(value.split()[0].strip(), "%Y-%m-%d") + + if (major, minor) in out: + out[major, minor] = min(out[major, minor], ts) + else: + out[major, minor] = ts + + # Hardcoded fix to work around incorrect dates in conda + if pkg == "python": + out.update( + { + (2, 7): datetime(2010, 6, 3), + (3, 5): datetime(2015, 9, 13), + (3, 6): datetime(2016, 12, 23), + (3, 7): datetime(2018, 6, 27), + (3, 8): datetime(2019, 10, 14), + } + ) + + return out + + +def process_pkg( + pkg: str, req_major: int, req_minor: int +) -> Tuple[str, int, int, str, int, int, str, str]: + """Compare package version from requirements file to available versions in conda. + Return row to build pandas dataframe: + + - package name + - major version in requirements file + - minor version in requirements file + - publication date of version in requirements file (YYYY-MM-DD) + - major version suggested by policy + - minor version suggested by policy + - publication date of version suggested by policy (YYYY-MM-DD) + - status ("<", "=", "> (!)") + """ + print("Analyzing %s..." % pkg) + versions = query_conda(pkg) + + try: + req_published = versions[req_major, req_minor] + except KeyError: + error("not found in conda: " + pkg) + return pkg, req_major, req_minor, "-", 0, 0, "-", "(!)" + + policy_months = POLICY_MONTHS.get(pkg, POLICY_MONTHS_DEFAULT) + policy_published = datetime.now() - timedelta(days=policy_months * 30) + + policy_major = req_major + policy_minor = req_minor + policy_published_actual = req_published + for (major, minor), published in reversed(sorted(versions.items())): + if published < policy_published: + break + policy_major = major + policy_minor = minor + policy_published_actual = published + + if (req_major, req_minor) < (policy_major, policy_minor): + status = "<" + elif (req_major, req_minor) > (policy_major, policy_minor): + status = "> (!)" + error("Package is too new: " + pkg) + else: + status = "=" + + return ( + pkg, + req_major, + req_minor, + req_published.strftime("%Y-%m-%d"), + policy_major, + policy_minor, + policy_published_actual.strftime("%Y-%m-%d"), + status, + ) + + +def main() -> None: + fname = sys.argv[1] + with ThreadPoolExecutor(8) as ex: + futures = [ + ex.submit(process_pkg, pkg, major, minor) + for pkg, major, minor in parse_requirements(fname) + ] + rows = [f.result() for f in futures] + + print("Package Required Policy Status") + print("------------- ----------------- ----------------- ------") + fmt = "{:13} {:>1d}.{:<2d} ({:10}) {:>1d}.{:<2d} ({:10}) {}" + for row in rows: + print(fmt.format(*row)) + + assert not has_errors + + +if __name__ == "__main__": + main() diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml new file mode 100644 index 00000000000..e521ee4a4b8 --- /dev/null +++ b/ci/requirements/doc.yml @@ -0,0 +1,21 @@ +name: xarray-docs +channels: + # Don't change to pkgs/main, as it causes random timeouts in readthedocs + - conda-forge +dependencies: + - python=3.7 + - bottleneck + - cartopy + - h5netcdf + - ipython + - iris + - netcdf4 + - numpy + - numpydoc + - pandas<0.25 # Hack around https://github.com/pydata/xarray/issues/3369 + - rasterio + - seaborn + - sphinx + - sphinx-gallery + - sphinx_rtd_theme + - zarr diff --git a/ci/requirements/py35-bare-minimum.yml b/ci/requirements/py35-bare-minimum.yml deleted file mode 100644 index 7651a1bdcf1..00000000000 --- a/ci/requirements/py35-bare-minimum.yml +++ /dev/null @@ -1,15 +0,0 @@ -name: xarray-tests -channels: - - conda-forge -dependencies: - - python=3.5.3 - - pytest - - flake8 - - mock - - pip - - numpy=1.12 - - pandas=0.19.2 - - pip: - - pytest-env - - pytest-cov - - coveralls diff --git a/ci/requirements/py36-bare-minimum.yml b/ci/requirements/py36-bare-minimum.yml new file mode 100644 index 00000000000..05186bc8748 --- /dev/null +++ b/ci/requirements/py36-bare-minimum.yml @@ -0,0 +1,11 @@ +name: xarray-tests +channels: + - conda-forge +dependencies: + - python=3.6 + - coveralls + - pytest + - pytest-cov + - pytest-env + - numpy=1.14 + - pandas=0.24 diff --git a/ci/requirements/py36-min-all-deps.yml b/ci/requirements/py36-min-all-deps.yml index affbf8637fd..4e4f8550e16 100644 --- a/ci/requirements/py36-min-all-deps.yml +++ b/ci/requirements/py36-min-all-deps.yml @@ -2,42 +2,47 @@ name: xarray-tests channels: - conda-forge dependencies: - - python=3.6.7 + # MINIMUM VERSIONS POLICY: see doc/installing.rst + # Run ci/min_deps_check.py to verify that this file respects the policy. + # When upgrading python, numpy, or pandas, must also change + # doc/installing.rst and setup.py. + - python=3.6 - black - - boto3=1.9.235 - - bottleneck=1.2.1 - - cdms2=3.1.3 - - cfgrib=0.9.7.2 - - cftime=1.0.3.4 + - boto3=1.9 + - bottleneck=1.2 + - cartopy=0.17 + - cdms2=3.1 + - cfgrib=0.9 + - cftime=1.0 - coveralls - - dask=2.4.0 - - distributed=2.4.0 + - dask=1.2 + - distributed=1.27 - flake8 - - h5netcdf=0.7.4 - - h5py=2.10.0 - - hdf5=1.10.5 + - h5netcdf=0.7 + - h5py=2.9 # Policy allows for 2.10, but it's a conflict-fest + - hdf5=1.10 - hypothesis - - iris=2.2.0 - - lxml=4.4.1 # optional dep of pydap - - matplotlib=3.1.1 - - mypy==0.730 # Must match .pre-commit-config.yaml - - nc-time-axis=1.2.0 - - netcdf4=1.5.1.2 - - numba=0.45.1 - - numpy=1.17.2 - - pandas=0.25.1 + - iris=2.2 + - lxml=4.4 # Optional dep of pydap + - matplotlib=3.1 + - mypy=0.730 # Must match .pre-commit-config.yaml + - nc-time-axis=1.2 + - netcdf4=1.4 + - numba=0.44 + - numpy=1.14 + - pandas=0.24 - pip - - pseudonetcdf=3.0.2 - - pydap=3.2.2 - - pynio=1.5.5 + - pseudonetcdf=3.0 + - pydap=3.2 + - pynio=1.5 - pytest - pytest-cov - pytest-env - - rasterio=1.0.28 - - scipy=1.3.1 - - seaborn=0.9.0 + - rasterio=1.0 + - scipy=1.0 # Policy allows for 1.2, but scipy>=1.1 breaks numpy=1.14 + - seaborn=0.9 # - sparse # See py36-min-nep18.yml - - toolz=0.10.0 - - zarr=2.3.2 + - toolz=0.10 + - zarr=2.3 - pip: - numbagg==0.1 diff --git a/ci/requirements/py36-min-nep18.yml b/ci/requirements/py36-min-nep18.yml index 8680e412a99..5b291cf554c 100644 --- a/ci/requirements/py36-min-nep18.yml +++ b/ci/requirements/py36-min-nep18.yml @@ -4,14 +4,14 @@ channels: dependencies: # Optional dependencies that require NEP18, such as sparse, # require drastically newer packages than everything else - - python=3.6.7 + - python=3.6 - coveralls - - dask=2.4.0 - - distributed=2.4.0 + - dask=2.4 + - distributed=2.4 - numpy=1.17 - - pandas=0.25 + - pandas=0.24 - pytest - pytest-cov - pytest-env - - scipy=1.3 - - sparse=0.8.0 + - scipy=1.2 + - sparse=0.8 diff --git a/ci/requirements/py36.yml b/ci/requirements/py36.yml index bdb649f6f1b..cc91e8a12da 100644 --- a/ci/requirements/py36.yml +++ b/ci/requirements/py36.yml @@ -6,8 +6,9 @@ dependencies: - black - boto3 - bottleneck + - cartopy - cdms2 - - cfgrib>=0.9.2 + - cfgrib - cftime - coveralls - dask @@ -17,17 +18,17 @@ dependencies: - h5py - hdf5 - hypothesis - - iris>=1.10 + - iris - lxml # optional dep of pydap - matplotlib - - mypy==0.730 # Must match .pre-commit-config.yaml + - mypy=0.730 # Must match .pre-commit-config.yaml - nc-time-axis - netcdf4 - numba - - numpy>=1.12 - - pandas>=0.19 + - numpy + - pandas - pip - - pseudonetcdf>=3.0.1 + - pseudonetcdf - pydap - pynio - pytest diff --git a/ci/requirements/py37-windows.yml b/ci/requirements/py37-windows.yml index 79b54030bc6..bf485b59a49 100644 --- a/ci/requirements/py37-windows.yml +++ b/ci/requirements/py37-windows.yml @@ -6,6 +6,7 @@ dependencies: - black - boto3 - bottleneck + - cartopy # - cdms2 # Not available on Windows # - cfgrib>=0.9.2 # Causes Python interpreter crash on Windows - cftime @@ -17,17 +18,17 @@ dependencies: - h5py - hdf5 - hypothesis - - iris>=1.10 - - lxml # optional dep of pydap + - iris + - lxml # Optional dep of pydap - matplotlib - - mypy==0.730 # Must match .pre-commit-config.yaml + - mypy=0.730 # Must match .pre-commit-config.yaml - nc-time-axis - netcdf4 - numba - - numpy>=1.12 - - pandas>=0.19 + - numpy + - pandas - pip - - pseudonetcdf>=3.0.1 + - pseudonetcdf - pydap # - pynio # Not available on Windows - pytest diff --git a/ci/requirements/py37.yml b/ci/requirements/py37.yml index a4fe2d82a6f..5c9a1cec5b5 100644 --- a/ci/requirements/py37.yml +++ b/ci/requirements/py37.yml @@ -6,8 +6,9 @@ dependencies: - black - boto3 - bottleneck + - cartopy - cdms2 - - cfgrib>=0.9.2 + - cfgrib - cftime - coveralls - dask @@ -17,17 +18,17 @@ dependencies: - h5py - hdf5 - hypothesis - - iris>=1.10 - - lxml # optional dep of pydap + - iris + - lxml # Optional dep of pydap - matplotlib - - mypy==0.730 # Must match .pre-commit-config.yaml + - mypy=0.730 # Must match .pre-commit-config.yaml - nc-time-axis - netcdf4 - numba - - numpy>=1.12 - - pandas>=0.19 + - numpy + - pandas - pip - - pseudonetcdf>=3.0.1 + - pseudonetcdf - pydap - pynio - pytest diff --git a/doc/conf.py b/doc/conf.py index a80e470af26..7c1557a1e66 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -201,7 +201,7 @@ # Sometimes the savefig directory doesn't exist and needs to be created # https://github.com/ipython/ipython/issues/8733 -# becomes obsolete when we can pin ipython>=5.2; see doc/environment.yml +# becomes obsolete when we can pin ipython>=5.2; see ci/requirements/doc.yml ipython_savefig_dir = os.path.join( os.path.dirname(os.path.abspath(__file__)), "_build", "html", "_static" ) diff --git a/doc/contributing.rst b/doc/contributing.rst index 53b8cb51f60..66e8377600e 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -696,7 +696,7 @@ environment by:: or, to use a specific Python interpreter,:: - asv run -e -E existing:python3.5 + asv run -e -E existing:python3.6 This will display stderr from the benchmarks, and use your local ``python`` that comes from your ``$PATH``. diff --git a/doc/environment.yml b/doc/environment.yml deleted file mode 100644 index e1b5c5475f7..00000000000 --- a/doc/environment.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: xarray-docs -channels: - - conda-forge -dependencies: - - python=3.7 - - numpy=1.16.0 - - pandas=0.23.3 - - scipy=1.2.0 - - matplotlib=3.0.2 - - seaborn=0.9.0 - - dask=1.1.0 - - ipython=7.2.0 - - netCDF4=1.4.2 - - h5netcdf=0.7.4 - - cartopy=0.17.0 - - rasterio=1.0.24 - - zarr=2.2.0 - - iris=2.2.0 - - flake8=3.6.0 - - cftime=1.0.3.4 - - bottleneck=1.2.1 - - sphinx=1.8.2 - - numpydoc=0.8.0 - - sphinx-gallery=0.2.0 - - pillow=5.4.1 - - sphinx_rtd_theme=0.4.2 - - mock=2.0.0 - - pip diff --git a/doc/groupby.rst b/doc/groupby.rst index 03c0881d836..cc2682f2ee3 100644 --- a/doc/groupby.rst +++ b/doc/groupby.rst @@ -77,7 +77,7 @@ a customized coordinate, but xarray facilitates this via the x_bins = [0,25,50] ds.groupby_bins('x', x_bins).groups -The binning is implemented via `pandas.cut`__, whose documentation details how +The binning is implemented via :func:`pandas.cut`, whose documentation details how the bins are assigned. As seen in the example above, by default, the bins are labeled with strings using set notation to precisely identify the bin limits. To override this behavior, you can specify the bin labels explicitly. Here we @@ -88,8 +88,6 @@ choose `float` labels which identify the bin centers: x_bin_labels = [12.5,37.5] ds.groupby_bins('x', x_bins, labels=x_bin_labels).groups -__ http://pandas.pydata.org/pandas-docs/version/0.17.1/generated/pandas.cut.html - Apply ~~~~~ diff --git a/doc/installing.rst b/doc/installing.rst index a81f6c23328..b1bf072dbe1 100644 --- a/doc/installing.rst +++ b/doc/installing.rst @@ -6,9 +6,9 @@ Installation Required dependencies --------------------- -- Python (3.5.3 or later) -- `numpy `__ (1.12 or later) -- `pandas `__ (0.19.2 or later) +- Python (3.6 or later) +- `numpy `__ (1.14 or later) +- `pandas `__ (0.24 or later) Optional dependencies --------------------- @@ -32,7 +32,7 @@ For netCDF and IO for accessing CAMx, GEOS-Chem (bpch), NOAA ARL files, ICARTT files (ffi1001) and many other. - `rasterio `__: for reading GeoTiffs and - other gridded raster datasets. (version 1.0 or later) + other gridded raster datasets. - `iris `__: for conversion to and from iris' Cube objects - `cfgrib `__: for reading GRIB files via the @@ -41,30 +41,67 @@ For netCDF and IO For accelerating xarray ~~~~~~~~~~~~~~~~~~~~~~~ -- `scipy `__: necessary to enable the interpolation features for xarray objects +- `scipy `__: necessary to enable the interpolation features for + xarray objects - `bottleneck `__: speeds up NaN-skipping and rolling window aggregations by a large factor - (1.1 or later) - `numbagg `_: for exponential rolling window operations For parallel computing ~~~~~~~~~~~~~~~~~~~~~~ -- `dask.array `__ (0.16 or later): required for - :ref:`dask`. +- `dask.array `__: required for :ref:`dask`. For plotting ~~~~~~~~~~~~ - `matplotlib `__: required for :ref:`plotting` - (1.5 or later) -- `cartopy `__: recommended for - :ref:`plot-maps` +- `cartopy `__: recommended for :ref:`plot-maps` - `seaborn `__: for better color palettes - `nc-time-axis `__: for plotting - cftime.datetime objects (1.2.0 or later) + cftime.datetime objects + +Alternative data containers +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- `sparse `_: for sparse arrays +- Any numpy-like objects that support + `NEP-18 `_. + Note that while such libraries theoretically should work, they are untested. + Integration tests are in the process of being written for individual libraries. + + +.. _mindeps_policy: + +Minimum dependency versions +--------------------------- +xarray adopts a rolling policy regarding the minimum supported version of its +dependencies: + +- **Python:** 42 months + (`NEP-29 `_) +- **numpy:** 24 months + (`NEP-29 `_) +- **pandas:** 12 months +- **scipy:** 12 months +- **sparse** and other libraries that rely on + `NEP-18 `_ + for integration: very latest available versions only, until the technology will have + matured. This extends to dask when used in conjunction with any of these libraries. + numpy >=1.17. +- **all other libraries:** 6 months + +The above should be interpreted as *the minor version (X.Y) initially published no more +than N months ago*. Patch versions (x.y.Z) are not pinned, and only the latest available +at the moment of publishing the xarray release is guaranteed to work. + +You can see the actual minimum tested versions: + +- `For NEP-18 libraries + `_ +- `For everything else + `_ Instructions @@ -93,13 +130,9 @@ pandas) installed first. Then, install xarray with pip:: Testing ------- -To run the test suite after installing xarray, first install (via pypi or conda) - -- `py.test `__: Simple unit testing library -- `mock `__: additional testing library required for python version 2 - -and run -``py.test --pyargs xarray``. +To run the test suite after installing xarray, install (via pypi or conda) `py.test +`__ and run ``pytest`` in the root directory of the xarray +repository. Performance Monitoring @@ -110,7 +143,8 @@ A fixed-point performance monitoring of (a part of) our codes can be seen on To run these benchmark tests in a local machine, first install -- `airspeed-velocity `__: a tool for benchmarking Python packages over their lifetime. +- `airspeed-velocity `__: a tool for benchmarking + Python packages over their lifetime. and run ``asv run # this will install some conda environments in ./.asv/envs`` diff --git a/doc/io.rst b/doc/io.rst index 7f0c2333ce5..dab2a195e90 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -451,6 +451,7 @@ This feature is availabe through :py:func:`DataArray.to_netcdf` and and currently raises a warning unless ``invalid_netcdf=True`` is set: .. ipython:: python + :okwarning: # Writing complex valued data da = xr.DataArray([1.+1.j, 2.+2.j, 3.+3.j]) diff --git a/doc/pandas.rst b/doc/pandas.rst index 4fa73eec18c..4f3088b4c34 100644 --- a/doc/pandas.rst +++ b/doc/pandas.rst @@ -65,8 +65,7 @@ For datasets containing dask arrays where the data should be lazily loaded, see To create a ``Dataset`` from a ``DataFrame``, use the :py:meth:`~xarray.Dataset.from_dataframe` class method or the equivalent -:py:meth:`pandas.DataFrame.to_xarray ` method (pandas -v0.18 or later): +:py:meth:`pandas.DataFrame.to_xarray ` method: .. ipython:: python diff --git a/doc/plotting.rst b/doc/plotting.rst index 3e61e85f78c..e9d30fb63c8 100644 --- a/doc/plotting.rst +++ b/doc/plotting.rst @@ -249,6 +249,7 @@ As an alternative, also a step plot similar to matplotlib's ``plt.step`` can be made using 1D data. .. ipython:: python + :okwarning: @savefig plotting_example_step.png width=4in air1d[:20].plot.step(where='mid') diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 7103d7b8ab3..5b73059b34c 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -13,32 +13,59 @@ What's New import xarray as xr np.random.seed(123456) -.. _whats-new.0.13.1: +.. _whats-new.0.14.0: -v0.13.1 (unreleased) +v0.14.0 (unreleased) -------------------- +Breaking changes +~~~~~~~~~~~~~~~~ +- This release introduces a rolling policy for minimum dependency versions: + :ref:`mindeps_policy`. + + Several minimum versions have been increased: + + ============ ================== ==== + Package Old New + ============ ================== ==== + Python 3.5.3 3.6 + numpy 1.12 1.14 + pandas 0.19.2 0.24 + dask 0.16 (tested: 2.4) 1.2 + bottleneck 1.1 (tested: 1.2) 1.2 + matplotlib 1.5 (tested: 3.1) 3.1 + ============ ================== ==== + + Obsolete patch versions (x.y.Z) are not tested anymore. + The oldest supported versions of all optional dependencies are now covered by + automated tests (before, only the very latest versions were tested). + + (:issue:`3222`, :issue:`3293`, :issue:`3340`, :issue:`3346`, :issue:`3358`). + By `Guido Imperiale `_. + New functions/methods ~~~~~~~~~~~~~~~~~~~~~ Enhancements ~~~~~~~~~~~~ -- Add a repr for :py:class:`~xarray.core.GroupBy` objects. By `Deepak Cherian `_. +- Add a repr for :py:class:`~xarray.core.GroupBy` objects (:issue:`3344`). Example:: >>> da.groupby("time.season") DataArrayGroupBy, grouped over 'season' 4 groups with labels 'DJF', 'JJA', 'MAM', 'SON' + By `Deepak Cherian `_. + Bug fixes ~~~~~~~~~ - Reintroduce support for :mod:`weakref` (broken in v0.13.0). Support has been reinstated for :class:`DataArray` and :class:`Dataset` objects only. Internal xarray - objects remain unaddressable by weakref in order to save memory. - (:issue:`3317`) by `Guido Imperiale `_. + objects remain unaddressable by weakref in order to save memory + (:issue:`3317`). By `Guido Imperiale `_. - Line plots with the ``x`` or ``y`` argument set to a 1D non-dimensional coord - now plot the correct data for 2D DataArrays. + now plot the correct data for 2D DataArrays (:issue:`3334`). By `Tom Nicholas `_. Documentation @@ -58,6 +85,7 @@ Documentation (:pull:`3353`). By `Gregory Gundersen `_. + .. _whats-new.0.13.0: v0.13.0 (17 Sep 2019) diff --git a/readthedocs.yml b/readthedocs.yml index 8e9c09c9414..6429780e7d7 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,8 +1,8 @@ build: image: latest conda: - file: doc/environment.yml + file: ci/requirements/doc.yml python: - version: 3.6 + version: 3.7 setup_py_install: true formats: [] diff --git a/setup.py b/setup.py index 5cfa4d9f9df..08d4f54764f 100644 --- a/setup.py +++ b/setup.py @@ -16,14 +16,13 @@ "Intended Audience :: Science/Research", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Topic :: Scientific/Engineering", ] -PYTHON_REQUIRES = ">=3.5.3" -INSTALL_REQUIRES = ["numpy >= 1.12", "pandas >= 0.19.2"] +PYTHON_REQUIRES = ">=3.6" +INSTALL_REQUIRES = ["numpy >= 1.14", "pandas >= 0.24"] needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) SETUP_REQUIRES = ["pytest-runner >= 4.2"] if needs_pytest else [] TESTS_REQUIRE = ["pytest >= 2.7.1"] diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 458a2d0cc42..8f6881b804a 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -912,7 +912,7 @@ def open_mfdataset( # Remove this after deprecation cycle from #2616 is complete basic_msg = dedent( """\ - In xarray version 0.14 the default behaviour of `open_mfdataset` + In xarray version 0.15 the default behaviour of `open_mfdataset` will change. To retain the existing behavior, pass combine='nested'. To use future default behavior, pass combine='by_coords'. See @@ -963,11 +963,11 @@ def open_mfdataset( return combined -WRITEABLE_STORES = { +WRITEABLE_STORES: Dict[str, Callable] = { "netcdf4": backends.NetCDF4DataStore.open, "scipy": backends.ScipyDataStore, "h5netcdf": backends.H5NetCDFStore, -} # type: Dict[str, Callable] +} def to_netcdf( diff --git a/xarray/backends/file_manager.py b/xarray/backends/file_manager.py index eac28852281..0ff574b5d81 100644 --- a/xarray/backends/file_manager.py +++ b/xarray/backends/file_manager.py @@ -13,7 +13,7 @@ assert FILE_CACHE.maxsize, "file cache must be at least size one" -REF_COUNTS = {} # type: Dict[Any, int] +REF_COUNTS: Dict[Any, int] = {} _DEFAULT_MODE = utils.ReprObject("") diff --git a/xarray/backends/locks.py b/xarray/backends/locks.py index 865ce1ddccd..d0bf790f074 100644 --- a/xarray/backends/locks.py +++ b/xarray/backends/locks.py @@ -21,7 +21,7 @@ NETCDFC_LOCK = SerializableLock() -_FILE_LOCKS = weakref.WeakValueDictionary() # type: MutableMapping[Any, threading.Lock] +_FILE_LOCKS: MutableMapping[Any, threading.Lock] = weakref.WeakValueDictionary() def _get_threaded_lock(key): @@ -72,17 +72,11 @@ def _get_scheduler(get=None, collection=None): dask.base.get_scheduler """ try: - # dask 0.18.1 and later - from dask.base import get_scheduler - - actual_get = get_scheduler(get, collection) + import dask # noqa: F401 except ImportError: - try: - from dask.utils import effective_get + return None - actual_get = effective_get(get, collection) - except ImportError: - return None + actual_get = dask.base.get_scheduler(get, collection) try: from dask.distributed import Client @@ -90,15 +84,12 @@ def _get_scheduler(get=None, collection=None): if isinstance(actual_get.__self__, Client): return "distributed" except (ImportError, AttributeError): - try: - import dask.multiprocessing - - if actual_get == dask.multiprocessing.get: - return "multiprocessing" - else: - return "threaded" - except ImportError: - return "threaded" + pass + + if actual_get is dask.multiprocessing.get: + return "multiprocessing" + else: + return "threaded" def get_write_lock(key): diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 813942c2f32..203a2157e70 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -1,9 +1,7 @@ import functools import operator -import warnings from collections import OrderedDict from contextlib import suppress -from distutils.version import LooseVersion import numpy as np @@ -354,16 +352,6 @@ def open( ): import netCDF4 - if len(filename) == 88 and LooseVersion(netCDF4.__version__) < "1.3.1": - warnings.warn( - "A segmentation fault may occur when the " - "file path has exactly 88 characters as it does " - "in this case. The issue is known to occur with " - "version 1.2.4 of netCDF4 and can be addressed by " - "upgrading netCDF4 to at least version 1.3.1. " - "More details can be found here: " - "https://github.com/pydata/xarray/issues/1745" - ) if format is None: format = "NETCDF4" diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index 316f13470b7..deff2eaed66 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -1,7 +1,6 @@ import os import warnings from collections import OrderedDict -from distutils.version import LooseVersion import numpy as np @@ -253,18 +252,14 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, loc coords["band"] = np.asarray(riods.indexes) # Get coordinates - if LooseVersion(rasterio.__version__) < "1.0": - transform = riods.affine - else: - transform = riods.transform - if transform.is_rectilinear: + if riods.transform.is_rectilinear: # 1d coordinates parse = True if parse_coordinates is None else parse_coordinates if parse: nx, ny = riods.width, riods.height # xarray coordinates are pixel centered - x, _ = (np.arange(nx) + 0.5, np.zeros(nx) + 0.5) * transform - _, y = (np.zeros(ny) + 0.5, np.arange(ny) + 0.5) * transform + x, _ = (np.arange(nx) + 0.5, np.zeros(nx) + 0.5) * riods.transform + _, y = (np.zeros(ny) + 0.5, np.arange(ny) + 0.5) * riods.transform coords["y"] = y coords["x"] = x else: @@ -287,7 +282,7 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, loc # For serialization store as tuple of 6 floats, the last row being # always (0, 0, 1) per definition (see # https://github.com/sgillies/affine) - attrs["transform"] = tuple(transform)[:6] + attrs["transform"] = tuple(riods.transform)[:6] if hasattr(riods, "crs") and riods.crs: # CRS is a dict-like object specific to rasterio # If CRS is not None, we convert it back to a PROJ4 string using diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index c4f9666f0c1..7f93ca237b1 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -1,6 +1,4 @@ -import warnings from collections import OrderedDict -from distutils.version import LooseVersion from io import BytesIO import numpy as np @@ -113,18 +111,6 @@ class ScipyDataStore(WritableCFDataStore): def __init__( self, filename_or_obj, mode="r", format=None, group=None, mmap=None, lock=None ): - import scipy - import scipy.io - - if mode != "r" and scipy.__version__ < LooseVersion("0.13"): # pragma: no cover - warnings.warn( - "scipy %s detected; " - "the minimal recommended version is 0.13. " - "Older version of this library do not reliably " - "read and write files." % scipy.__version__, - ImportWarning, - ) - if group is not None: raise ValueError( "cannot save to a group with the " "scipy.io.netcdf backend" diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 9a115de55ef..b550efe052e 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,6 +1,5 @@ import warnings from collections import OrderedDict -from distutils.version import LooseVersion import numpy as np @@ -254,25 +253,6 @@ def open_group( ): import zarr - min_zarr = "2.2" - - if LooseVersion(zarr.__version__) < min_zarr: # pragma: no cover - raise NotImplementedError( - "Zarr version %s or greater is " - "required by xarray. See zarr " - "installation " - "http://zarr.readthedocs.io/en/stable/" - "#installation" % min_zarr - ) - - if consolidated or consolidate_on_close: - if LooseVersion(zarr.__version__) <= "2.2.1.dev2": # pragma: no cover - raise NotImplementedError( - "Zarr version 2.2.1.dev2 or greater " - "is required by for consolidated " - "metadata." - ) - open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group) if consolidated: # TODO: an option to pass the metadata_key keyword diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index d7841fd43f8..515d309d75b 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -43,10 +43,11 @@ import re from datetime import timedelta from functools import partial -from typing import ClassVar +from typing import ClassVar, Optional import numpy as np +from ..core.pdcompat import count_not_none from .cftimeindex import CFTimeIndex, _parse_iso8601_with_reso from .times import format_cftime_datetime @@ -73,8 +74,8 @@ def get_date_type(calendar): class BaseCFTimeOffset: - _freq = None # type: ClassVar[str] - _day_option = None # type: ClassVar[str] + _freq: ClassVar[Optional[str]] = None + _day_option: ClassVar[Optional[str]] = None def __init__(self, n=1): if not isinstance(n, int): @@ -350,8 +351,8 @@ class QuarterOffset(BaseCFTimeOffset): """Quarter representation copied off of pandas/tseries/offsets.py """ - _freq = None # type: ClassVar[str] - _default_month = None # type: ClassVar[int] + _freq: ClassVar[str] + _default_month: ClassVar[int] def __init__(self, n=1, month=None): BaseCFTimeOffset.__init__(self, n) @@ -447,9 +448,9 @@ def rollback(self, date): class YearOffset(BaseCFTimeOffset): - _freq = None # type: ClassVar[str] - _day_option = None # type: ClassVar[str] - _default_month = None # type: ClassVar[int] + _freq: ClassVar[str] + _day_option: ClassVar[str] + _default_month: ClassVar[int] def __init__(self, n=1, month=None): BaseCFTimeOffset.__init__(self, n) @@ -774,11 +775,6 @@ def _generate_range(start, end, periods, offset): current = next_date -def _count_not_none(*args): - """Compute the number of non-None arguments.""" - return sum([arg is not None for arg in args]) - - def cftime_range( start=None, end=None, @@ -957,7 +953,7 @@ def cftime_range( pandas.date_range """ # Adapted from pandas.core.indexes.datetimes._generate_range. - if _count_not_none(start, end, periods, freq) != 3: + if count_not_none(start, end, periods, freq) != 3: raise ValueError( "Of the arguments 'start', 'end', 'periods', and 'freq', three " "must be specified at a time." diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 16ab258d32e..802dd94f06c 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -437,11 +437,6 @@ def __sub__(self, other): def __rsub__(self, other): return pd.TimedeltaIndex(other - np.array(self)) - def _add_delta(self, deltas): - # To support TimedeltaIndex + CFTimeIndex with older versions of - # pandas. No longer used as of pandas 0.23. - return self + deltas - def to_datetimeindex(self, unsafe=False): """If possible, convert this index to a pandas.DatetimeIndex. diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 7b5a7c56a53..1508fb50b38 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from pandas.errors import OutOfBoundsDatetime from ..core import indexing from ..core.common import contains_cftime_datetimes @@ -21,12 +22,6 @@ unpack_for_encoding, ) -try: - from pandas.errors import OutOfBoundsDatetime -except ImportError: - # pandas < 0.20 - from pandas.tslib import OutOfBoundsDatetime - # standard calendars recognized by cftime _STANDARD_CALENDARS = {"standard", "gregorian", "proleptic_gregorian"} diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 7adaca4e9bc..f78502d81be 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -1,7 +1,7 @@ """Coders for individual Variable objects.""" import warnings from functools import partial -from typing import Any +from typing import Any, Hashable import numpy as np import pandas as pd @@ -33,14 +33,18 @@ class VariableCoder: variables in the underlying store. """ - def encode(self, variable, name=None): # pragma: no cover - # type: (Variable, Any) -> Variable - """Convert an encoded variable to a decoded variable.""" + def encode( + self, variable: Variable, name: Hashable = None + ) -> Variable: # pragma: no cover + """Convert an encoded variable to a decoded variable + """ raise NotImplementedError() - def decode(self, variable, name=None): # pragma: no cover - # type: (Variable, Any) -> Variable - """Convert an decoded variable to a encoded variable.""" + def decode( + self, variable: Variable, name: Hashable = None + ) -> Variable: # pragma: no cover + """Convert an decoded variable to a encoded variable + """ raise NotImplementedError() diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 3bc60db0a0b..b4fee1773b8 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -549,7 +549,7 @@ def reindex_variables( if dim in variables: var = variables[dim] - args = (var.attrs, var.encoding) # type: tuple + args: tuple = (var.attrs, var.encoding) else: args = () reindexed[dim] = IndexVariable((dim,), target, *args) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 38befd5698f..8c3555941c4 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -789,7 +789,7 @@ def auto_combine( if not from_openmfds: basic_msg = dedent( """\ - In xarray version 0.14 `auto_combine` will be deprecated. See + In xarray version 0.15 `auto_combine` will be deprecated. See http://xarray.pydata.org/en/stable/combining.html#combining-multi""" ) warnings.warn(basic_msg, FutureWarning, stacklevel=2) @@ -831,7 +831,7 @@ def auto_combine( message += dedent( """\ The datasets supplied require both concatenation and merging. From - xarray version 0.14 this will operation will require either using the + xarray version 0.15 this will operation will require either using the new `combine_nested` function (or the `combine='nested'` option to open_mfdataset), with a nested list structure such that you can combine along the dimensions {}. Alternatively if your datasets have global diff --git a/xarray/core/common.py b/xarray/core/common.py index bf15e9907c4..b1a513e05a0 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -193,10 +193,9 @@ def __init_subclass__(cls): """Verify that all subclasses explicitly define ``__slots__``. If they don't, raise error in the core xarray module and a FutureWarning in third-party extensions. - This check is only triggered in Python 3.6+. """ if not hasattr(object.__new__(cls), "__dict__"): - cls.__setattr__ = cls._setattr_slots + pass elif cls.__module__.startswith("xarray."): raise AttributeError("%s must explicitly define __slots__" % cls.__name__) else: @@ -230,12 +229,11 @@ def __getattr__(self, name: str) -> Any: "%r object has no attribute %r" % (type(self).__name__, name) ) - # This complicated three-method design boosts overall performance of simple - # operations - particularly DataArray methods that perform a _to_temp_dataset() - # round-trip - by a whopping 8% compared to a single method that checks - # hasattr(self, "__dict__") at runtime before every single assignment (like - # _setattr_py35 does). All of this is just temporary until the FutureWarning can be - # changed into a hard crash. + # This complicated two-method design boosts overall performance of simple operations + # - particularly DataArray methods that perform a _to_temp_dataset() round-trip - by + # a whopping 8% compared to a single method that checks hasattr(self, "__dict__") at + # runtime before every single assignment. All of this is just temporary until the + # FutureWarning can be changed into a hard crash. def _setattr_dict(self, name: str, value: Any) -> None: """Deprecated third party subclass (see ``__init_subclass__`` above) """ @@ -251,7 +249,7 @@ def _setattr_dict(self, name: str, value: Any) -> None: stacklevel=2, ) - def _setattr_slots(self, name: str, value: Any) -> None: + def __setattr__(self, name: str, value: Any) -> None: """Objects with ``__slots__`` raise AttributeError if you try setting an undeclared attribute. This is desirable, but the error message could use some improvement. @@ -269,14 +267,6 @@ def _setattr_slots(self, name: str, value: Any) -> None: % (name, type(self).__name__) ) from e - def _setattr_py35(self, name: str, value: Any) -> None: - if hasattr(self, "__dict__"): - return self._setattr_dict(name, value) - return self._setattr_slots(name, value) - - # Overridden in Python >=3.6 by __init_subclass__ - __setattr__ = _setattr_py35 - def __dir__(self) -> List[str]: """Provide method name lookup and completion. Only provide 'public' methods. @@ -392,7 +382,7 @@ def get_index(self, key: Hashable) -> pd.Index: def _calc_assign_results( self: C, kwargs: Mapping[Hashable, Union[T, Callable[[C], T]]] ) -> MutableMapping[Hashable, T]: - results = SortedKeysDict() # type: SortedKeysDict[Hashable, T] + results: MutableMapping[Hashable, T] = SortedKeysDict() for k, v in kwargs.items(): if callable(v): results[k] = v(self) @@ -1040,13 +1030,8 @@ def resample( grouper = CFTimeGrouper(freq, closed, label, base, loffset) else: - # TODO: to_offset() call required for pandas==0.19.2 grouper = pd.Grouper( - freq=freq, - closed=closed, - label=label, - base=base, - loffset=pd.tseries.frequencies.to_offset(loffset), + freq=freq, closed=closed, label=label, base=base, loffset=loffset ) group = DataArray( dim_coord, coords=dim_coord.coords, dims=dim_coord.dims, name=RESAMPLE_DIM diff --git a/xarray/core/computation.py b/xarray/core/computation.py index 4b9428847f4..a55613dd4b4 100644 --- a/xarray/core/computation.py +++ b/xarray/core/computation.py @@ -5,12 +5,12 @@ import itertools import operator from collections import Counter, OrderedDict -from distutils.version import LooseVersion from typing import ( TYPE_CHECKING, AbstractSet, Any, Callable, + Hashable, Iterable, List, Mapping, @@ -33,7 +33,6 @@ from .coordinates import Coordinates # noqa from .dataset import Dataset -_DEFAULT_FROZEN_SET = frozenset() # type: frozenset _NO_FILL_VALUE = utils.ReprObject("") _DEFAULT_NAME = utils.ReprObject("") _JOINS_WITHOUT_FILL_VALUES = frozenset({"inner", "exact"}) @@ -492,8 +491,11 @@ def unified_dim_sizes( SLICE_NONE = slice(None) -def broadcast_compat_data(variable, broadcast_dims, core_dims): - # type: (Variable, tuple, tuple) -> Any +def broadcast_compat_data( + variable: Variable, + broadcast_dims: Tuple[Hashable, ...], + core_dims: Tuple[Hashable, ...], +) -> Any: data = variable.data old_dims = variable.dims @@ -654,7 +656,7 @@ def func(*arrays): def _apply_blockwise( func, args, input_dims, output_dims, signature, output_dtypes, output_sizes=None ): - from .dask_array_compat import blockwise + import dask.array if signature.num_outputs > 1: raise NotImplementedError( @@ -717,7 +719,7 @@ def _apply_blockwise( trimmed_dims = dims[-ndim:] if ndim else () blockwise_args.extend([arg, trimmed_dims]) - return blockwise( + return dask.array.blockwise( func, out_ind, *blockwise_args, @@ -995,13 +997,6 @@ def earth_mover_distance(first_samples, if vectorize: if signature.all_core_dims: - # we need the signature argument - if LooseVersion(np.__version__) < "1.12": # pragma: no cover - raise NotImplementedError( - "numpy 1.12 or newer required when using vectorize=True " - "in xarray.apply_ufunc with non-scalar output core " - "dimensions." - ) func = np.vectorize( func, otypes=output_dtypes, signature=signature.to_gufunc_string() ) @@ -1169,25 +1164,6 @@ def dot(*arrays, dims=None, **kwargs): ] output_core_dims = [tuple(d for d in all_dims if d not in dims + broadcast_dims)] - # older dask than 0.17.4, we use tensordot if possible. - if isinstance(arr.data, dask_array_type): - import dask - - if LooseVersion(dask.__version__) < LooseVersion("0.17.4"): - if len(broadcast_dims) == 0 and len(arrays) == 2: - axes = [ - [arr.get_axis_num(d) for d in arr.dims if d in dims] - for arr in arrays - ] - return apply_ufunc( - duck_array_ops.tensordot, - *arrays, - dask="allowed", - input_core_dims=input_core_dims, - output_core_dims=output_core_dims, - kwargs={"axes": axes} - ) - # construct einsum subscripts, such as '...abc,...ab->...c' # Note: input_core_dims are always moved to the last position subscripts_list = [ diff --git a/xarray/core/dask_array_compat.py b/xarray/core/dask_array_compat.py deleted file mode 100644 index fe2cdc5c553..00000000000 --- a/xarray/core/dask_array_compat.py +++ /dev/null @@ -1,173 +0,0 @@ -from distutils.version import LooseVersion - -import dask.array as da -import numpy as np -from dask import __version__ as dask_version - -try: - blockwise = da.blockwise -except AttributeError: - blockwise = da.atop - - -try: - from dask.array import isin -except ImportError: # pragma: no cover - # Copied from dask v0.17.3. - # Used under the terms of Dask's license, see licenses/DASK_LICENSE. - - def _isin_kernel(element, test_elements, assume_unique=False): - values = np.in1d(element.ravel(), test_elements, assume_unique=assume_unique) - return values.reshape(element.shape + (1,) * test_elements.ndim) - - def isin(element, test_elements, assume_unique=False, invert=False): - element = da.asarray(element) - test_elements = da.asarray(test_elements) - element_axes = tuple(range(element.ndim)) - test_axes = tuple(i + element.ndim for i in range(test_elements.ndim)) - mapped = blockwise( - _isin_kernel, - element_axes + test_axes, - element, - element_axes, - test_elements, - test_axes, - adjust_chunks={axis: lambda _: 1 for axis in test_axes}, - dtype=bool, - assume_unique=assume_unique, - ) - result = mapped.any(axis=test_axes) - if invert: - result = ~result - return result - - -if LooseVersion(dask_version) > LooseVersion("0.19.2"): - gradient = da.gradient - -else: # pragma: no cover - # Copied from dask v0.19.2 - # Used under the terms of Dask's license, see licenses/DASK_LICENSE. - import math - from numbers import Integral, Real - - try: - AxisError = np.AxisError - except AttributeError: - try: - np.array([0]).sum(axis=5) - except Exception as e: - AxisError = type(e) - - def validate_axis(axis, ndim): - """ Validate an input to axis= keywords """ - if isinstance(axis, (tuple, list)): - return tuple(validate_axis(ax, ndim) for ax in axis) - if not isinstance(axis, Integral): - raise TypeError("Axis value must be an integer, got %s" % axis) - if axis < -ndim or axis >= ndim: - raise AxisError( - "Axis %d is out of bounds for array of dimension " "%d" % (axis, ndim) - ) - if axis < 0: - axis += ndim - return axis - - def _gradient_kernel(x, block_id, coord, axis, array_locs, grad_kwargs): - """ - x: nd-array - array of one block - coord: 1d-array or scalar - coordinate along which the gradient is computed. - axis: int - axis along which the gradient is computed - array_locs: - actual location along axis. None if coordinate is scalar - grad_kwargs: - keyword to be passed to np.gradient - """ - block_loc = block_id[axis] - if array_locs is not None: - coord = coord[array_locs[0][block_loc] : array_locs[1][block_loc]] - grad = np.gradient(x, coord, axis=axis, **grad_kwargs) - return grad - - def gradient(f, *varargs, axis=None, **kwargs): - f = da.asarray(f) - - kwargs["edge_order"] = math.ceil(kwargs.get("edge_order", 1)) - if kwargs["edge_order"] > 2: - raise ValueError("edge_order must be less than or equal to 2.") - - drop_result_list = False - if axis is None: - axis = tuple(range(f.ndim)) - elif isinstance(axis, Integral): - drop_result_list = True - axis = (axis,) - - axis = validate_axis(axis, f.ndim) - - if len(axis) != len(set(axis)): - raise ValueError("duplicate axes not allowed") - - axis = tuple(ax % f.ndim for ax in axis) - - if varargs == (): - varargs = (1,) - if len(varargs) == 1: - varargs = len(axis) * varargs - if len(varargs) != len(axis): - raise TypeError( - "Spacing must either be a single scalar, or a scalar / " - "1d-array per axis" - ) - - if issubclass(f.dtype.type, (np.bool8, Integral)): - f = f.astype(float) - elif issubclass(f.dtype.type, Real) and f.dtype.itemsize < 4: - f = f.astype(float) - - results = [] - for i, ax in enumerate(axis): - for c in f.chunks[ax]: - if np.min(c) < kwargs["edge_order"] + 1: - raise ValueError( - "Chunk size must be larger than edge_order + 1. " - "Minimum chunk for aixs {} is {}. Rechunk to " - "proceed.".format(np.min(c), ax) - ) - - if np.isscalar(varargs[i]): - array_locs = None - else: - if isinstance(varargs[i], da.Array): - raise NotImplementedError( - "dask array coordinated is not supported." - ) - # coordinate position for each block taking overlap into - # account - chunk = np.array(f.chunks[ax]) - array_loc_stop = np.cumsum(chunk) + 1 - array_loc_start = array_loc_stop - chunk - 2 - array_loc_stop[-1] -= 1 - array_loc_start[0] = 0 - array_locs = (array_loc_start, array_loc_stop) - - results.append( - f.map_overlap( - _gradient_kernel, - dtype=f.dtype, - depth={j: 1 if j == ax else 0 for j in range(f.ndim)}, - boundary="none", - coord=varargs[i], - axis=ax, - array_locs=array_locs, - grad_kwargs=kwargs, - ) - ) - - if drop_result_list: - results = results[0] - - return results diff --git a/xarray/core/dask_array_ops.py b/xarray/core/dask_array_ops.py index 11fdb86e9b0..37f261cc3ad 100644 --- a/xarray/core/dask_array_ops.py +++ b/xarray/core/dask_array_ops.py @@ -1,26 +1,13 @@ -from distutils.version import LooseVersion - import numpy as np from . import dtypes, nputils -try: - import dask - import dask.array as da - - # Note: dask has used `ghost` before 0.18.2 - if LooseVersion(dask.__version__) <= LooseVersion("0.18.2"): - overlap = da.ghost.ghost - trim_internal = da.ghost.trim_internal - else: - overlap = da.overlap.overlap - trim_internal = da.overlap.trim_internal -except ImportError: - pass - def dask_rolling_wrapper(moving_func, a, window, min_count=None, axis=-1): - """wrapper to apply bottleneck moving window funcs on dask arrays""" + """Wrapper to apply bottleneck moving window funcs on dask arrays + """ + import dask.array as da + dtype, fill_value = dtypes.maybe_promote(a.dtype) a = a.astype(dtype) # inputs for overlap @@ -30,18 +17,21 @@ def dask_rolling_wrapper(moving_func, a, window, min_count=None, axis=-1): depth[axis] = (window + 1) // 2 boundary = {d: fill_value for d in range(a.ndim)} # Create overlap array. - ag = overlap(a, depth=depth, boundary=boundary) + ag = da.overlap.overlap(a, depth=depth, boundary=boundary) # apply rolling func out = ag.map_blocks( moving_func, window, min_count=min_count, axis=axis, dtype=a.dtype ) # trim array - result = trim_internal(out, depth) + result = da.overlap.trim_internal(out, depth) return result def rolling_window(a, axis, window, center, fill_value): - """ Dask's equivalence to np.utils.rolling_window """ + """Dask's equivalence to np.utils.rolling_window + """ + import dask.array as da + orig_shape = a.shape if axis < 0: axis = a.ndim + axis @@ -59,7 +49,7 @@ def rolling_window(a, axis, window, center, fill_value): % (window, depth[axis], min(a.chunks[axis])) ) - # Although dask.overlap pads values to boundaries of the array, + # Although da.overlap pads values to boundaries of the array, # the size of the generated array is smaller than what we want # if center == False. if center: @@ -88,7 +78,7 @@ def rolling_window(a, axis, window, center, fill_value): boundary = {d: fill_value for d in range(a.ndim)} # create overlap arrays - ag = overlap(a, depth=depth, boundary=boundary) + ag = da.overlap.overlap(a, depth=depth, boundary=boundary) # apply rolling func def func(x, window, axis=-1): diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 7ad6f3cbae8..d536d0de2c5 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1,5 +1,4 @@ import functools -import sys import warnings from collections import OrderedDict from numbers import Number @@ -323,7 +322,7 @@ def __init__( if encoding is not None: warnings.warn( "The `encoding` argument to `DataArray` is deprecated, and . " - "will be removed in 0.14. " + "will be removed in 0.15. " "Instead, specify the encoding when writing to disk or " "set the `encoding` attribute directly.", FutureWarning, @@ -419,7 +418,7 @@ def _overwrite_indexes(self, indexes: Mapping[Hashable, Any]) -> "DataArray": obj = self._replace(coords=coords) # switch from dimension to level names, if necessary - dim_names = {} # type: Dict[Any, str] + dim_names: Dict[Any, str] = {} for dim, idx in indexes.items(): if not isinstance(idx, pd.MultiIndex) and idx.name != dim: dim_names[dim] = idx.name @@ -1184,12 +1183,11 @@ def reindex_like( * None (default): don't fill gaps * pad / ffill: propagate last valid index value forward * backfill / bfill: propagate next valid index value backward - * nearest: use nearest valid index value (requires pandas>=0.16) + * nearest: use nearest valid index value tolerance : optional Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations must satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - Requires pandas>=0.17. copy : bool, optional If ``copy=True``, data in the return value is always copied. If ``copy=False`` and reindexing is unnecessary, or can be performed @@ -1250,7 +1248,7 @@ def reindex( * None (default): don't fill gaps * pad / ffill: propagate last valid index value forward * backfill / bfill: propagate next valid index value backward - * nearest: use nearest valid index value (requires pandas>=0.16) + * nearest: use nearest valid index value tolerance : optional Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations must @@ -1504,9 +1502,7 @@ def expand_dims( with length 1. If provided as a dict, then the keys are the new dimensions and the values are either integers (giving the length of the new dimensions) or sequence/ndarray (giving the coordinates of - the new dimensions). **WARNING** for python 3.5, if ``dim`` is - dict-like, then it must be an ``OrderedDict``. This is to ensure - that the order in which the dims are given is maintained. + the new dimensions). axis : integer, list (or tuple) of integers, or None Axis position(s) where new axis is to be inserted (position(s) on the result array). If a list (or tuple) of integers is passed, @@ -1517,8 +1513,7 @@ def expand_dims( The keywords are arbitrary dimensions being inserted and the values are either the lengths of the new dims (if int is given), or their coordinates. Note, this is an alternative to passing a dict to the - dim kwarg and will only be used if dim is None. **WARNING** for - python 3.5 ``dim_kwargs`` is not available. + dim kwarg and will only be used if dim is None. Returns ------- @@ -1534,16 +1529,6 @@ def expand_dims( elif dim is not None and not isinstance(dim, Mapping): dim = OrderedDict(((cast(Hashable, dim), 1),)) - # TODO: get rid of the below code block when python 3.5 is no longer - # supported. - python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 - not_ordereddict = dim is not None and not isinstance(dim, OrderedDict) - if not python36_plus and not_ordereddict: - raise TypeError("dim must be an OrderedDict for python <3.6") - elif not python36_plus and dim_kwargs: - raise ValueError("dim_kwargs isn't available for python <3.6") - dim_kwargs = OrderedDict(dim_kwargs) - dim = either_dict_or_kwargs(dim, dim_kwargs, "expand_dims") ds = self._to_temp_dataset().expand_dims(dim, axis) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d394e05b07a..1d9ef6f7a72 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3,7 +3,6 @@ import sys import warnings from collections import OrderedDict, defaultdict -from distutils.version import LooseVersion from numbers import Number from pathlib import Path from typing import ( @@ -41,7 +40,6 @@ formatting, groupby, ops, - pdcompat, resample, rolling, utils, @@ -132,8 +130,9 @@ def _get_virtual_variable( raise KeyError(key) split_key = key.split(".", 1) + var_name: Optional[str] if len(split_key) == 2: - ref_name, var_name = split_key # type: str, Optional[str] + ref_name, var_name = split_key elif len(split_key) == 1: ref_name, var_name = key, None else: @@ -165,7 +164,7 @@ def calculate_dimensions(variables: Mapping[Hashable, Variable]) -> "Dict[Any, i Returns dictionary mapping from dimension names to sizes. Raises ValueError if any of the dimension sizes conflict. """ - dims = {} # type: Dict[Any, int] + dims: Dict[Any, int] = {} last_used = {} scalar_vars = {k for k, v in variables.items() if not v.dims} for k, var in variables.items(): @@ -197,15 +196,17 @@ def merge_indexes( Not public API. Used in Dataset and DataArray set_index methods. """ - vars_to_replace = {} # Dict[Any, Variable] - vars_to_remove = [] # type: list + vars_to_replace: Dict[Hashable, Variable] = {} + vars_to_remove: List[Hashable] = [] error_msg = "{} is not the name of an existing variable." for dim, var_names in indexes.items(): if isinstance(var_names, str) or not isinstance(var_names, Sequence): var_names = [var_names] - names, codes, levels = [], [], [] # type: (list, list, list) + names: List[Hashable] = [] + codes: List[List[int]] = [] + levels: List[List[int]] = [] current_index_variable = variables.get(dim) for n in var_names: @@ -225,13 +226,8 @@ def merge_indexes( if current_index_variable is not None and append: current_index = current_index_variable.to_index() if isinstance(current_index, pd.MultiIndex): - try: - current_codes = current_index.codes - except AttributeError: - # fpr pandas<0.24 - current_codes = current_index.labels names.extend(current_index.names) - codes.extend(current_codes) + codes.extend(current_index.codes) levels.extend(current_index.levels) else: names.append("%s_level_0" % dim) @@ -490,7 +486,7 @@ def __init__( if compat is not None: warnings.warn( "The `compat` argument to Dataset is deprecated and will be " - "removed in 0.14." + "removed in 0.15." "Instead, use `merge` to control how variables are combined", FutureWarning, stacklevel=2, @@ -965,7 +961,7 @@ def _overwrite_indexes(self, indexes: Mapping[Any, pd.Index]) -> "Dataset": obj = self._replace(variables, indexes=new_indexes) # switch from dimension to level names, if necessary - dim_names = {} # type: Dict[Hashable, str] + dim_names: Dict[Hashable, str] = {} for dim, idx in indexes.items(): if not isinstance(idx, pd.MultiIndex) and idx.name != dim: dim_names[dim] = idx.name @@ -1130,7 +1126,7 @@ def _copy_listed(self, names: Iterable[Hashable]) -> "Dataset": if (var_name,) == var.dims: indexes[var_name] = var.to_index() - needed_dims = set() # type: set + needed_dims: Set[Hashable] = set() for v in variables.values(): needed_dims.update(v.dims) @@ -1669,7 +1665,7 @@ def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]: """Block dimensions for this dataset's data or None if it's not a dask array. """ - chunks = {} # type: Dict[Hashable, Tuple[int, ...]] + chunks: Dict[Hashable, Tuple[int, ...]] = {} for v in self.variables.values(): if v.chunks is not None: for dim, c in zip(v.dims, v.chunks): @@ -1714,13 +1710,7 @@ def chunk( ------- chunked : xarray.Dataset """ - try: - from dask.base import tokenize - except ImportError: - # raise the usual error if dask is entirely missing - import dask # noqa: F401 - - raise ImportError("xarray requires dask version 0.9 or newer") + from dask.base import tokenize if isinstance(chunks, Number): chunks = dict.fromkeys(self.dims, chunks) @@ -1770,7 +1760,7 @@ def _validate_indexers( raise ValueError("dimensions %r do not exist" % invalid) # all indexers should be int, slice, np.ndarrays, or Variable - indexers_list = [] # type: List[Tuple[Any, Union[slice, Variable]]] + indexers_list: List[Tuple[Any, Union[slice, Variable]]] = [] for k, v in indexers.items(): if isinstance(v, slice): indexers_list.append((k, v)) @@ -1964,7 +1954,7 @@ def sel( carried out. See :ref:`indexing` for the details. One of indexers or indexers_kwargs must be provided. method : {None, 'nearest', 'pad'/'ffill', 'backfill'/'bfill'}, optional - Method to use for inexact matches (requires pandas>=0.16): + Method to use for inexact matches: * None (default): only exact matches * pad / ffill: propagate last valid index value forward @@ -1974,7 +1964,6 @@ def sel( Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations must satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - Requires pandas>=0.17. drop : bool, optional If ``drop=True``, drop coordinates variables in `indexers` instead of making them scalar. @@ -2204,12 +2193,11 @@ def reindex_like( * None (default): don't fill gaps * pad / ffill: propagate last valid index value forward * backfill / bfill: propagate next valid index value backward - * nearest: use nearest valid index value (requires pandas>=0.16) + * nearest: use nearest valid index value tolerance : optional Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations must satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - Requires pandas>=0.17. copy : bool, optional If ``copy=True``, data in the return value is always copied. If ``copy=False`` and reindexing is unnecessary, or can be performed @@ -2265,12 +2253,11 @@ def reindex( * None (default): don't fill gaps * pad / ffill: propagate last valid index value forward * backfill / bfill: propagate next valid index value backward - * nearest: use nearest valid index value (requires pandas>=0.16) + * nearest: use nearest valid index value tolerance : optional Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations must satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - Requires pandas>=0.17. copy : bool, optional If ``copy=True``, data in the return value is always copied. If ``copy=False`` and reindexing is unnecessary, or can be performed @@ -2925,14 +2912,6 @@ def expand_dims( expanded : same type as caller This object, but with an additional dimension(s). """ - # TODO: get rid of the below code block when python 3.5 is no longer - # supported. - if sys.version < "3.6": - if isinstance(dim, Mapping) and not isinstance(dim, OrderedDict): - raise TypeError("dim must be an OrderedDict for python <3.6") - if dim_kwargs: - raise ValueError("dim_kwargs isn't available for python <3.6") - if dim is None: pass elif isinstance(dim, Mapping): @@ -3186,13 +3165,6 @@ def _stack_once(self, dims, new_dim): # consider dropping levels that are unused? levels = [self.get_index(dim) for dim in dims] - if LooseVersion(pd.__version__) < LooseVersion("0.19.0"): - # RangeIndex levels in a MultiIndex are broken for appending in - # pandas before v0.19.0 - levels = [ - pd.Int64Index(level) if isinstance(level, pd.RangeIndex) else level - for level in levels - ] idx = utils.multiindex_from_product_levels(levels, names=dims) variables[new_dim] = IndexVariable(new_dim, idx) @@ -3360,12 +3332,7 @@ def ensure_stackable(val): def _unstack_once(self, dim: Hashable) -> "Dataset": index = self.get_index(dim) - # GH2619. For MultiIndex, we need to call remove_unused. - if LooseVersion(pd.__version__) >= "0.20": - index = index.remove_unused_levels() - else: # for pandas 0.19 - index = pdcompat.remove_unused_levels(index) - + index = index.remove_unused_levels() full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) # take a shortcut in case the MultiIndex was not modified. @@ -4987,13 +4954,6 @@ def sortby(self, variables, ascending=True): for data_array in aligned_other_vars: if data_array.ndim != 1: raise ValueError("Input DataArray is not 1-D.") - if data_array.dtype == object and LooseVersion( - np.__version__ - ) < LooseVersion("1.11.0"): - raise NotImplementedError( - "sortby uses np.lexsort under the hood, which requires " - "numpy 1.11.0 or later to support object data-type." - ) (key,) = data_array.dims vars_by_dim[key].append(data_array) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index fcd0400566f..126168d418b 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -17,10 +17,8 @@ try: import dask.array as dask_array - from . import dask_array_compat except ImportError: dask_array = None # type: ignore - dask_array_compat = None # type: ignore def _dask_or_eager_func( @@ -120,9 +118,7 @@ def notnull(data): transpose = _dask_or_eager_func("transpose") _where = _dask_or_eager_func("where", array_args=slice(3)) -isin = _dask_or_eager_func( - "isin", eager_module=npcompat, dask_module=dask_array_compat, array_args=slice(2) -) +isin = _dask_or_eager_func("isin", array_args=slice(2)) take = _dask_or_eager_func("take") broadcast_to = _dask_or_eager_func("broadcast_to") @@ -133,15 +129,13 @@ def notnull(data): array_any = _dask_or_eager_func("any") tensordot = _dask_or_eager_func("tensordot", array_args=slice(2)) -einsum = _dask_or_eager_func( - "einsum", array_args=slice(1, None), requires_dask="0.17.3" -) +einsum = _dask_or_eager_func("einsum", array_args=slice(1, None)) def gradient(x, coord, axis, edge_order): if isinstance(x, dask_array_type): - return dask_array_compat.gradient(x, coord, axis=axis, edge_order=edge_order) - return npcompat.gradient(x, coord, axis=axis, edge_order=edge_order) + return dask_array.gradient(x, coord, axis=axis, edge_order=edge_order) + return np.gradient(x, coord, axis=axis, edge_order=edge_order) def trapz(y, x, axis): diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index c6b2537c958..0c7f073819d 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -7,17 +7,12 @@ import numpy as np import pandas as pd +from pandas.errors import OutOfBoundsDatetime from .duck_array_ops import array_equiv from .options import OPTIONS from .pycompat import dask_array_type, sparse_array_type -try: - from pandas.errors import OutOfBoundsDatetime -except ImportError: - # pandas < 0.20 - from pandas.tslib import OutOfBoundsDatetime - def pretty_print(x, numchars): """Given an object `x`, call `str(x)` and format the returned string so diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 6d42c254438..010c4818ca5 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -96,28 +96,12 @@ def _is_nested_tuple(possible_tuple): ) -def _index_method_kwargs(method, tolerance): - # backwards compatibility for pandas<0.16 (method) or pandas<0.17 - # (tolerance) - kwargs = {} - if method is not None: - kwargs["method"] = method - if tolerance is not None: - kwargs["tolerance"] = tolerance - return kwargs - - -def get_loc(index, label, method=None, tolerance=None): - kwargs = _index_method_kwargs(method, tolerance) - return index.get_loc(label, **kwargs) - - def get_indexer_nd(index, labels, method=None, tolerance=None): - """ Call pd.Index.get_indexer(labels). """ - kwargs = _index_method_kwargs(method, tolerance) - + """Wrapper around :meth:`pandas.Index.get_indexer` supporting n-dimensional + labels + """ flat_labels = np.ravel(labels) - flat_indexer = index.get_indexer(flat_labels, **kwargs) + flat_indexer = index.get_indexer(flat_labels, method=method, tolerance=tolerance) indexer = flat_indexer.reshape(labels.shape) return indexer @@ -193,7 +177,9 @@ def convert_label_indexer(index, label, index_name="", method=None, tolerance=No if isinstance(index, pd.MultiIndex): indexer, new_index = index.get_loc_level(label.item(), level=0) else: - indexer = get_loc(index, label.item(), method, tolerance) + indexer = index.get_loc( + label.item(), method=method, tolerance=tolerance + ) elif label.dtype.kind == "b": indexer = label else: @@ -1382,7 +1368,6 @@ def __array__(self, dtype: DTypeLike = None) -> np.ndarray: @property def shape(self) -> Tuple[int]: - # .shape is broken on pandas prior to v0.15.2 return (len(self.array),) def __getitem__( diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 8159e8ebcf8..6eb0acd760e 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -342,8 +342,8 @@ def determine_coords( from .dataarray import DataArray from .dataset import Dataset - coord_names = set() # type: set - noncoord_names = set() # type: set + coord_names: Set[Hashable] = set() + noncoord_names: Set[Hashable] = set() for mapping in list_of_mappings: if isinstance(mapping, Dataset): diff --git a/xarray/core/missing.py b/xarray/core/missing.py index fdabdb156b6..dfe209e3f7e 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -17,10 +17,10 @@ class BaseInterpolator: """Generic interpolator class for normalizing interpolation methods """ - cons_kwargs = None # type: Dict[str, Any] - call_kwargs = None # type: Dict[str, Any] - f = None # type: Callable - method = None # type: str + cons_kwargs: Dict[str, Any] + call_kwargs: Dict[str, Any] + f: Callable + method: str def __call__(self, x): return self.f(x, **self.call_kwargs) diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py index 22c14d9ff40..1018332df29 100644 --- a/xarray/core/npcompat.py +++ b/xarray/core/npcompat.py @@ -30,294 +30,10 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import builtins import operator -from distutils.version import LooseVersion from typing import Union import numpy as np -try: - from numpy import isin -except ImportError: - - def isin(element, test_elements, assume_unique=False, invert=False): - """ - Calculates `element in test_elements`, broadcasting over `element` - only. Returns a boolean array of the same shape as `element` that is - True where an element of `element` is in `test_elements` and False - otherwise. - - Parameters - ---------- - element : array_like - Input array. - test_elements : array_like - The values against which to test each value of `element`. - This argument is flattened if it is an array or array_like. - See notes for behavior with non-array-like parameters. - assume_unique : bool, optional - If True, the input arrays are both assumed to be unique, which - can speed up the calculation. Default is False. - invert : bool, optional - If True, the values in the returned array are inverted, as if - calculating `element not in test_elements`. Default is False. - ``np.isin(a, b, invert=True)`` is equivalent to (but faster - than) ``np.invert(np.isin(a, b))``. - - Returns - ------- - isin : ndarray, bool - Has the same shape as `element`. The values `element[isin]` - are in `test_elements`. - - See Also - -------- - in1d : Flattened version of this function. - numpy.lib.arraysetops : Module with a number of other functions for - performing set operations on arrays. - - Notes - ----- - - `isin` is an element-wise function version of the python keyword `in`. - ``isin(a, b)`` is roughly equivalent to - ``np.array([item in b for item in a])`` if `a` and `b` are 1-D - sequences. - - `element` and `test_elements` are converted to arrays if they are not - already. If `test_elements` is a set (or other non-sequence collection) - it will be converted to an object array with one element, rather than - an array of the values contained in `test_elements`. This is a - consequence of the `array` constructor's way of handling non-sequence - collections. Converting the set to a list usually gives the desired - behavior. - - .. versionadded:: 1.13.0 - - Examples - -------- - >>> element = 2*np.arange(4).reshape((2, 2)) - >>> element - array([[0, 2], - [4, 6]]) - >>> test_elements = [1, 2, 4, 8] - >>> mask = np.isin(element, test_elements) - >>> mask - array([[ False, True], - [ True, False]]) - >>> element[mask] - array([2, 4]) - >>> mask = np.isin(element, test_elements, invert=True) - >>> mask - array([[ True, False], - [ False, True]]) - >>> element[mask] - array([0, 6]) - - Because of how `array` handles sets, the following does not - work as expected: - - >>> test_set = {1, 2, 4, 8} - >>> np.isin(element, test_set) - array([[ False, False], - [ False, False]]) - - Casting the set to a list gives the expected result: - - >>> np.isin(element, list(test_set)) - array([[ False, True], - [ True, False]]) - """ - element = np.asarray(element) - return np.in1d( - element, test_elements, assume_unique=assume_unique, invert=invert - ).reshape(element.shape) - - -if LooseVersion(np.__version__) >= LooseVersion("1.13"): - gradient = np.gradient -else: - - def normalize_axis_tuple(axes, N): - if isinstance(axes, int): - axes = (axes,) - return tuple([N + a if a < 0 else a for a in axes]) - - def gradient(f, *varargs, axis=None, edge_order=1): - f = np.asanyarray(f) - N = f.ndim # number of dimensions - - axes = axis - del axis - - if axes is None: - axes = tuple(range(N)) - else: - axes = normalize_axis_tuple(axes, N) - - len_axes = len(axes) - n = len(varargs) - if n == 0: - # no spacing argument - use 1 in all axes - dx = [1.0] * len_axes - elif n == 1 and np.ndim(varargs[0]) == 0: - # single scalar for all axes - dx = varargs * len_axes - elif n == len_axes: - # scalar or 1d array for each axis - dx = list(varargs) - for i, distances in enumerate(dx): - if np.ndim(distances) == 0: - continue - elif np.ndim(distances) != 1: - raise ValueError("distances must be either scalars or 1d") - if len(distances) != f.shape[axes[i]]: - raise ValueError( - "when 1d, distances must match the " - "length of the corresponding dimension" - ) - diffx = np.diff(distances) - # if distances are constant reduce to the scalar case - # since it brings a consistent speedup - if (diffx == diffx[0]).all(): - diffx = diffx[0] - dx[i] = diffx - else: - raise TypeError("invalid number of arguments") - - if edge_order > 2: - raise ValueError("'edge_order' greater than 2 not supported") - - # use central differences on interior and one-sided differences on the - # endpoints. This preserves second order-accuracy over the full domain. - - outvals = [] - - # create slice objects --- initially all are [:, :, ..., :] - slice1 = [slice(None)] * N - slice2 = [slice(None)] * N - slice3 = [slice(None)] * N - slice4 = [slice(None)] * N - - otype = f.dtype.char - if otype not in ["f", "d", "F", "D", "m", "M"]: - otype = "d" - - # Difference of datetime64 elements results in timedelta64 - if otype == "M": - # Need to use the full dtype name because it contains unit - # information - otype = f.dtype.name.replace("datetime", "timedelta") - elif otype == "m": - # Needs to keep the specific units, can't be a general unit - otype = f.dtype - - # Convert datetime64 data into ints. Make dummy variable `y` - # that is a view of ints if the data is datetime64, otherwise - # just set y equal to the array `f`. - if f.dtype.char in ["M", "m"]: - y = f.view("int64") - else: - y = f - - for i, axis in enumerate(axes): - if y.shape[axis] < edge_order + 1: - raise ValueError( - "Shape of array too small to calculate a numerical " - "gradient, at least (edge_order + 1) elements are " - "required." - ) - # result allocation - out = np.empty_like(y, dtype=otype) - - uniform_spacing = np.ndim(dx[i]) == 0 - - # Numerical differentiation: 2nd order interior - slice1[axis] = slice(1, -1) - slice2[axis] = slice(None, -2) - slice3[axis] = slice(1, -1) - slice4[axis] = slice(2, None) - - if uniform_spacing: - out[slice1] = (f[slice4] - f[slice2]) / (2.0 * dx[i]) - else: - dx1 = dx[i][0:-1] - dx2 = dx[i][1:] - a = -(dx2) / (dx1 * (dx1 + dx2)) - b = (dx2 - dx1) / (dx1 * dx2) - c = dx1 / (dx2 * (dx1 + dx2)) - # fix the shape for broadcasting - shape = np.ones(N, dtype=int) - shape[axis] = -1 - a.shape = b.shape = c.shape = shape - # 1D equivalent -- - # out[1:-1] = a * f[:-2] + b * f[1:-1] + c * f[2:] - out[slice1] = a * f[slice2] + b * f[slice3] + c * f[slice4] - - # Numerical differentiation: 1st order edges - if edge_order == 1: - slice1[axis] = 0 - slice2[axis] = 1 - slice3[axis] = 0 - dx_0 = dx[i] if uniform_spacing else dx[i][0] - # 1D equivalent -- out[0] = (y[1] - y[0]) / (x[1] - x[0]) - out[slice1] = (y[slice2] - y[slice3]) / dx_0 - - slice1[axis] = -1 - slice2[axis] = -1 - slice3[axis] = -2 - dx_n = dx[i] if uniform_spacing else dx[i][-1] - # 1D equivalent -- out[-1] = (y[-1] - y[-2]) / (x[-1] - x[-2]) - out[slice1] = (y[slice2] - y[slice3]) / dx_n - - # Numerical differentiation: 2nd order edges - else: - slice1[axis] = 0 - slice2[axis] = 0 - slice3[axis] = 1 - slice4[axis] = 2 - if uniform_spacing: - a = -1.5 / dx[i] - b = 2.0 / dx[i] - c = -0.5 / dx[i] - else: - dx1 = dx[i][0] - dx2 = dx[i][1] - a = -(2.0 * dx1 + dx2) / (dx1 * (dx1 + dx2)) - b = (dx1 + dx2) / (dx1 * dx2) - c = -dx1 / (dx2 * (dx1 + dx2)) - # 1D equivalent -- out[0] = a * y[0] + b * y[1] + c * y[2] - out[slice1] = a * y[slice2] + b * y[slice3] + c * y[slice4] - - slice1[axis] = -1 - slice2[axis] = -3 - slice3[axis] = -2 - slice4[axis] = -1 - if uniform_spacing: - a = 0.5 / dx[i] - b = -2.0 / dx[i] - c = 1.5 / dx[i] - else: - dx1 = dx[i][-2] - dx2 = dx[i][-1] - a = (dx2) / (dx1 * (dx1 + dx2)) - b = -(dx2 + dx1) / (dx1 * dx2) - c = (2.0 * dx2 + dx1) / (dx2 * (dx1 + dx2)) - # 1D equivalent -- out[-1] = a * f[-3] + b * f[-2] + c * f[-1] - out[slice1] = a * y[slice2] + b * y[slice3] + c * y[slice4] - - outvals.append(out) - - # reset the slice object in this dimension to ":" - slice1[axis] = slice(None) - slice2[axis] = slice(None) - slice3[axis] = slice(None) - slice4[axis] = slice(None) - - if len_axes == 1: - return outvals[0] - else: - return outvals - # Vendored from NumPy 1.12; we need a version that support duck typing, even # on dask arrays with __array_function__ enabled. diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index 91998482e3e..7591fff3abe 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -39,9 +39,9 @@ from distutils.version import LooseVersion -import numpy as np import pandas as pd + # allow ourselves to type checks for Panel even after it's removed if LooseVersion(pd.__version__) < "0.25.0": Panel = pd.Panel @@ -51,78 +51,9 @@ class Panel: # type: ignore pass -# for pandas 0.19 -def remove_unused_levels(self): - """ - create a new MultiIndex from the current that removing - unused levels, meaning that they are not expressed in the labels - The resulting MultiIndex will have the same outward - appearance, meaning the same .values and ordering. It will also - be .equals() to the original. - .. versionadded:: 0.20.0 - Returns - ------- - MultiIndex - Examples - -------- - >>> i = pd.MultiIndex.from_product([range(2), list('ab')]) - MultiIndex(levels=[[0, 1], ['a', 'b']], - codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) - >>> i[2:] - MultiIndex(levels=[[0, 1], ['a', 'b']], - codes=[[1, 1], [0, 1]]) - The 0 from the first level is not represented - and can be removed - >>> i[2:].remove_unused_levels() - MultiIndex(levels=[[1], ['a', 'b']], - codes=[[0, 0], [0, 1]]) - """ - import pandas.core.algorithms as algos - - new_levels = [] - new_labels = [] - - changed = False - for lev, lab in zip(self.levels, self.labels): - - # Since few levels are typically unused, bincount() is more - # efficient than unique() - however it only accepts positive values - # (and drops order): - uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1 - has_na = int(len(uniques) and (uniques[0] == -1)) - - if len(uniques) != len(lev) + has_na: - # We have unused levels - changed = True - - # Recalculate uniques, now preserving order. - # Can easily be cythonized by exploiting the already existing - # "uniques" and stop parsing "lab" when all items are found: - uniques = algos.unique(lab) - if has_na: - na_idx = np.where(uniques == -1)[0] - # Just ensure that -1 is in first position: - uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]] +def count_not_none(*args) -> int: + """Compute the number of non-None arguments. - # labels get mapped from uniques to 0:len(uniques) - # -1 (if present) is mapped to last position - label_mapping = np.zeros(len(lev) + has_na) - # ... and reassigned value -1: - label_mapping[uniques] = np.arange(len(uniques)) - has_na - - lab = label_mapping[lab] - - # new levels are simple - lev = lev.take(uniques[has_na:]) - - new_levels.append(lev) - new_labels.append(lab) - - result = self._shallow_copy() - - if changed: - result._reset_identity() - result._set_levels(new_levels, validate=False) - result._set_labels(new_labels, validate=False) - - return result + Copied from pandas.core.common.count_not_none (not part of the public API) + """ + return sum([arg is not None for arg in args]) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index a812e7472ca..3e86ebbfd73 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -1,7 +1,5 @@ import functools -import warnings from collections import OrderedDict -from distutils.version import LooseVersion import numpy as np @@ -71,17 +69,6 @@ def __init__(self, obj, windows, min_periods=None, center=False): ------- rolling : type of input argument """ - - if bottleneck is not None and ( - LooseVersion(bottleneck.__version__) < LooseVersion("1.0") - ): - warnings.warn( - "xarray requires bottleneck version of 1.0 or " - "greater for rolling operations. Rolling " - "aggregation methods will use numpy instead" - "of bottleneck." - ) - if len(windows) != 1: raise ValueError("exactly one dim/window should be provided") @@ -332,14 +319,6 @@ def _bottleneck_reduce(self, func, **kwargs): padded = self.obj.variable if self.center: - if ( - LooseVersion(np.__version__) < LooseVersion("1.13") - and self.obj.dtype.kind == "b" - ): - # with numpy < 1.13 bottleneck cannot handle np.nan-Boolean - # mixed array correctly. We cast boolean array to float. - padded = padded.astype(float) - if isinstance(padded.data, dask_array_type): # Workaround to make the padded chunk size is larger than # self.window-1 diff --git a/xarray/core/rolling_exp.py b/xarray/core/rolling_exp.py index 2139d246f46..ac6768e8a9c 100644 --- a/xarray/core/rolling_exp.py +++ b/xarray/core/rolling_exp.py @@ -1,5 +1,6 @@ import numpy as np +from .pdcompat import count_not_none from .pycompat import dask_array_type @@ -24,13 +25,11 @@ def move_exp_nanmean(array, *, axis, alpha): def _get_center_of_mass(comass, span, halflife, alpha): """ - Vendored from pandas.core.window._get_center_of_mass + Vendored from pandas.core.window.common._get_center_of_mass See licenses/PANDAS_LICENSE for the function's license """ - from pandas.core import common as com - - valid_count = com.count_not_none(comass, span, halflife, alpha) + valid_count = count_not_none(comass, span, halflife, alpha) if valid_count > 1: raise ValueError("comass, span, halflife, and alpha " "are mutually exclusive") diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 0d730edeaeb..12024ff8245 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -444,7 +444,7 @@ class OrderedSet(MutableSet[T]): __slots__ = ("_ordered_dict",) def __init__(self, values: AbstractSet[T] = None): - self._ordered_dict = OrderedDict() # type: MutableMapping[T, None] + self._ordered_dict: MutableMapping[T, None] = OrderedDict() if values is not None: # Disable type checking - both mypy and PyCharm believes that # we're altering the type of self in place (see signature of diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index f69a8af7a2f..e070ea16855 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -2,7 +2,6 @@ import textwrap import warnings from datetime import datetime -from distutils.version import LooseVersion from inspect import getfullargspec from typing import Any, Iterable, Mapping, Tuple, Union @@ -13,12 +12,9 @@ from ..core.utils import is_scalar try: - import nc_time_axis + import nc_time_axis # noqa: F401 - if LooseVersion(nc_time_axis.__version__) < LooseVersion("1.2.0"): - nc_time_axis_available = False - else: - nc_time_axis_available = True + nc_time_axis_available = True except ImportError: nc_time_axis_available = False @@ -52,15 +48,7 @@ def register_pandas_datetime_converter_if_needed(): # based on https://github.com/pandas-dev/pandas/pull/17710 global _registered if not _registered: - try: - from pandas.plotting import register_matplotlib_converters - - register_matplotlib_converters() - except ImportError: - # register_matplotlib_converters new in pandas 0.22 - from pandas.tseries import converter - - converter.register() + pd.plotting.register_matplotlib_converters() _registered = True diff --git a/xarray/testing.py b/xarray/testing.py index 787ec1aadb0..f01cbe896b9 100644 --- a/xarray/testing.py +++ b/xarray/testing.py @@ -1,6 +1,6 @@ """Testing functions exposed to the user API""" from collections import OrderedDict -from typing import Hashable, Union +from typing import Hashable, Set, Union import numpy as np import pandas as pd @@ -162,7 +162,7 @@ def _assert_indexes_invariants_checks(indexes, possible_coord_variables, dims): def _assert_variable_invariants(var: Variable, name: Hashable = None): if name is None: - name_or_empty = () # type: tuple + name_or_empty: tuple = () else: name_or_empty = (name,) assert isinstance(var._dims, tuple), name_or_empty + (var._dims,) @@ -212,7 +212,7 @@ def _assert_dataset_invariants(ds: Dataset): assert type(ds._dims) is dict, ds._dims assert all(isinstance(v, int) for v in ds._dims.values()), ds._dims - var_dims = set() # type: set + var_dims: Set[Hashable] = set() for v in ds._variables.values(): var_dims.update(v.dims) assert ds._dims.keys() == var_dims, (set(ds._dims), var_dims) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 4f5a3e37888..8b4d3073e1c 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -17,11 +17,7 @@ from xarray.core.options import set_options from xarray.plot.utils import import_seaborn -try: - from pandas.testing import assert_frame_equal -except ImportError: - # old location, for pandas < 0.20 - from pandas.util.testing import assert_frame_equal # noqa: F401 +from pandas.testing import assert_frame_equal # noqa: F401 # import mpl and change the backend before other mpl imports try: @@ -61,7 +57,6 @@ def LooseVersion(vstring): has_matplotlib, requires_matplotlib = _importorskip("matplotlib") -has_matplotlib2, requires_matplotlib2 = _importorskip("matplotlib", minversion="2") has_scipy, requires_scipy = _importorskip("scipy") has_pydap, requires_pydap = _importorskip("pydap.client") has_netCDF4, requires_netCDF4 = _importorskip("netCDF4") @@ -69,30 +64,17 @@ def LooseVersion(vstring): has_pynio, requires_pynio = _importorskip("Nio") has_pseudonetcdf, requires_pseudonetcdf = _importorskip("PseudoNetCDF") has_cftime, requires_cftime = _importorskip("cftime") -has_nc_time_axis, requires_nc_time_axis = _importorskip( - "nc_time_axis", minversion="1.2.0" -) -has_cftime_1_0_2_1, requires_cftime_1_0_2_1 = _importorskip( - "cftime", minversion="1.0.2.1" -) has_dask, requires_dask = _importorskip("dask") has_bottleneck, requires_bottleneck = _importorskip("bottleneck") +has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis") has_rasterio, requires_rasterio = _importorskip("rasterio") -has_pathlib, requires_pathlib = _importorskip("pathlib") -has_zarr, requires_zarr = _importorskip("zarr", minversion="2.2") -has_np113, requires_np113 = _importorskip("numpy", minversion="1.13.0") +has_zarr, requires_zarr = _importorskip("zarr") has_iris, requires_iris = _importorskip("iris") has_cfgrib, requires_cfgrib = _importorskip("cfgrib") has_numbagg, requires_numbagg = _importorskip("numbagg") has_sparse, requires_sparse = _importorskip("sparse") # some special cases -has_h5netcdf07, requires_h5netcdf07 = _importorskip("h5netcdf", minversion="0.7") -has_h5py29, requires_h5py29 = _importorskip("h5py", minversion="2.9.0") -has_h5fileobj = has_h5netcdf07 and has_h5py29 -requires_h5fileobj = pytest.mark.skipif( - not has_h5fileobj, reason="requires h5py>2.9.0 & h5netcdf>0.7" -) has_scipy_or_netCDF4 = has_scipy or has_netCDF4 requires_scipy_or_netCDF4 = pytest.mark.skipif( not has_scipy_or_netCDF4, reason="requires scipy or netCDF4" @@ -101,8 +83,6 @@ def LooseVersion(vstring): requires_cftime_or_netCDF4 = pytest.mark.skipif( not has_cftime_or_netCDF4, reason="requires cftime or netCDF4" ) -if not has_pathlib: - has_pathlib, requires_pathlib = _importorskip("pathlib2") try: import_seaborn() has_seaborn = True @@ -116,10 +96,7 @@ def LooseVersion(vstring): if has_dask: import dask - if LooseVersion(dask.__version__) < "0.18": - dask.set_options(get=dask.get) - else: - dask.config.set(scheduler="single-threaded") + dask.config.set(scheduler="single-threaded") flaky = pytest.mark.flaky network = pytest.mark.network diff --git a/xarray/tests/test_accessor_str.py b/xarray/tests/test_accessor_str.py index 56bf6dbb3a2..5cd815eebf0 100644 --- a/xarray/tests/test_accessor_str.py +++ b/xarray/tests/test_accessor_str.py @@ -56,7 +56,7 @@ def dtype(request): def test_dask(): import dask.array as da - arr = da.from_array(["a", "b", "c"]) + arr = da.from_array(["a", "b", "c"], chunks=-1) xarr = xr.DataArray(arr) result = xarr.str.len().compute() diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4645b4db796..0120e2ca0fe 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -14,6 +14,7 @@ import numpy as np import pandas as pd +from pandas.errors import OutOfBoundsDatetime import pytest import xarray as xr @@ -51,10 +52,8 @@ requires_cfgrib, requires_cftime, requires_dask, - requires_h5fileobj, requires_h5netcdf, requires_netCDF4, - requires_pathlib, requires_pseudonetcdf, requires_pydap, requires_pynio, @@ -80,13 +79,6 @@ except ImportError: pass -try: - from pandas.errors import OutOfBoundsDatetime -except ImportError: - # pandas < 0.20 - from pandas.tslib import OutOfBoundsDatetime - - ON_WINDOWS = sys.platform == "win32" @@ -233,8 +225,8 @@ class NetCDF3Only: class DatasetIOBase: - engine = None # type: Optional[str] - file_format = None # type: Optional[str] + engine: Optional[str] = None + file_format: Optional[str] = None def create_store(self): raise NotImplementedError() @@ -1355,19 +1347,6 @@ def test_unsorted_index_raises(self): except IndexError as err: assert "first by calling .load" in str(err) - def test_88_character_filename_segmentation_fault(self): - # should be fixed in netcdf4 v1.3.1 - with mock.patch("netCDF4.__version__", "1.2.4"): - with warnings.catch_warnings(): - message = ( - "A segmentation fault may occur when the " - "file path has exactly 88 characters" - ) - warnings.filterwarnings("error", message) - with pytest.raises(Warning): - # Need to construct 88 character filepath - xr.Dataset().to_netcdf("a" * (88 - len(os.getcwd()) - 1)) - def test_setncattr_string(self): list_of_strings = ["list", "of", "strings"] one_element_list_of_strings = ["one element"] @@ -2334,7 +2313,7 @@ def test_dump_encodings_h5py(self): assert actual.x.encoding["compression_opts"] is None -@requires_h5fileobj +@requires_h5netcdf class TestH5NetCDFFileObject(TestH5NetCDFData): engine = "h5netcdf" @@ -2754,7 +2733,6 @@ def test_open_mfdataset_2d(self): (2, 2, 2, 2), ) - @requires_pathlib def test_open_mfdataset_pathlib(self): original = Dataset({"foo": ("x", np.random.randn(10))}) with create_tmp_file() as tmp1: @@ -2768,7 +2746,6 @@ def test_open_mfdataset_pathlib(self): ) as actual: assert_identical(original, actual) - @requires_pathlib def test_open_mfdataset_2d_pathlib(self): original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))}) with create_tmp_file() as tmp1: @@ -2903,7 +2880,6 @@ def test_save_mfdataset_invalid_dataarray(self): with raises_regex(TypeError, "supports writing Dataset"): save_mfdataset([da], ["dataarray"]) - @requires_pathlib def test_save_mfdataset_pathlib_roundtrip(self): original = Dataset({"foo": ("x", np.random.randn(10))}) datasets = [original.isel(x=slice(5)), original.isel(x=slice(5, 10))] @@ -4231,7 +4207,6 @@ def test_dataarray_to_netcdf_return_bytes(self): output = data.to_netcdf() assert isinstance(output, bytes) - @requires_pathlib def test_dataarray_to_netcdf_no_name_pathlib(self): original_da = DataArray(np.arange(12).reshape((3, 4))) diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index fcc9acf75bb..e49dc72abdd 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -15,13 +15,7 @@ ) from xarray.tests import assert_array_equal, assert_identical -from . import ( - has_cftime, - has_cftime_1_0_2_1, - has_cftime_or_netCDF4, - raises_regex, - requires_cftime, -) +from . import has_cftime, has_cftime_or_netCDF4, raises_regex, requires_cftime from .test_coding_times import ( _ALL_CALENDARS, _NON_STANDARD_CALENDARS, @@ -175,14 +169,14 @@ def index_with_name(date_type): return CFTimeIndex(dates, name="foo") -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize(("name", "expected_name"), [("bar", "bar"), (None, "foo")]) def test_constructor_with_name(index_with_name, name, expected_name): result = CFTimeIndex(index_with_name, name=name).name assert result == expected_name -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_assert_all_valid_date_type(date_type, index): import cftime @@ -203,7 +197,7 @@ def test_assert_all_valid_date_type(date_type, index): assert_all_valid_date_type(np.array([date_type(1, 1, 1), date_type(1, 2, 1)])) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( ("field", "expected"), [ @@ -221,21 +215,21 @@ def test_cftimeindex_field_accessors(index, field, expected): assert_array_equal(result, expected) -@pytest.mark.skipif(not has_cftime_1_0_2_1, reason="cftime not installed") +@requires_cftime def test_cftimeindex_dayofyear_accessor(index): result = index.dayofyear expected = [date.dayofyr for date in index] assert_array_equal(result, expected) -@pytest.mark.skipif(not has_cftime_1_0_2_1, reason="cftime not installed") +@requires_cftime def test_cftimeindex_dayofweek_accessor(index): result = index.dayofweek expected = [date.dayofwk for date in index] assert_array_equal(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( ("string", "date_args", "reso"), [ @@ -255,7 +249,7 @@ def test_parse_iso8601_with_reso(date_type, string, date_args, reso): assert result_reso == expected_reso -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_parse_string_to_bounds_year(date_type, dec_days): parsed = date_type(2, 2, 10, 6, 2, 8, 1) expected_start = date_type(2, 1, 1) @@ -265,7 +259,7 @@ def test_parse_string_to_bounds_year(date_type, dec_days): assert result_end == expected_end -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_parse_string_to_bounds_month_feb(date_type, feb_days): parsed = date_type(2, 2, 10, 6, 2, 8, 1) expected_start = date_type(2, 2, 1) @@ -275,7 +269,7 @@ def test_parse_string_to_bounds_month_feb(date_type, feb_days): assert result_end == expected_end -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_parse_string_to_bounds_month_dec(date_type, dec_days): parsed = date_type(2, 12, 1) expected_start = date_type(2, 12, 1) @@ -285,7 +279,7 @@ def test_parse_string_to_bounds_month_dec(date_type, dec_days): assert result_end == expected_end -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( ("reso", "ex_start_args", "ex_end_args"), [ @@ -307,13 +301,13 @@ def test_parsed_string_to_bounds_sub_monthly( assert result_end == expected_end -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_parsed_string_to_bounds_raises(date_type): with pytest.raises(KeyError): _parsed_string_to_bounds(date_type, "a", date_type(1, 1, 1)) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_get_loc(date_type, index): result = index.get_loc("0001") assert result == slice(0, 2) @@ -328,7 +322,7 @@ def test_get_loc(date_type, index): index.get_loc("1234") -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("kind", ["loc", "getitem"]) def test_get_slice_bound(date_type, index, kind): result = index.get_slice_bound("0001", "left", kind) @@ -348,7 +342,7 @@ def test_get_slice_bound(date_type, index, kind): assert result == expected -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("kind", ["loc", "getitem"]) def test_get_slice_bound_decreasing_index(date_type, monotonic_decreasing_index, kind): result = monotonic_decreasing_index.get_slice_bound("0001", "left", kind) @@ -372,7 +366,7 @@ def test_get_slice_bound_decreasing_index(date_type, monotonic_decreasing_index, assert result == expected -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("kind", ["loc", "getitem"]) def test_get_slice_bound_length_one_index(date_type, length_one_index, kind): result = length_one_index.get_slice_bound("0001", "left", kind) @@ -392,19 +386,19 @@ def test_get_slice_bound_length_one_index(date_type, length_one_index, kind): assert result == expected -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_string_slice_length_one_index(length_one_index): da = xr.DataArray([1], coords=[length_one_index], dims=["time"]) result = da.sel(time=slice("0001", "0001")) assert_identical(result, da) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_date_type_property(date_type, index): assert index.date_type is date_type -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_contains(date_type, index): assert "0001-01-01" in index assert "0001" in index @@ -413,7 +407,7 @@ def test_contains(date_type, index): assert date_type(3, 1, 1) not in index -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_groupby(da): result = da.groupby("time.month").sum("time") expected = xr.DataArray([4, 6], coords=[[1, 2]], dims=["month"]) @@ -427,7 +421,7 @@ def test_groupby(da): } -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( "sel_arg", list(SEL_STRING_OR_LIST_TESTS.values()), @@ -439,7 +433,7 @@ def test_sel_string_or_list(da, index, sel_arg): assert_identical(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_sel_date_slice_or_list(da, index, date_type): expected = xr.DataArray([1, 2], coords=[index[:2]], dims=["time"]) result = da.sel(time=slice(date_type(1, 1, 1), date_type(1, 12, 30))) @@ -449,14 +443,14 @@ def test_sel_date_slice_or_list(da, index, date_type): assert_identical(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_sel_date_scalar(da, date_type, index): expected = xr.DataArray(1).assign_coords(time=index[0]) result = da.sel(time=date_type(1, 1, 1)) assert_identical(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( "sel_kwargs", [{"method": "nearest"}, {"method": "nearest", "tolerance": timedelta(days=70)}], @@ -471,7 +465,7 @@ def test_sel_date_scalar_nearest(da, date_type, index, sel_kwargs): assert_identical(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( "sel_kwargs", [{"method": "pad"}, {"method": "pad", "tolerance": timedelta(days=365)}], @@ -486,7 +480,7 @@ def test_sel_date_scalar_pad(da, date_type, index, sel_kwargs): assert_identical(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( "sel_kwargs", [{"method": "backfill"}, {"method": "backfill", "tolerance": timedelta(days=365)}], @@ -501,7 +495,7 @@ def test_sel_date_scalar_backfill(da, date_type, index, sel_kwargs): assert_identical(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( "sel_kwargs", [ @@ -515,7 +509,7 @@ def test_sel_date_scalar_tolerance_raises(da, date_type, sel_kwargs): da.sel(time=date_type(1, 5, 1), **sel_kwargs) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( "sel_kwargs", [{"method": "nearest"}, {"method": "nearest", "tolerance": timedelta(days=70)}], @@ -534,7 +528,7 @@ def test_sel_date_list_nearest(da, date_type, index, sel_kwargs): assert_identical(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( "sel_kwargs", [{"method": "pad"}, {"method": "pad", "tolerance": timedelta(days=365)}], @@ -545,7 +539,7 @@ def test_sel_date_list_pad(da, date_type, index, sel_kwargs): assert_identical(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( "sel_kwargs", [{"method": "backfill"}, {"method": "backfill", "tolerance": timedelta(days=365)}], @@ -556,7 +550,7 @@ def test_sel_date_list_backfill(da, date_type, index, sel_kwargs): assert_identical(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize( "sel_kwargs", [ @@ -570,7 +564,7 @@ def test_sel_date_list_tolerance_raises(da, date_type, sel_kwargs): da.sel(time=[date_type(1, 2, 1), date_type(1, 5, 1)], **sel_kwargs) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_isel(da, index): expected = xr.DataArray(1).assign_coords(time=index[0]) result = da.isel(time=0) @@ -597,7 +591,7 @@ def range_args(date_type): ] -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_indexing_in_series_getitem(series, index, scalar_args, range_args): for arg in scalar_args: assert series[arg] == 1 @@ -607,7 +601,7 @@ def test_indexing_in_series_getitem(series, index, scalar_args, range_args): assert series[arg].equals(expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_indexing_in_series_loc(series, index, scalar_args, range_args): for arg in scalar_args: assert series.loc[arg] == 1 @@ -617,7 +611,7 @@ def test_indexing_in_series_loc(series, index, scalar_args, range_args): assert series.loc[arg].equals(expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_indexing_in_series_iloc(series, index): expected = 1 assert series.iloc[0] == expected @@ -626,7 +620,7 @@ def test_indexing_in_series_iloc(series, index): assert series.iloc[:2].equals(expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_series_dropna(index): series = pd.Series([0.0, 1.0, np.nan, np.nan], index=index) expected = series.iloc[:2] @@ -634,7 +628,7 @@ def test_series_dropna(index): assert result.equals(expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_indexing_in_dataframe_loc(df, index, scalar_args, range_args): expected = pd.Series([1], name=index[0]) for arg in scalar_args: @@ -647,7 +641,7 @@ def test_indexing_in_dataframe_loc(df, index, scalar_args, range_args): assert result.equals(expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_indexing_in_dataframe_iloc(df, index): expected = pd.Series([1], name=index[0]) result = df.iloc[0] @@ -676,13 +670,13 @@ def test_concat_cftimeindex(date_type): assert not isinstance(da.indexes["time"], CFTimeIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_empty_cftimeindex(): index = CFTimeIndex([]) assert index.date_type is None -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_cftimeindex_add(index): date_type = index.date_type expected_dates = [ @@ -697,7 +691,7 @@ def test_cftimeindex_add(index): assert isinstance(result, CFTimeIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_cftimeindex_add_timedeltaindex(calendar): a = xr.cftime_range("2000", periods=5, calendar=calendar) @@ -708,7 +702,7 @@ def test_cftimeindex_add_timedeltaindex(calendar): assert isinstance(result, CFTimeIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_cftimeindex_radd(index): date_type = index.date_type expected_dates = [ @@ -723,7 +717,7 @@ def test_cftimeindex_radd(index): assert isinstance(result, CFTimeIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_timedeltaindex_add_cftimeindex(calendar): a = xr.cftime_range("2000", periods=5, calendar=calendar) @@ -734,7 +728,7 @@ def test_timedeltaindex_add_cftimeindex(calendar): assert isinstance(result, CFTimeIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_cftimeindex_sub(index): date_type = index.date_type expected_dates = [ @@ -750,7 +744,7 @@ def test_cftimeindex_sub(index): assert isinstance(result, CFTimeIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_cftimeindex_sub_cftimeindex(calendar): a = xr.cftime_range("2000", periods=5, calendar=calendar) @@ -761,7 +755,7 @@ def test_cftimeindex_sub_cftimeindex(calendar): assert isinstance(result, pd.TimedeltaIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_cftimeindex_sub_cftime_datetime(calendar): a = xr.cftime_range("2000", periods=5, calendar=calendar) @@ -771,7 +765,7 @@ def test_cftimeindex_sub_cftime_datetime(calendar): assert isinstance(result, pd.TimedeltaIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_cftime_datetime_sub_cftimeindex(calendar): a = xr.cftime_range("2000", periods=5, calendar=calendar) @@ -781,7 +775,7 @@ def test_cftime_datetime_sub_cftimeindex(calendar): assert isinstance(result, pd.TimedeltaIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("calendar", _CFTIME_CALENDARS) def test_cftimeindex_sub_timedeltaindex(calendar): a = xr.cftime_range("2000", periods=5, calendar=calendar) @@ -792,13 +786,13 @@ def test_cftimeindex_sub_timedeltaindex(calendar): assert isinstance(result, CFTimeIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_cftimeindex_rsub(index): with pytest.raises(TypeError): timedelta(days=1) - index -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("freq", ["D", timedelta(days=1)]) def test_cftimeindex_shift(index, freq): date_type = index.date_type @@ -814,14 +808,14 @@ def test_cftimeindex_shift(index, freq): assert isinstance(result, CFTimeIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_cftimeindex_shift_invalid_n(): index = xr.cftime_range("2000", periods=3) with pytest.raises(TypeError): index.shift("a", "D") -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime def test_cftimeindex_shift_invalid_freq(): index = xr.cftime_range("2000", periods=3) with pytest.raises(TypeError): @@ -850,18 +844,18 @@ def test_parse_array_of_cftime_strings(): np.testing.assert_array_equal(result, expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) def test_strftime_of_cftime_array(calendar): date_format = "%Y%m%d%H%M" cf_values = xr.cftime_range("2000", periods=5, calendar=calendar) dt_values = pd.date_range("2000", periods=5) - expected = dt_values.strftime(date_format) + expected = pd.Index(dt_values.strftime(date_format)) result = cf_values.strftime(date_format) assert result.equals(expected) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) @pytest.mark.parametrize("unsafe", [False, True]) def test_to_datetimeindex(calendar, unsafe): @@ -879,7 +873,7 @@ def test_to_datetimeindex(calendar, unsafe): assert isinstance(result, pd.DatetimeIndex) -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) def test_to_datetimeindex_out_of_range(calendar): index = xr.cftime_range("0001", periods=5, calendar=calendar) @@ -887,7 +881,7 @@ def test_to_datetimeindex_out_of_range(calendar): index.to_datetimeindex() -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.parametrize("calendar", ["all_leap", "360_day"]) def test_to_datetimeindex_feb_29(calendar): index = xr.cftime_range("2001-02-28", periods=2, calendar=calendar) @@ -895,7 +889,7 @@ def test_to_datetimeindex_feb_29(calendar): index.to_datetimeindex() -@pytest.mark.skipif(not has_cftime, reason="cftime not installed") +@requires_cftime @pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/24263") def test_multiindex(): index = xr.cftime_range("2001-01-01", periods=100, calendar="360_day") diff --git a/xarray/tests/test_cftimeindex_resample.py b/xarray/tests/test_cftimeindex_resample.py index bbc8dd82c95..c4f32795b59 100644 --- a/xarray/tests/test_cftimeindex_resample.py +++ b/xarray/tests/test_cftimeindex_resample.py @@ -8,7 +8,6 @@ from xarray.core.resample_cftime import CFTimeGrouper pytest.importorskip("cftime") -pytest.importorskip("pandas", minversion="0.24") # Create a list of pairs of similar-length initial and resample frequencies diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index a778ff8147f..406b9c1ba69 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -4,6 +4,8 @@ import numpy as np import pandas as pd import pytest +from pandas.errors import OutOfBoundsDatetime + from xarray import DataArray, Dataset, Variable, coding, decode_cf from xarray.coding.times import ( @@ -28,11 +30,6 @@ requires_cftime_or_netCDF4, ) -try: - from pandas.errors import OutOfBoundsDatetime -except ImportError: - # pandas < 0.20 - from pandas.tslib import OutOfBoundsDatetime _NON_STANDARD_CALENDARS_SET = { "noleap", @@ -119,7 +116,9 @@ def test_cf_datetime(num_dates, units, calendar): warnings.filterwarnings("ignore", "Unable to decode time axis") actual = coding.times.decode_cf_datetime(num_dates, units, calendar) - abs_diff = np.atleast_1d(abs(actual - expected)).astype(np.timedelta64) + abs_diff = np.asarray(abs(actual - expected)).ravel() + abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() + # once we no longer support versions of netCDF4 older than 1.1.5, # we could do this check with near microsecond accuracy: # https://github.com/Unidata/netcdf4-python/issues/355 @@ -829,8 +828,7 @@ def test_encode_cf_datetime_overflow(shape): def test_encode_cf_datetime_pandas_min(): - # Test that encode_cf_datetime does not fail for versions - # of pandas < 0.21.1 (GH 2623). + # GH 2623 dates = pd.date_range("2000", periods=3) num, units, calendar = encode_cf_datetime(dates) expected_num = np.array([0.0, 1.0, 2.0]) diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index 6037669ac07..0d1e5951b32 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -714,7 +714,7 @@ def test_check_for_impossible_ordering(self): @pytest.mark.filterwarnings( - "ignore:In xarray version 0.14 `auto_combine` " "will be deprecated" + "ignore:In xarray version 0.15 `auto_combine` " "will be deprecated" ) @pytest.mark.filterwarnings("ignore:Also `open_mfdataset` will no longer") @pytest.mark.filterwarnings("ignore:The datasets supplied") diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index 784a988b7cc..3df84c0460b 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -2,7 +2,6 @@ import operator import pickle from collections import OrderedDict -from distutils.version import LooseVersion import numpy as np import pandas as pd @@ -942,12 +941,6 @@ def test_dot(use_dask): assert (actual.data == np.einsum("ij,ijk->k", a, b)).all() assert isinstance(actual.variable.data, type(da_a.variable.data)) - if use_dask: - import dask - - if LooseVersion(dask.__version__) < LooseVersion("0.17.3"): - pytest.skip("needs dask.array.einsum") - # for only a single array is passed without dims argument, just return # as is actual = xr.dot(da_a) @@ -1008,7 +1001,7 @@ def test_dot(use_dask): assert (actual.data == np.zeros(actual.shape)).all() # Invalid cases - if not use_dask or LooseVersion(dask.__version__) > LooseVersion("0.17.4"): + if not use_dask: with pytest.raises(TypeError): xr.dot(da_a, dims="a", invalid=None) with pytest.raises(TypeError): diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 76b3ed1a8d6..c142ca7643b 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -46,16 +46,9 @@ def __call__(self, dsk, keys, **kwargs): return dask.get(dsk, keys, **kwargs) -def _set_dask_scheduler(scheduler=dask.get): - """ Backwards compatible way of setting scheduler. """ - if LooseVersion(dask.__version__) >= LooseVersion("0.18.0"): - return dask.config.set(scheduler=scheduler) - return dask.set_options(get=scheduler) - - def raise_if_dask_computes(max_computes=0): scheduler = CountingScheduler(max_computes) - return _set_dask_scheduler(scheduler) + return dask.config.set(scheduler=scheduler) def test_raise_if_dask_computes(): @@ -67,9 +60,7 @@ def test_raise_if_dask_computes(): class DaskTestCase: def assertLazyAnd(self, expected, actual, test): - with _set_dask_scheduler(dask.get): - # dask.get is the syncronous scheduler, which get's set also by - # dask.config.set(scheduler="syncronous") in current versions. + with dask.config.set(scheduler="synchronous"): test(actual, expected) if isinstance(actual, Dataset): @@ -512,10 +503,7 @@ def counting_get(*args, **kwargs): count[0] += 1 return dask.get(*args, **kwargs) - if dask.__version__ < "0.19.4": - ds.load(get=counting_get) - else: - ds.load(scheduler=counting_get) + ds.load(scheduler=counting_get) assert count[0] == 1 @@ -543,7 +531,7 @@ def test_dataarray_repr_legacy(self): {!r} Coordinates: - y (x) int64 dask.array + y (x) int64 dask.array Dimensions without coordinates: x""".format( data ) @@ -838,8 +826,6 @@ def build_dask_array(name): ) -# test both the perist method and the dask.persist function -# the dask.persist function requires a new version of dask @pytest.mark.parametrize( "persist", [lambda x: x.persist(), lambda x: dask.persist(x)[0]] ) @@ -892,21 +878,12 @@ def test_dataarray_with_dask_coords(): def test_basic_compute(): ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk({"x": 2}) for get in [dask.threaded.get, dask.multiprocessing.get, dask.local.get_sync, None]: - with ( - dask.config.set(scheduler=get) - if LooseVersion(dask.__version__) >= LooseVersion("0.19.4") - else dask.config.set(scheduler=get) - if LooseVersion(dask.__version__) >= LooseVersion("0.18.0") - else dask.set_options(get=get) - ): + with dask.config.set(scheduler=get): ds.compute() ds.foo.compute() ds.foo.variable.compute() -@pytest.mark.skipif( - LooseVersion(dask.__version__) < LooseVersion("0.20.0"), reason="needs newer dask" -) def test_dask_layers_and_dependencies(): ds = Dataset({"foo": ("x", range(5)), "bar": ("x", range(5))}).chunk() diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 717025afb23..4bae0d864a3 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -26,7 +26,6 @@ requires_bottleneck, requires_dask, requires_iris, - requires_np113, requires_numbagg, requires_scipy, requires_sparse, @@ -159,9 +158,7 @@ def test_struct_array_dims(self): when dimension is a structured array. """ # GH837, GH861 - # checking array subraction when dims are the same - # note: names need to be in sorted order to align consistently with - # pandas < 0.24 and >= 0.24. + # checking array subtraction when dims are the same p_data = np.array( [("Abe", 180), ("Stacy", 150), ("Dick", 200)], dtype=[("name", "|S256"), ("height", object)], @@ -3372,7 +3369,7 @@ def test_to_pandas(self): # roundtrips for shape in [(3,), (3, 4), (3, 4, 5)]: - if len(shape) > 2 and not LooseVersion(pd.__version__) < "0.25.0": + if len(shape) > 2 and LooseVersion(pd.__version__) >= "0.25.0": continue dims = list("abc")[: len(shape)] da = DataArray(np.random.randn(*shape), dims=dims) @@ -4186,12 +4183,12 @@ def test_rolling_wrapped_bottleneck(da, name, center, min_periods): assert_equal(actual, da["time"]) +@requires_dask @pytest.mark.parametrize("name", ("mean", "count")) @pytest.mark.parametrize("center", (True, False, None)) @pytest.mark.parametrize("min_periods", (1, None)) @pytest.mark.parametrize("window", (7, 8)) def test_rolling_wrapped_dask(da_dask, name, center, min_periods, window): - pytest.importorskip("dask.array") # dask version rolling_obj = da_dask.rolling(time=window, min_periods=min_periods, center=center) actual = getattr(rolling_obj, name)().load() @@ -4297,7 +4294,6 @@ def test_rolling_reduce(da, center, min_periods, window, name): assert actual.dims == expected.dims -@requires_np113 @pytest.mark.parametrize("center", (True, False)) @pytest.mark.parametrize("min_periods", (None, 1, 2, 3)) @pytest.mark.parametrize("window", (1, 2, 3, 4)) @@ -4658,7 +4654,6 @@ def test_no_dict(): d.__dict__ -@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") def test_subclass_slots(): """Test that DataArray subclasses must explicitly define ``__slots__``. diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 5d856c9f323..fdd5a419383 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -25,7 +25,7 @@ open_dataset, set_options, ) -from xarray.core import dtypes, indexing, npcompat, utils +from xarray.core import dtypes, indexing, utils from xarray.core.common import duck_array_ops, full_like from xarray.core.npcompat import IS_NEP18_ACTIVE from xarray.core.pycompat import integer_types @@ -2142,9 +2142,7 @@ def test_drop_index_labels(self): expected = data.isel(x=slice(0, 0)) assert_identical(expected, actual) - # This exception raised by pandas changed from ValueError -> KeyError - # in pandas 0.23. - with pytest.raises((ValueError, KeyError)): + with pytest.raises(KeyError): # not contained in axis data.drop(["c"], dim="x") @@ -2492,13 +2490,8 @@ def test_expand_dims_error(self): ) with raises_regex(TypeError, "value of new dimension"): original.expand_dims(OrderedDict((("d", 3.2),))) - - # TODO: only the code under the if-statement is needed when python 3.5 - # is no longer supported. - python36_plus = sys.version_info[0] == 3 and sys.version_info[1] > 5 - if python36_plus: - with raises_regex(ValueError, "both keyword and positional"): - original.expand_dims(OrderedDict((("d", 4),)), e=4) + with raises_regex(ValueError, "both keyword and positional"): + original.expand_dims(OrderedDict((("d", 4),)), e=4) def test_expand_dims_int(self): original = Dataset( @@ -2605,21 +2598,6 @@ def test_expand_dims_mixed_int_and_coords(self): ) assert_identical(actual, expected) - @pytest.mark.skipif( - sys.version_info[:2] > (3, 5), - reason="we only raise these errors for Python 3.5", - ) - def test_expand_dims_kwargs_python35(self): - original = Dataset({"x": ("a", np.random.randn(3))}) - with raises_regex(ValueError, "dim_kwargs isn't"): - original.expand_dims(e=["l", "m", "n"]) - with raises_regex(TypeError, "must be an OrderedDict"): - original.expand_dims({"e": ["l", "m", "n"]}) - - @pytest.mark.skipif( - sys.version_info[:2] < (3, 6), - reason="keyword arguments are only ordered on Python 3.6+", - ) def test_expand_dims_kwargs_python36plus(self): original = Dataset( {"x": ("a", np.random.randn(3)), "y": (["b", "a"], np.random.randn(4, 3))}, @@ -5554,7 +5532,7 @@ def test_differentiate(dask, edge_order): # along x actual = da.differentiate("x", edge_order) expected_x = xr.DataArray( - npcompat.gradient(da, da["x"], axis=0, edge_order=edge_order), + np.gradient(da, da["x"], axis=0, edge_order=edge_order), dims=da.dims, coords=da.coords, ) @@ -5569,7 +5547,7 @@ def test_differentiate(dask, edge_order): # along y actual = da.differentiate("y", edge_order) expected_y = xr.DataArray( - npcompat.gradient(da, da["y"], axis=1, edge_order=edge_order), + np.gradient(da, da["y"], axis=1, edge_order=edge_order), dims=da.dims, coords=da.coords, ) @@ -5612,7 +5590,7 @@ def test_differentiate_datetime(dask): # along x actual = da.differentiate("x", edge_order=1, datetime_unit="D") expected_x = xr.DataArray( - npcompat.gradient( + np.gradient( da, da["x"].variable._to_numeric(datetime_unit="D"), axis=0, edge_order=1 ), dims=da.dims, @@ -5649,7 +5627,7 @@ def test_differentiate_cftime(dask): da = da.chunk({"time": 4}) actual = da.differentiate("time", edge_order=1, datetime_unit="D") - expected_data = npcompat.gradient( + expected_data = np.gradient( da, da["time"].variable._to_numeric(datetime_unit="D"), axis=0, edge_order=1 ) expected = xr.DataArray(expected_data, coords=da.coords, dims=da.dims) @@ -5772,7 +5750,6 @@ def test_no_dict(): d.__dict__ -@pytest.mark.skipif(sys.version_info < (3, 6), reason="requires python3.6 or higher") def test_subclass_slots(): """Test that Dataset subclasses must explicitly define ``__slots__``. diff --git a/xarray/tests/test_distributed.py b/xarray/tests/test_distributed.py index a3bea6db85f..b3c0ce37a54 100644 --- a/xarray/tests/test_distributed.py +++ b/xarray/tests/test_distributed.py @@ -3,8 +3,8 @@ import pytest -dask = pytest.importorskip("dask", minversion="0.18") # isort:skip -distributed = pytest.importorskip("distributed", minversion="1.21") # isort:skip +dask = pytest.importorskip("dask") # isort:skip +distributed = pytest.importorskip("distributed") # isort:skip from dask.distributed import Client, Lock from distributed.utils_test import cluster, gen_cluster diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 766a391b57f..62ea19be97b 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -1,5 +1,4 @@ import warnings -from distutils.version import LooseVersion from textwrap import dedent import numpy as np @@ -28,7 +27,6 @@ arm_xfail, assert_array_equal, has_dask, - has_np113, raises_regex, requires_cftime, requires_dask, @@ -353,7 +351,7 @@ def test_reduce(dim_num, dtype, dask, func, skipna, aggdim): warnings.filterwarnings("ignore", "All-NaN slice") warnings.filterwarnings("ignore", "invalid value encountered in") - if has_np113 and da.dtype.kind == "O" and skipna: + if da.dtype.kind == "O" and skipna: # Numpy < 1.13 does not handle object-type array. try: if skipna: @@ -531,12 +529,8 @@ def test_min_count(dim_num, dtype, dask, func, aggdim): min_count = 3 actual = getattr(da, func)(dim=aggdim, skipna=True, min_count=min_count) - - if LooseVersion(pd.__version__) >= LooseVersion("0.22.0"): - # min_count is only implenented in pandas > 0.22 - expected = series_reduce(da, func, skipna=True, dim=aggdim, min_count=min_count) - assert_allclose(actual, expected) - + expected = series_reduce(da, func, skipna=True, dim=aggdim, min_count=min_count) + assert_allclose(actual, expected) assert_dask_array(actual, dask) diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index ba108b2dbaf..ae405015659 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -83,8 +83,7 @@ def test_convert_label_indexer(self): indexing.convert_label_indexer(mindex, 0) with pytest.raises(ValueError): indexing.convert_label_indexer(index, {"three": 0}) - with pytest.raises((KeyError, IndexError)): - # pandas 0.21 changed this from KeyError to IndexError + with pytest.raises(IndexError): indexing.convert_label_indexer(mindex, (slice(None), 1, "no_level")) def test_convert_unsorted_datetime_index_raises(self): diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 99a72d68ad8..e3b29b86e4d 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -25,7 +25,6 @@ raises_regex, requires_cftime, requires_matplotlib, - requires_matplotlib2, requires_nc_time_axis, requires_seaborn, ) @@ -360,7 +359,6 @@ def test_convenient_facetgrid(self): d[0].plot(x="x", y="y", col="z", ax=plt.gca()) @pytest.mark.slow - @requires_matplotlib2 def test_subplot_kws(self): a = easy_array((10, 15, 4)) d = DataArray(a, dims=["y", "x", "z"]) @@ -1962,10 +1960,11 @@ def test_datetime_hue(self, hue_style): ds2.plot.scatter(x="A", y="B", hue="hue", hue_style=hue_style) def test_facetgrid_hue_style(self): - # Can't move this to pytest.mark.parametrize because py35-bare-minimum - # doesn't have mpl. - for hue_style, map_type in zip( - ["discrete", "continuous"], [list, mpl.collections.PathCollection] + # Can't move this to pytest.mark.parametrize because py36-bare-minimum + # doesn't have matplotlib. + for hue_style, map_type in ( + ("discrete", list), + ("continuous", mpl.collections.PathCollection), ): g = self.ds.plot.scatter( x="A", y="B", row="row", col="col", hue="hue", hue_style=hue_style diff --git a/xarray/tests/test_ufuncs.py b/xarray/tests/test_ufuncs.py index 1095cc360dd..26241152dfa 100644 --- a/xarray/tests/test_ufuncs.py +++ b/xarray/tests/test_ufuncs.py @@ -8,7 +8,7 @@ from . import assert_array_equal from . import assert_identical as assert_identical_ -from . import mock, raises_regex, requires_np113 +from . import mock, raises_regex def assert_identical(a, b): @@ -19,7 +19,6 @@ def assert_identical(a, b): assert_array_equal(a, b) -@requires_np113 def test_unary(): args = [ 0, @@ -32,7 +31,6 @@ def test_unary(): assert_identical(a + 1, np.cos(a)) -@requires_np113 def test_binary(): args = [ 0, @@ -49,7 +47,6 @@ def test_binary(): assert_identical(t2 + 1, np.maximum(t2 + 1, t1)) -@requires_np113 def test_binary_out(): args = [ 1, @@ -64,7 +61,6 @@ def test_binary_out(): assert_identical(actual_exponent, arg) -@requires_np113 def test_groupby(): ds = xr.Dataset({"a": ("x", [0, 0, 0])}, {"c": ("x", [0, 0, 1])}) ds_grouped = ds.groupby("c") @@ -87,7 +83,6 @@ def test_groupby(): np.maximum(ds.a.variable, ds_grouped) -@requires_np113 def test_alignment(): ds1 = xr.Dataset({"a": ("x", [1, 2])}, {"x": [0, 1]}) ds2 = xr.Dataset({"a": ("x", [2, 3]), "b": 4}, {"x": [1, 2]}) @@ -104,14 +99,12 @@ def test_alignment(): assert_identical_(actual, expected) -@requires_np113 def test_kwargs(): x = xr.DataArray(0) result = np.add(x, 1, dtype=np.float64) assert result.dtype == np.float64 -@requires_np113 def test_xarray_defers_to_unrecognized_type(): class Other: def __array_ufunc__(self, *args, **kwargs): @@ -123,7 +116,6 @@ def __array_ufunc__(self, *args, **kwargs): assert np.sin(xarray_obj, out=other) == "other" -@requires_np113 def test_xarray_handles_dask(): da = pytest.importorskip("dask.array") x = xr.DataArray(np.ones((2, 2)), dims=["x", "y"]) @@ -133,7 +125,6 @@ def test_xarray_handles_dask(): assert isinstance(result, xr.DataArray) -@requires_np113 def test_dask_defers_to_xarray(): da = pytest.importorskip("dask.array") x = xr.DataArray(np.ones((2, 2)), dims=["x", "y"]) @@ -143,14 +134,12 @@ def test_dask_defers_to_xarray(): assert isinstance(result, xr.DataArray) -@requires_np113 def test_gufunc_methods(): xarray_obj = xr.DataArray([1, 2, 3]) with raises_regex(NotImplementedError, "reduce method"): np.add.reduce(xarray_obj, 1) -@requires_np113 def test_out(): xarray_obj = xr.DataArray([1, 2, 3]) @@ -164,7 +153,6 @@ def test_out(): assert_identical(other, np.array([1, 2, 3])) -@requires_np113 def test_gufuncs(): xarray_obj = xr.DataArray([1, 2, 3]) fake_gufunc = mock.Mock(signature="(n)->()", autospec=np.sin) @@ -182,7 +170,6 @@ def test_xarray_ufuncs_deprecation(): assert len(record) == 0 -@requires_np113 @pytest.mark.filterwarnings("ignore::RuntimeWarning") @pytest.mark.parametrize( "name", diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index 254983364f9..859306b88cb 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -73,9 +73,7 @@ def test_multiindex_from_product_levels(): [pd.Index(["b", "a"]), pd.Index([1, 3, 2])] ) np.testing.assert_array_equal( - # compat for pandas < 0.24 - result.codes if hasattr(result, "codes") else result.labels, - [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + result.codes, [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] ) np.testing.assert_array_equal(result.levels[0], ["b", "a"]) np.testing.assert_array_equal(result.levels[1], [1, 3, 2]) @@ -89,9 +87,7 @@ def test_multiindex_from_product_levels_non_unique(): [pd.Index(["b", "a"]), pd.Index([1, 1, 2])] ) np.testing.assert_array_equal( - # compat for pandas < 0.24 - result.codes if hasattr(result, "codes") else result.labels, - [[0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1]], + result.codes, [[0, 0, 0, 1, 1, 1], [0, 0, 1, 0, 0, 1]] ) np.testing.assert_array_equal(result.levels[0], ["b", "a"]) np.testing.assert_array_equal(result.levels[1], [1, 2]) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 7f9538c9ea9..172a23d9a76 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -2,7 +2,6 @@ from collections import OrderedDict from copy import copy, deepcopy from datetime import datetime, timedelta -from distutils.version import LooseVersion from textwrap import dedent import numpy as np @@ -1837,13 +1836,6 @@ def test_getitem_fancy(self): def test_getitem_1d_fancy(self): super().test_getitem_1d_fancy() - def test_equals_all_dtypes(self): - import dask - - if "0.18.2" <= LooseVersion(dask.__version__) < "0.19.1": - pytest.xfail("https://github.com/pydata/xarray/issues/2318") - super().test_equals_all_dtypes() - def test_getitem_with_mask_nd_indexer(self): import dask.array as da From 132733a917171fcb1f269406eb9e6668cbb7e376 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 8 Oct 2019 22:13:47 +0000 Subject: [PATCH 6/6] Fix concat bug when concatenating unlabeled dimensions. (#3362) * Fix concat bug when concatenating unlabeled dimensions. * Add whats-new * Add back older test. * fix test * Revert "fix test" This reverts commit c33ca34a012c97c82be278fb0b8c1aeb000a284d. * better fix --- doc/whats-new.rst | 2 ++ xarray/core/concat.py | 13 ++++++++++--- xarray/tests/test_concat.py | 10 ++++++++-- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 5b73059b34c..81206cc5cc1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -67,6 +67,8 @@ Bug fixes - Line plots with the ``x`` or ``y`` argument set to a 1D non-dimensional coord now plot the correct data for 2D DataArrays (:issue:`3334`). By `Tom Nicholas `_. +- Fix error in concatenating unlabeled dimensions (:pull:`3362`). + By `Deepak Cherian `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 5c9beda3f74..75c72c99a42 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -177,8 +177,6 @@ def _calc_concat_over(datasets, dim, dim_names, data_vars, coords, compat): if dim not in ds.dims: if dim in ds: ds = ds.set_coords(dim) - else: - raise ValueError("%r is not present in all datasets" % dim) concat_over.update(k for k, v in ds.variables.items() if dim in v.dims) concat_dim_lengths.append(ds.dims.get(dim, 1)) @@ -362,12 +360,21 @@ def ensure_common_dims(vars): # n.b. this loop preserves variable order, needed for groupby. for k in datasets[0].variables: if k in concat_over: - vars = ensure_common_dims([ds.variables[k] for ds in datasets]) + try: + vars = ensure_common_dims([ds.variables[k] for ds in datasets]) + except KeyError: + raise ValueError("%r is not present in all datasets." % k) combined = concat_vars(vars, dim, positions) assert isinstance(combined, Variable) result_vars[k] = combined result = Dataset(result_vars, attrs=result_attrs) + absent_coord_names = coord_names - set(result.variables) + if absent_coord_names: + raise ValueError( + "Variables %r are coordinates in some datasets but not others." + % absent_coord_names + ) result = result.set_coords(coord_names) result.encoding = result_encoding diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index d2635e4451a..1114027387e 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -42,8 +42,10 @@ def test_concat_compat(): for var in ["has_x", "no_x_y"]: assert "y" not in result[var] + with raises_regex(ValueError, "coordinates in some datasets but not others"): + concat([ds1, ds2], dim="q") with raises_regex(ValueError, "'q' is not present in all datasets"): - concat([ds1, ds2], dim="q", data_vars="all", compat="broadcast_equals") + concat([ds2, ds1], dim="q") class TestConcatDataset: @@ -90,7 +92,11 @@ def test_concat_coords_kwarg(self, data, dim, coords): assert_equal(data["extra"], actual["extra"]) def test_concat(self, data): - split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))] + split_data = [ + data.isel(dim1=slice(3)), + data.isel(dim1=3), + data.isel(dim1=slice(4, None)), + ] assert_identical(data, concat(split_data, "dim1")) def test_concat_dim_precedence(self, data):