From aedb9a4943843ace5cd23ac6814e689aac5c1eb6 Mon Sep 17 00:00:00 2001 From: Max Hargreaves Date: Tue, 19 Sep 2023 16:57:42 -0700 Subject: [PATCH 1/6] Change: progress --- pyproject.toml | 5 + setup.cfg | 4 +- src/biocframe/BiocFrame.py | 439 ++++++++++++++++++-------------- src/biocframe/__init__.py | 24 +- src/biocframe/_type_checks.py | 11 +- src/biocframe/_validators.py | 98 ++++--- src/biocframe/io/from_pandas.py | 7 +- src/biocframe/types.py | 41 ++- src/biocframe/utils.py | 54 ++-- tests/test_utils.py | 6 +- 10 files changed, 396 insertions(+), 293 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a7cea75..5980f62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,9 @@ line-length = 120 src = ["src"] exclude = ["tests"] extend-ignore = ["F821"] +select = ["E", "F", "I", "D", "PLC", "A", "RUF"] +ignore = ["E501", "D203", "D213", "A003"] +unfixable = ["F401", "F841"] [tool.ruff.pydocstyle] convention = "google" @@ -19,5 +22,7 @@ convention = "google" [tool.ruff.per-file-ignores] "__init__.py" = ["E402", "F401"] +"**/__init__.py" = ["PLC0414"] + [tool.black] force-exclude = "__init__.py" diff --git a/setup.cfg b/setup.cfg index b38a996..b8fa774 100644 --- a/setup.cfg +++ b/setup.cfg @@ -41,7 +41,7 @@ package_dir = =src # Require a min/specific Python version (comma-separated conditions) -# python_requires = >=3.8 +python_requires = >=3.8 # Add here dependencies of your project (line-separated), e.g. requests>=2.2,<3.0. # Version specifiers like >=2.2,<3.0 avoid problems due to API changes in @@ -50,6 +50,7 @@ package_dir = install_requires = importlib-metadata; python_version<"3.8" prettytable + pandas [options.packages.find] where = src @@ -68,7 +69,6 @@ testing = pytest pytest-cov numpy - pandas [options.entry_points] # Add here console scripts like: diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/BiocFrame.py index 2b01302..79ffa9f 100644 --- a/src/biocframe/BiocFrame.py +++ b/src/biocframe/BiocFrame.py @@ -1,18 +1,43 @@ -from collections import OrderedDict -from typing import List, MutableMapping, Optional, Sequence, Tuple, Union - -from pandas.api.types import is_numeric_dtype +"""A Bioconductor-like data frame.""" + +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, + Union, + cast, + overload, +) + +from pandas.api.types import is_numeric_dtype # type: ignore from prettytable import PrettyTable from ._type_checks import is_list_of_type from ._validators import validate_cols, validate_rows, validate_unique_list -from .types import SlicerArgTypes, SlicerTypes -from .utils import _match_to_indices, _slice_or_index +from .types import ( + AllSlice, + AtomicSlice, + BiocSeq, + ColType, + DataType, + RangeSlice, + SimpleSlice, +) +from .utils import match_to_indices, slice_or_index + +try: + from pandas import DataFrame, RangeIndex +except Exception: + pass __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" +ItemType = Union["BiocFrame", ColType] + class BiocFrameIter: """An iterator to a :py:class:`~biocframe.BiocFrame.BiocFrame` object. @@ -25,12 +50,12 @@ def __init__(self, obj: "BiocFrame") -> None: self._bframe = obj self._current_index = 0 - def __iter__(self): + def __iter__(self) -> "BiocFrameIter": return self def __next__(self): if self._current_index < len(self._bframe): - iter_row_index = ( + iter_row_index: Optional[str] = ( self._bframe.row_names[self._current_index] if self._bframe.row_names is not None else None @@ -46,7 +71,7 @@ def __next__(self): class BiocFrame: """`BiocFrame` is an alternative to :class:`~pandas.DataFrame`. - Columns may extend :class:`~collections.abc.Sequence`, + Columns may extend :class:`~collections.abc.List`, and must implement the length (``__len__``) and slice (``__getitem__``) dunder methods. This allows :py:class:`~biocframe.BiocFrame.BiocFrame` to accept nested `BiocFrame` objects as columns. @@ -86,59 +111,67 @@ class BiocFrame: .. code-block:: python sliced_bframe = bframe[1:2, [True, False, False]] - - Attributes: - data (MutableMapping[str, Union[Sequence, MutableMapping]], optional): - Dictionary of column names as `keys` and their values. all columns must have - the same length. Defaults to None. - number_of_rows (int, optional): Number of rows. Defaults to None. - row_names (Sequence, optional): Row index names. Defaults to None. - column_names (Sequence[str], optional): Column names, if not provided, - is automatically inferred from data. Defaults to None. - metadata (MutableMapping, optional): Additional metadata. Defaults to None. - - Raises: - ValueError: if rows or columns mismatch from data. """ def __init__( self, - data: Optional[MutableMapping[str, Union[Sequence, MutableMapping]]] = None, + data: Optional[DataType] = None, number_of_rows: Optional[int] = None, - row_names: Optional[Sequence[str]] = None, - column_names: Optional[Sequence[str]] = None, - metadata: Optional[MutableMapping] = None, + row_names: Optional[List[str]] = None, + column_names: Optional[List[str]] = None, + metadata: Optional[Dict[str, Any]] = None, ) -> None: - self._number_of_rows = number_of_rows + """Initialize a `BiocFrame` object. + + Args: + data (Dict[str, Union[List, Dict, BioSeq]], optional): + Dictionary of column names as `keys` and their values. all columns must have + the same length. Defaults to None. + number_of_rows (int, optional): Number of rows. Defaults to None. + row_names (List, optional): Row index names. Defaults to None. + column_names (List[str], optional): Column names, if not provided, + is automatically inferred from data keys. Defaults to None. + metadata (Dict, optional): Additional metadata. Defaults to None. + + Raises: + ValueError: if rows or columns mismatch from data. + """ + self._data: DataType = {} if data is None else data self._row_names = row_names - self._data = {} if data is None else data - self._column_names = column_names + self._number_of_rows = validate_rows( + self._data, number_of_rows, self._row_names + ) + self._column_names, self._data = validate_cols( + column_names, self._data + ) + self._number_of_columns = len(self._column_names) self._metadata = metadata - self._validate() + @classmethod + def from_pandas(cls, df: "DataFrame") -> "BiocFrame": + """Read a :py:class:`~biocframe.BiocFrame.BiocFrame` from :py:class:`~pandas.DataFrame` object. - def _validate(self): - """Internal method to validate the object. + Args: + df (:py:class:`~pandas.DataFrame`): Input data. Raises: - ValueError: When all columns does not contain the - same number of rows. - ValueError: When row index is not unique. - """ + TypeError: If ``input`` is not a :py:class:`~pandas.DataFrame`. - self._number_of_rows = validate_rows( - self._data, self._number_of_rows, self._row_names - ) - self._column_names, self._data = validate_cols(self._column_names, self._data) + Returns: + BiocFrame: A :py:class:`~biocframe.BiocFrame.BiocFrame` object. + """ + r_data: Dict[str, List[Any]] = df.to_dict("list") # type: ignore + r_index: Optional[List[str]] = None - self._number_of_columns = len(self._column_names) + if df.index is not RangeIndex: # type: ignore + r_index = df.index.to_list() # type: ignore - if self._number_of_rows is None: - self._number_of_rows = 0 + return BiocFrame(data=r_data, row_names=r_index) def __repr__(self) -> str: + """Get a machine-readable string representation of the object.""" table = PrettyTable(padding_width=1) - table.field_names = [str(col) for col in self.column_names] + table.field_names = [str(col) for col in self._column_names] _rows = [] rows_to_show = 2 @@ -155,7 +188,7 @@ def __repr__(self) -> str: if self.shape[0] > 2 * rows_to_show: # add ... - _rows.append(["..." for _ in range(len(self.column_names))]) + _rows.append(["..." for _ in range(len(self._column_names))]) _last = self.shape[0] - _top if _last <= rows_to_show: @@ -189,31 +222,31 @@ def shape(self) -> Tuple[int, int]: return (self._number_of_rows, self._number_of_columns) @property - def row_names(self) -> Optional[List]: - """Access row index (names). + def row_names(self) -> Optional[List[str]]: + """Get/set the row names. - Returns: - (List, optional): Row names if available, else None. - """ - return self._row_names - - @row_names.setter - def row_names(self, names: Optional[Sequence]): - """Set new row index. All values in ``names`` must be unique. + Set new row index. All values in ``names`` must be unique. Args: - names (Sequence, optional): A list of unique values. + names (List, optional): A list of unique values, or `None`. If + `None` row names are removed. + + Returns: + (List, optional): Row names if available, else None. Raises: ValueError: Length of ``names`` does not match number of rows. ValueError: ``names`` is not unique. """ + return self._row_names + @row_names.setter + def row_names(self, names: Optional[List[str]]) -> None: if names is not None: if len(names) != self.shape[0]: raise ValueError( - "Length of `names` does not match the number of rows, need to be " - f"{self.shape[0]} but provided {len(names)}." + "Length of `names` does not match the number of rows, " + f"need to be {self.shape[0]} but provided {len(names)}." ) if not validate_unique_list(names): @@ -222,58 +255,53 @@ def row_names(self, names: Optional[Sequence]): self._row_names = names @property - def data(self) -> MutableMapping[str, Union[Sequence, MutableMapping]]: + def data(self) -> DataType: """Access data as :py:class:`dict`. Returns: - MutableMapping[str, Union[Sequence, MutableMapping]]: + Dict[str, Union[List, Dict]]: Dictionary of columns and their values. """ return self._data @property - def column_names(self) -> list: - """Access column names. + def column_names(self) -> List[str]: + """Get/set the column_names. + + Args: + names (List[str]): A list of unique values. Returns: list: A list of column names. - """ - return self._column_names - - @column_names.setter - def column_names(self, names: Sequence[str]): - """Set new column names. New names must be unique. - - Args: - names (Sequence[str]): A list of unique values. Raises: ValueError: Length of ``names`` does not match number of columns. ValueError: ``names`` is not unique. """ + return self._column_names - if names is None: - raise ValueError("`names` cannot be `None`!") - + @column_names.setter + def column_names(self, names: List[str]) -> None: if len(names) != self._number_of_columns: raise ValueError( - "Length of `names` does not match number of columns, need to be " - f"{self._number_of_columns} but provided {len(names)}." + "Length of `names` does not match number of columns. Needs to " + f"be {self._number_of_columns} but provided {len(names)}." ) if not (validate_unique_list(names)): raise ValueError("Column names must be unique!") - new_data = OrderedDict() - for idx in range(len(names)): - new_data[names[idx]] = self._data[self.column_names[idx]] - self._column_names = names - self._data = new_data + self._data = { + names[i]: v for i, (_, v) in enumerate(self.data.items()) + } @property - def metadata(self) -> Optional[dict]: - """Access metadata. + def metadata(self) -> Optional[Dict[str, Any]]: + """Get/set the metadata. + + Args: + metadata (Dict, Optional): New metadata object. Returns: (dict, optional): Metadata if available. @@ -281,18 +309,7 @@ def metadata(self) -> Optional[dict]: return self._metadata @metadata.setter - def metadata(self, metadata: Optional[MutableMapping]): - """Set new metadata. - - Args: - metadata (MutableMapping, Optional): New metadata object. - """ - if metadata is not None: - if not isinstance(metadata, dict): - raise TypeError( - f"`metadata` must be a dictionary, provided {type(metadata)}" - ) - + def metadata(self, metadata: Optional[Dict[str, Any]]): self._metadata = metadata def has_column(self, name: str) -> bool: @@ -306,53 +323,35 @@ def has_column(self, name: str) -> bool: """ return name in self.column_names - def column(self, index_or_name: Union[str, int]) -> Union[Sequence, MutableMapping]: - """Access a column by integer position or column label. - - Args: - index_or_name (Union[str, int]): Name of the column, must be present in - :py:attr:`~biocframe.BiocFrame.BiocFrame.column_names`. - - Alternatively, you may provide the integer index of the column to - access. - - Raises: - ValueError: if ``index_or_name`` is not in column names. - ValueError: if integer index is greater than number of columns. - TypeError: if ``index_or_name`` is neither a string nor an integer. - - Returns: - Union[Sequence, MutableMapping]: Column with its original type preserved. - """ - - return self[:, index_or_name] - - def row(self, index_or_name: Union[str, int]) -> dict: - """Access a row by integer position or row name. - - Args: - index_or_name (Union[str, int]): Integer index of the row to access. - - Alternatively, you may provide a string specifying the row to access, - only if :py:attr:`~biocframe.BiocFrame.BiocFrame.row_names` are - available. - - Raises: - ValueError: if ``index_or_name`` not in row names. - ValueError: if integer index greater than number of rows. - TypeError: if ``index_or_name`` is neither a string nor an integer. + @overload + def _slice( + self, + row_indices_or_names: Optional[AtomicSlice], + column_indices_or_names: Optional[AtomicSlice], + ) -> Dict[str, Any]: + ... - Returns: - dict: A dictionary with keys as column names and their values. - """ + @overload + def _slice( + self, + row_indices_or_names: Optional[RangeSlice], + column_indices_or_names: Optional[RangeSlice], + ) -> ItemType: + ... - return self[index_or_name, :] + @overload + def _slice( + self, + row_indices_or_names: Union[AtomicSlice, slice], + column_indices_or_names: Union[AtomicSlice, slice], + ) -> ItemType: + ... def _slice( self, - row_indices_or_names: Optional[SlicerTypes] = None, - column_indices_or_names: Optional[SlicerTypes] = None, - ) -> Union["BiocFrame", dict, list]: + row_indices_or_names: Optional[AllSlice] = None, + column_indices_or_names: Optional[AllSlice] = None, + ) -> ItemType: """Internal method to slice by index or values. Args: @@ -370,8 +369,6 @@ def _slice( - If a single column is sliced, returns a :py:class:`list`. - For all other scenarios, returns the same type as caller with the subsetted rows and columns. """ - - new_data = OrderedDict() new_row_names = self.row_names new_column_names = self.column_names is_row_unary = False @@ -379,14 +376,15 @@ def _slice( # slice the columns and data if column_indices_or_names is not None: - new_column_indices, is_col_unary = _match_to_indices( + new_column_indices, is_col_unary = match_to_indices( self.column_names, column_indices_or_names ) - new_column_names = _slice_or_index(new_column_names, new_column_indices) + new_column_names = cast( + List[str], slice_or_index(new_column_names, new_column_indices) + ) - for col in new_column_names: - new_data[col] = self._data[col] + new_data = {col: self._data[col] for col in new_column_names} # slice the rows of the data new_number_of_rows = None @@ -395,44 +393,55 @@ def _slice( if temp_row_names is None: temp_row_names = list(range(self.shape[0])) - new_row_indices, is_row_unary = _match_to_indices( + new_row_indices, is_row_unary = match_to_indices( temp_row_names, row_indices_or_names ) - new_row_names = _slice_or_index(temp_row_names, new_row_indices) + new_row_names = slice_or_index(temp_row_names, new_row_indices) new_number_of_rows = len(new_row_names) for k, v in new_data.items(): - if hasattr(v, "shape"): - tmp = [slice(None)] * len(v.shape) + if isinstance(v, BiocSeq): + tmp: List[SimpleSlice] = [slice(None)] * len(v.shape) tmp[0] = new_row_indices new_data[k] = v[(*tmp,)] else: - new_data[k] = _slice_or_index(v, new_row_indices) + new_data[k] = slice_or_index(v, new_row_indices) else: new_number_of_rows = self.shape[0] if is_row_unary is True: - rdata = {} - for col in new_column_names: - rdata[col] = new_data[col][0] - return rdata - elif is_col_unary is True: + return { + col: next( + iter( + new_data[col].values() # type: ignore + if isinstance(new_data[col], dict) + else new_data[col] + ) + ) + for col in new_column_names + } + + if is_col_unary is True: return new_data[new_column_names[0]] - current_class_const = type(self) - return current_class_const( + return type(self)( data=new_data, number_of_rows=new_number_of_rows, row_names=new_row_names, column_names=new_column_names, ) + @overload + def __getitem__(self, __key: AtomicSlice) -> Dict[str, Any]: + ... + + @overload + def __getitem__(self, __key: RangeSlice) -> ItemType: + ... + # TODO: implement in-place or views - def __getitem__( - self, - args: SlicerArgTypes, - ) -> Union["BiocFrame", dict, list]: + def __getitem__(self, __key: AllSlice) -> ItemType: """Subset the data frame. This operation returns a new object with the same type as caller. @@ -498,46 +507,84 @@ def __getitem__( - If a single column is sliced, returns a :py:class:`list`. - For all other scenarios, returns the same type as caller with the subsetted rows and columns. """ - # not an array, single str, slice by column - if isinstance(args, str): - return self._slice(None, args) + if isinstance(__key, str): + return self._slice(None, __key) - if isinstance(args, int): - return self._slice(args, None) + if isinstance(__key, bool): + return self._slice(__key, None) + + if isinstance(__key, int): + return self._slice(__key, None) # not an array, a slice - if isinstance(args, slice): - return self._slice(args, None) + if isinstance(__key, slice): + return self._slice(__key, None) - if isinstance(args, list): + if isinstance(__key, list): # column names if everything is a string - if is_list_of_type(args, str): - return self._slice(None, args) - elif is_list_of_type(args, int): - return self._slice(args, None) + if is_list_of_type(__key, str): + return self._slice(None, __key) + elif is_list_of_type(__key, int): + return self._slice(__key, None) + elif is_list_of_type(__key, bool): + return self._slice(__key, None) else: raise ValueError("`args` is not supported.") # tuple - if isinstance(args, tuple): - if len(args) == 0: - raise ValueError("`args` must contain at least one slice.") - - if len(args) == 1: - return self._slice(args[0], None) - elif len(args) == 2: - return self._slice( - args[0], - args[1], - ) - else: - raise ValueError("Length of `args` is more than 2.") + if len(__key) == 0: + raise ValueError("`args` must contain at least one slice.") + + if len(__key) == 1: + return self._slice(__key[0], None) + elif len(__key) == 2: + return self._slice(__key[0], __key[1]) + else: + raise ValueError("Length of `args` is more than 2.") - raise TypeError("`args` is not supported.") + def column(self, index_or_name: AtomicSlice) -> ItemType: + """Access a column by integer position or column label. + + Args: + index_or_name (Union[str, int]): Name of the column, must be present in + :py:attr:`~biocframe.BiocFrame.BiocFrame.column_names`. + + Alternatively, you may provide the integer index of the column to + access. + + Raises: + ValueError: if ``index_or_name`` is not in column names. + ValueError: if integer index is greater than number of columns. + TypeError: if ``index_or_name`` is neither a string nor an integer. + + Returns: + Any: Column with its original type preserved. + """ + return self[:, index_or_name] + + def row(self, index_or_name: AtomicSlice) -> Dict[str, Any]: + """Access a row by integer position or row name. + + Args: + index_or_name (Union[str, int]): Integer index of the row to access. + + Alternatively, you may provide a string specifying the row to access, + only if :py:attr:`~biocframe.BiocFrame.BiocFrame.row_names` are + available. + + Raises: + ValueError: if ``index_or_name`` not in row names. + ValueError: if integer index greater than number of rows. + TypeError: if ``index_or_name`` is neither a string nor an integer. + + Returns: + Any: A dictionary with keys as column names and their values. + """ + return self[index_or_name, :] # TODO: implement in-place or views - def __setitem__(self, name: str, value: Sequence): + def __setitem__(self, name: str, value: ColType) -> None: """Add or re-assign a value to a column. Usage: @@ -560,7 +607,7 @@ def __setitem__(self, name: str, value: Sequence): Args: name (str): Name of the column. - value (Sequence): New value to set. + value (List): New value to set. Raises: ValueError: If length of ``value`` does not match the number of rows. @@ -575,7 +622,8 @@ def __setitem__(self, name: str, value: Sequence): self._column_names.append(name) self._number_of_columns += 1 - self._data[name] = value + # Dunno how to fix this one... + self._data[name] = value # type: ignore # TODO: implement in-place or view def __delitem__(self, name: str): @@ -654,7 +702,6 @@ def __array_ufunc__(self, func, method, *inputs, **kwargs) -> "BiocFrame": Returns: An object with the same type as caller. """ - from pandas import Series input = inputs[0] @@ -668,9 +715,10 @@ def __array_ufunc__(self, func, method, *inputs, **kwargs) -> "BiocFrame": return input + ########################################################################### # compatibility with Pandas @property - def columns(self) -> list: + def columns(self) -> List[str]: """Alias to :py:meth:`~biocframe.BiocFrame.BiocFrame.column_names`. Returns: @@ -679,7 +727,7 @@ def columns(self) -> list: return self.column_names @property - def index(self) -> Optional[list]: + def index(self) -> Optional[List[str]]: """Alias to :py:meth:`~biocframe.BiocFrame.BiocFrame.row_names`. Returns: @@ -687,9 +735,10 @@ def index(self) -> Optional[list]: """ return self.row_names + ########################################################################### # compatibility with R interfaces @property - def rownames(self) -> Optional[list]: + def rownames(self) -> Optional[List[str]]: """Alias to :py:meth:`~biocframe.BiocFrame.BiocFrame.row_names`. Returns: @@ -698,7 +747,7 @@ def rownames(self) -> Optional[list]: return self.row_names @rownames.setter - def rownames(self, names: list): + def rownames(self, names: List[str]): """Alias to :py:meth:`~biocframe.BiocFrame.BiocFrame.row_names`. Args: @@ -707,7 +756,7 @@ def rownames(self, names: list): self.row_names = names @property - def colnames(self) -> list: + def colnames(self) -> List[str]: """Alias to :py:meth:`~biocframe.BiocFrame.BiocFrame.column_names`. Returns: @@ -716,7 +765,7 @@ def colnames(self) -> list: return self.column_names @colnames.setter - def colnames(self, names: list): + def colnames(self, names: List[str]): """Alias to :py:meth:`~biocframe.BiocFrame.BiocFrame.column_names`. Args: diff --git a/src/biocframe/__init__.py b/src/biocframe/__init__.py index f639c2b..08f9351 100644 --- a/src/biocframe/__init__.py +++ b/src/biocframe/__init__.py @@ -1,19 +1,21 @@ -import sys +from sys import version_info -if sys.version_info[:2] >= (3, 8): - # TODO: Import directly (no need for conditional) when `python_requires = >= 3.8` - from importlib.metadata import PackageNotFoundError, version # pragma: no cover +if version_info[:2] >= (3, 8): + from importlib.metadata import ( # type: ignore + PackageNotFoundError, # type: ignore + version, # type: ignore + ) else: - from importlib_metadata import PackageNotFoundError, version # pragma: no cover + from importlib_metadata import ( + PackageNotFoundError, # type: ignore + version, # type: ignore + ) try: - # Change here if project is renamed and does not equal the package name - dist_name = "BiocFrame" - __version__ = version(dist_name) + __version__: str = version(__name__.rsplit(".", 1)[0]) # type: ignore except PackageNotFoundError: # pragma: no cover - __version__ = "unknown" + __version__: str = "unknown" finally: del version, PackageNotFoundError -from .BiocFrame import BiocFrame -from .io import from_pandas +from .BiocFrame import BiocFrame as BiocFrame diff --git a/src/biocframe/_type_checks.py b/src/biocframe/_type_checks.py index 84db192..f72ffe3 100644 --- a/src/biocframe/_type_checks.py +++ b/src/biocframe/_type_checks.py @@ -1,11 +1,14 @@ -from typing import Any, Callable +"""Checks for types of objects.""" + +from collections.abc import Sequence as c_Sequence +from typing import Any, Sequence __author__ = "jkanche, keviny2" __copyright__ = "jkanche" __license__ = "MIT" -def is_list_of_type(x: Any, target_type: Callable) -> bool: +def is_list_of_type(x: Sequence[Any], target_type: type) -> bool: """Checks if ``x`` is a list, and whether all elements of the list are of the same type. Args: @@ -15,6 +18,4 @@ def is_list_of_type(x: Any, target_type: Callable) -> bool: Returns: bool: True if ``x`` is :py:class:`list` and all elements are of the same type. """ - return (isinstance(x, list) or isinstance(x, tuple)) and all( - isinstance(item, target_type) for item in x - ) + return (x, c_Sequence) and all(isinstance(item, target_type) for item in x) diff --git a/src/biocframe/_validators.py b/src/biocframe/_validators.py index fec8425..b08efae 100644 --- a/src/biocframe/_validators.py +++ b/src/biocframe/_validators.py @@ -1,5 +1,8 @@ -from collections import OrderedDict -from typing import Dict, List, MutableMapping, Optional, Sequence, Tuple, Union +"""Validators for :py:class:`~biocframe.BiocFrame.BiocFrame` object.""" + +from typing import Any, List, Optional, Tuple + +from .types import DataType __author__ = "jkanche" __copyright__ = "jkanche" @@ -7,18 +10,18 @@ def validate_rows( - data: MutableMapping[str, Union[Sequence, MutableMapping]], - number_of_rows: Optional[int], - row_names: Optional[Sequence[str]], + data: DataType, + number_of_rows: Optional[int] = None, + row_names: Optional[List[str]] = None, ) -> int: """Validate rows of :py:class:`~biocframe.BiocFrame.BiocFrame` object. Args: - data (MutableMapping[str, Union[Sequence, MutableMapping]], optional): + data (MutableMapping[str, Union[List, MutableMapping]], optional): Dictionary of columns and their values. all columns must have the same length. Defaults to {}. number_of_rows (int, optional): Number of rows. - row_names (Sequence[str], optional): Row index values. + row_names (List[str], optional): Row index values. Raises: @@ -28,63 +31,64 @@ def validate_rows( Returns: int: Validated number of rows in ``data``. """ - incorrect_len_keys = [] - for k, v in data.items(): - tmpLen = len(v) - - if number_of_rows is None: - number_of_rows = tmpLen - elif number_of_rows != tmpLen: - incorrect_len_keys.append(k) - - if len(incorrect_len_keys) > 0: + lengths = {k: len(v) for k, v in data.items()} + mean_len = sum(lengths.values()) / len(lengths.values()) + int_mean_len = int(mean_len) + + if int_mean_len != mean_len or ( + number_of_rows is not None and int_mean_len != number_of_rows + ): + number_of_rows = ( + int_mean_len if number_of_rows is None else number_of_rows + ) + bad_rows = [k for k, v in lengths.items() if v != number_of_rows] raise ValueError( "`BiocFrame` expects all columns in ``data`` to be of equal" - f"length, these columns do not: {', '.join(incorrect_len_keys)}." + f"length, but these are not: {bad_rows}." ) + else: + number_of_rows = int_mean_len if row_names is not None: if not validate_unique_list(row_names): raise ValueError("`row_names` must be unique!") - if number_of_rows is None: - number_of_rows = len(row_names) - else: - if len(row_names) != number_of_rows: - raise ValueError( - "Length of `row_names` and `number_of_rows` do not match, " - f"l{len(row_names)} != {number_of_rows}" - ) + if len(row_names) != number_of_rows: + raise ValueError( + "Length of `row_names` and `number_of_rows` do not match, " + f"l{len(row_names)} != {number_of_rows}" + ) return number_of_rows def validate_cols( - column_names: Sequence[str], - data: MutableMapping[str, Union[Sequence, MutableMapping]], -) -> Tuple[List[str], Dict[str, Union[Sequence, MutableMapping]]]: + column_names: Optional[List[str]] = None, + data: DataType = {}, +) -> Tuple[List[str], DataType]: """Validate columns of a :py:class:`biocframe.BiocFrame` object. Args: - column_names (Sequence[str], optional): Column names, if not provided, + column_names (List[str], optional): Column names, if not provided, its automatically inferred from data. Defaults to None. - data (MutableMapping[str, Union[Sequence, MutableMapping]], optional): + data (MutableMapping[str, Union[List, MutableMapping]], optional): a dictionary of columns and their values. all columns must have the - same length. Defaults to {}. Defaults to {}. + same length. Defaults to {}. Raises: ValueError: When ``column_names`` do not match the keys from ``data``. TypeError: Incorrect column type. Returns: - Sequence[str]: List of columns names. + List[str]: List of columns names. """ if column_names is None: column_names = list(data.keys()) else: if len(column_names) != len(data.keys()): raise ValueError( - "Number of columns mismatch between `column_names` and `data`." + "Mismatch in number of columns between 'column_names' and " + "'data`'." ) if len(set(column_names).difference(data.keys())) > 0: @@ -97,19 +101,13 @@ def validate_cols( "Not all columns from `data` are present in `column_names`." ) - # Technically should throw an error but - # lets just fix it - # column names and dict order should be the same - incorrect_types = [] - new_odata = OrderedDict() - for k in column_names: - # check for types - col_value = data[k] - - if not (hasattr(col_value, "__len__") and hasattr(col_value, "__getitem__")): - incorrect_types.append(k) - - new_odata[k] = data[k] + # Technically should throw an error but lets just fix it column names and + # dict order should be the same + incorrect_types: List[str] = [ + k + for k, v in data.items() + if not (hasattr(v, "__len__") and hasattr(v, "__getitem__")) + ] if len(incorrect_types) > 0: raise TypeError( @@ -118,16 +116,14 @@ def validate_cols( f"{', '.join(incorrect_types)}." ) - data = new_odata - return column_names, data -def validate_unique_list(values: Sequence) -> bool: +def validate_unique_list(values: List[Any]) -> bool: """Validate if ``values`` contains unique values. Args: - values (Sequence): List to check. + values (List): List to check. Returns: bool: `True` if all values are unique else False. diff --git a/src/biocframe/io/from_pandas.py b/src/biocframe/io/from_pandas.py index 644d429..665c956 100644 --- a/src/biocframe/io/from_pandas.py +++ b/src/biocframe/io/from_pandas.py @@ -1,3 +1,5 @@ +from pandas import DataFrame + from ..BiocFrame import BiocFrame __author__ = "jkanche" @@ -17,7 +19,6 @@ def from_pandas(input: "DataFrame") -> BiocFrame: Returns: BiocFrame: A :py:class:`~biocframe.BiocFrame.BiocFrame` object. """ - from pandas import DataFrame if not isinstance(input, DataFrame): @@ -29,4 +30,6 @@ def from_pandas(input: "DataFrame") -> BiocFrame: if input.index is not None: rindex = input.index.to_list() - return BiocFrame(data=rdata, row_names=rindex, column_names=input.columns.to_list()) + return BiocFrame( + data=rdata, row_names=rindex, column_names=input.columns.to_list() + ) diff --git a/src/biocframe/types.py b/src/biocframe/types.py index 5686a50..44889cf 100644 --- a/src/biocframe/types.py +++ b/src/biocframe/types.py @@ -1,8 +1,43 @@ -from typing import Optional, Sequence, Tuple, Union +"""Custom types for biocframe.""" + +from typing import Any, Dict, List, Protocol, Tuple, Union, runtime_checkable __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" -SlicerTypes = Union[Sequence[int], Sequence[bool], Sequence[str], slice, int, str] -SlicerArgTypes = Union[Sequence[str], Tuple[SlicerTypes, Optional[SlicerTypes]]] +SimpleSlice = Union[slice, List[int]] +AtomicSlice = Union[int, bool, str] +RangeSlice = Union[ + List[AtomicSlice], + slice, + Tuple[Union[AtomicSlice, slice], Union[AtomicSlice, slice, None]], +] +AllSlice = Union[RangeSlice, AtomicSlice] + + +@runtime_checkable +class BiocSeq(Protocol): + """The protocol for data types.""" + + @property + def shape(self) -> List[int]: + """Return the shape of the data.""" + ... + + def __getitem__(self, __key: Any) -> Any: + """Slice the data.""" + ... + + def __len__(self) -> int: + """Return the length of the data.""" + ... + + +ColType = Union[Dict[str, Any], List[Any], BiocSeq] +DataType = Union[ + Dict[str, ColType], + Dict[str, Dict[str, Any]], + Dict[str, List[Any]], + Dict[str, BiocSeq], +] diff --git a/src/biocframe/utils.py b/src/biocframe/utils.py index 7c578da..a49e14d 100644 --- a/src/biocframe/utils.py +++ b/src/biocframe/utils.py @@ -1,29 +1,31 @@ -from typing import Any, List, Sequence, Tuple, Union -from warnings import warn +"""Utility functions for biocframe.""" +from typing import Any, List, Tuple, cast +from warnings import warn from ._type_checks import is_list_of_type -from .types import SlicerTypes +from .types import AllSlice, SimpleSlice __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" -def _match_to_indices( - data: Sequence, query: SlicerTypes -) -> Tuple[Union[slice, List[int]], bool]: +def match_to_indices( + data: List[Any], query: AllSlice +) -> Tuple[SimpleSlice, bool]: """Utility function to make slicer arguments more palatable. Args: - data (Sequence): Input data array to slice. - query (SlicerTypes): Either a slice or - a list of indices to keep. + data (List): Input data array to slice. + query (SlicerTypes): Either a slice or a list of int indices to keep. Returns: - Tuple[Union[slice, List[int]], bool]: Resolved list of indices and if its a unary slice. + SlicerTypes: + Resolved list of indices. + bool: + `True` if its a unary slice. """ - resolved_indices = None is_unary = False @@ -38,16 +40,18 @@ def _match_to_indices( elif isinstance(query, slice): # resolved_indices = list(range(len(data))[query]) resolved_indices = query - elif isinstance(query, list) or isinstance(query, tuple): + else: if is_list_of_type(query, bool): if len(query) != len(data): warn( "`indices` is a boolean vector, length should match the size of the data." ) - resolved_indices = [i for i in range(len(query)) if query[i] is True] + resolved_indices = [ + i for i in range(len(query)) if query[i] is True + ] elif is_list_of_type(query, int): - resolved_indices = query + resolved_indices = cast(List[int], query) elif is_list_of_type(query, str): diff = list(set(query).difference(set(data))) if len(diff) > 0: @@ -58,17 +62,27 @@ def _match_to_indices( resolved_indices = [data.index(i) for i in query] else: raise TypeError("`indices` is a list of unsupported types!") - else: - raise TypeError("`indices` is unsupported!") return resolved_indices, is_unary -def _slice_or_index(data: Any, query: Union[slice, List[int]]): - sliced_data = None +def slice_or_index(data: Any, query: SimpleSlice) -> List[Any]: + """Utility function to slice or index data. + + Args: + data (Any): Input data array to slice. + query (BasicSlice): Either a `slice` or a list of int indices to keep. + + Returns: + List[Any]: The sliced data. + + Raises: + TypeError: If the query is not a slice or a list. + """ + sliced_data: List[Any] if isinstance(query, slice): sliced_data = data[query] - elif isinstance(query, list): + else: if not isinstance(data, list): try: return data[query] @@ -76,7 +90,5 @@ def _slice_or_index(data: Any, query: Union[slice, List[int]]): pass sliced_data = [data[i] for i in query] - else: - raise TypeError("Cannot match column indices to a known operation!") return sliced_data diff --git a/tests/test_utils.py b/tests/test_utils.py index 907308e..29db24d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,4 @@ -from biocframe.utils import _match_to_indices +from biocframe.utils import match_to_indices __author__ = "jkanche" __copyright__ = "jkanche" @@ -8,12 +8,12 @@ def test_match_to_indices(): obj = ["b", "n", "m"] - sliced_ind, is_unary = _match_to_indices(obj, query=[0, 2]) + sliced_ind, is_unary = match_to_indices(obj, query=[0, 2]) assert sliced_ind is not None assert len(sliced_ind) == 2 assert sliced_ind == [0, 2] - sliced_ind, is_unary = _match_to_indices(obj, query=["b", "n"]) + sliced_ind, is_unary = match_to_indices(obj, query=["b", "n"]) assert sliced_ind is not None assert sliced_ind == [0, 1] assert len(sliced_ind) == 2 From cc2d38bb0adbfcfaa6c91835d7f54d5b39de11ee Mon Sep 17 00:00:00 2001 From: Max Hargreaves Date: Wed, 20 Sep 2023 10:51:18 -0700 Subject: [PATCH 2/6] Change: finished changes --- src/biocframe/BiocFrame.py | 102 +++++++++++++++++-------------------- src/biocframe/types.py | 34 +++++++++---- 2 files changed, 72 insertions(+), 64 deletions(-) diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/BiocFrame.py index 79ffa9f..a5609dc 100644 --- a/src/biocframe/BiocFrame.py +++ b/src/biocframe/BiocFrame.py @@ -19,10 +19,12 @@ from .types import ( AllSlice, AtomicSlice, - BiocSeq, + BiocCol, + ColSlice, ColType, DataType, RangeSlice, + RowSlice, SimpleSlice, ) from .utils import match_to_indices, slice_or_index @@ -173,40 +175,39 @@ def __repr__(self) -> str: table = PrettyTable(padding_width=1) table.field_names = [str(col) for col in self._column_names] - _rows = [] - rows_to_show = 2 - _top = self.shape[0] - if _top > rows_to_show: - _top = rows_to_show + num_rows = self.shape[0] + # maximum number of top and bottom rows to show + max_shown_rows = 3 - # top three rows - for r in range(_top): - _row = self.row(r) - vals = list(_row.values()) - res = [str(v) for v in vals] - _rows.append(res) + max_top_row = max_shown_rows if num_rows > max_shown_rows else num_rows - if self.shape[0] > 2 * rows_to_show: - # add ... - _rows.append(["..." for _ in range(len(self._column_names))]) + min_last_row = num_rows - max_shown_rows + if min_last_row <= 0: + min_last_row = None + elif min_last_row < max_top_row: + min_last_row = max_top_row - _last = self.shape[0] - _top - if _last <= rows_to_show: - _last = self.shape[0] - _top + rows: List[List[str]] = [] - # last three rows - for r in range(_last + 1, len(self)): - _row = self.row(r) - vals = list(_row.values()) - res = [str(v) for v in vals] - _rows.append(res) + # up to top three rows + for r in range(max_top_row): + rows.append([str(val) for val in self.row(r).values()]) - table.add_rows(_rows) + if min_last_row is not None: + if num_rows > (max_shown_rows * 2): + # add ... to the middle row + rows.append(["..." for _ in range(len(self._column_names))]) + + # up to last three rows + for r in range(min_last_row, num_rows): + rows.append([str(val) for val in self.row(r).values()]) + + table.add_rows(rows) # type: ignore pattern = ( - f"BiocFrame with {self.dims[0]} rows & {self.dims[1]} columns \n" - f"contains row names?: {self.row_names is not None} \n" - f"{table.get_string()}" + f"BiocFrame with {num_rows} rows & {self.dims[1]} columns \n" + f"with row names: {self.row_names is not None} \n" + f"{table.get_string()}" # type: ignore ) return pattern @@ -326,24 +327,16 @@ def has_column(self, name: str) -> bool: @overload def _slice( self, - row_indices_or_names: Optional[AtomicSlice], - column_indices_or_names: Optional[AtomicSlice], + row_indices_or_names: AtomicSlice, + column_indices_or_names: None, ) -> Dict[str, Any]: ... @overload def _slice( self, - row_indices_or_names: Optional[RangeSlice], - column_indices_or_names: Optional[RangeSlice], - ) -> ItemType: - ... - - @overload - def _slice( - self, - row_indices_or_names: Union[AtomicSlice, slice], - column_indices_or_names: Union[AtomicSlice, slice], + row_indices_or_names: Optional[AllSlice], + column_indices_or_names: Union[AllSlice, None], ) -> ItemType: ... @@ -401,7 +394,7 @@ def _slice( new_number_of_rows = len(new_row_names) for k, v in new_data.items(): - if isinstance(v, BiocSeq): + if isinstance(v, BiocCol): tmp: List[SimpleSlice] = [slice(None)] * len(v.shape) tmp[0] = new_row_indices new_data[k] = v[(*tmp,)] @@ -433,11 +426,13 @@ def _slice( ) @overload - def __getitem__(self, __key: AtomicSlice) -> Dict[str, Any]: + def __getitem__(self, __key: Union[RangeSlice, ColSlice]) -> ItemType: ... @overload - def __getitem__(self, __key: RangeSlice) -> ItemType: + def __getitem__( + self, __key: Union[AtomicSlice, RowSlice] + ) -> Dict[str, Any]: ... # TODO: implement in-place or views @@ -525,24 +520,21 @@ def __getitem__(self, __key: AllSlice) -> ItemType: # column names if everything is a string if is_list_of_type(__key, str): return self._slice(None, __key) - elif is_list_of_type(__key, int): + + if is_list_of_type(__key, int): return self._slice(__key, None) - elif is_list_of_type(__key, bool): + + if is_list_of_type(__key, bool): return self._slice(__key, None) - else: - raise ValueError("`args` is not supported.") - # tuple - if len(__key) == 0: - raise ValueError("`args` must contain at least one slice.") + raise ValueError("`args` is not supported.") - if len(__key) == 1: - return self._slice(__key[0], None) - elif len(__key) == 2: - return self._slice(__key[0], __key[1]) - else: + # tuple of two elements + if len(__key) != 2: raise ValueError("Length of `args` is more than 2.") + return self._slice(__key[0], __key[1]) + def column(self, index_or_name: AtomicSlice) -> ItemType: """Access a column by integer position or column label. diff --git a/src/biocframe/types.py b/src/biocframe/types.py index 44889cf..ebf5619 100644 --- a/src/biocframe/types.py +++ b/src/biocframe/types.py @@ -1,23 +1,39 @@ """Custom types for biocframe.""" -from typing import Any, Dict, List, Protocol, Tuple, Union, runtime_checkable +from typing import ( + Any, + Dict, + List, + Protocol, + Sequence, + Tuple, + Union, + runtime_checkable, +) __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" -SimpleSlice = Union[slice, List[int]] -AtomicSlice = Union[int, bool, str] +SimpleSlice = Union[slice, Sequence[int]] + +AtomicSlice = Union[int, str] +ListSlice = List[Union[AtomicSlice, bool]] RangeSlice = Union[ - List[AtomicSlice], + ListSlice, slice, - Tuple[Union[AtomicSlice, slice], Union[AtomicSlice, slice, None]], + Tuple[ + Union[ListSlice, slice], + Union[ListSlice, slice, None], + ], ] -AllSlice = Union[RangeSlice, AtomicSlice] +RowSlice = Tuple[AtomicSlice, "AllSlice"] +ColSlice = Tuple["AllSlice", AtomicSlice] +AllSlice = Union[RangeSlice, AtomicSlice, RowSlice, ColSlice] @runtime_checkable -class BiocSeq(Protocol): +class BiocCol(Protocol): """The protocol for data types.""" @property @@ -34,10 +50,10 @@ def __len__(self) -> int: ... -ColType = Union[Dict[str, Any], List[Any], BiocSeq] +ColType = Union[Dict[str, Any], List[Any], BiocCol] DataType = Union[ Dict[str, ColType], Dict[str, Dict[str, Any]], Dict[str, List[Any]], - Dict[str, BiocSeq], + Dict[str, BiocCol], ] From c3b3c14dbb827f81f8d654a9d86da13fee65dc17 Mon Sep 17 00:00:00 2001 From: Max Hargreaves Date: Wed, 20 Sep 2023 13:11:43 -0700 Subject: [PATCH 3/6] Change: final-ish --- setup.cfg | 2 +- src/biocframe/BiocFrame.py | 69 ++++++++++++++------------------- src/biocframe/__init__.py | 1 + src/biocframe/_validators.py | 14 +++++-- src/biocframe/io/from_pandas.py | 30 +++++++------- src/biocframe/types.py | 28 +++++++------ src/biocframe/utils.py | 25 +++++++++++- tests/test_initialize.py | 14 +++++-- tests/test_methods.py | 19 +++++++-- tests/test_readme.py | 14 +++++-- tests/test_utils.py | 4 +- 11 files changed, 129 insertions(+), 91 deletions(-) diff --git a/setup.cfg b/setup.cfg index b8fa774..ef9a41b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,6 @@ python_requires = >=3.8 install_requires = importlib-metadata; python_version<"3.8" prettytable - pandas [options.packages.find] where = src @@ -69,6 +68,7 @@ testing = pytest pytest-cov numpy + pandas [options.entry_points] # Add here console scripts like: diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/BiocFrame.py index a5609dc..7bea937 100644 --- a/src/biocframe/BiocFrame.py +++ b/src/biocframe/BiocFrame.py @@ -23,14 +23,15 @@ ColSlice, ColType, DataType, - RangeSlice, RowSlice, + SeqSlice, SimpleSlice, + TupleSlice, ) from .utils import match_to_indices, slice_or_index try: - from pandas import DataFrame, RangeIndex + from pandas import DataFrame except Exception: pass @@ -149,27 +150,6 @@ def __init__( self._number_of_columns = len(self._column_names) self._metadata = metadata - @classmethod - def from_pandas(cls, df: "DataFrame") -> "BiocFrame": - """Read a :py:class:`~biocframe.BiocFrame.BiocFrame` from :py:class:`~pandas.DataFrame` object. - - Args: - df (:py:class:`~pandas.DataFrame`): Input data. - - Raises: - TypeError: If ``input`` is not a :py:class:`~pandas.DataFrame`. - - Returns: - BiocFrame: A :py:class:`~biocframe.BiocFrame.BiocFrame` object. - """ - r_data: Dict[str, List[Any]] = df.to_dict("list") # type: ignore - r_index: Optional[List[str]] = None - - if df.index is not RangeIndex: # type: ignore - r_index = df.index.to_list() # type: ignore - - return BiocFrame(data=r_data, row_names=r_index) - def __repr__(self) -> str: """Get a machine-readable string representation of the object.""" table = PrettyTable(padding_width=1) @@ -426,7 +406,7 @@ def _slice( ) @overload - def __getitem__(self, __key: Union[RangeSlice, ColSlice]) -> ItemType: + def __getitem__(self, __key: Union[SeqSlice, slice, ColSlice]) -> ItemType: ... @overload @@ -435,6 +415,10 @@ def __getitem__( ) -> Dict[str, Any]: ... + @overload + def __getitem__(self, __key: TupleSlice) -> "BiocFrame": + ... + # TODO: implement in-place or views def __getitem__(self, __key: AllSlice) -> ItemType: """Subset the data frame. @@ -576,7 +560,7 @@ def row(self, index_or_name: AtomicSlice) -> Dict[str, Any]: return self[index_or_name, :] # TODO: implement in-place or views - def __setitem__(self, name: str, value: ColType) -> None: + def __setitem__(self, __key: str, __value: ColType) -> None: """Add or re-assign a value to a column. Usage: @@ -604,18 +588,18 @@ def __setitem__(self, name: str, value: ColType) -> None: Raises: ValueError: If length of ``value`` does not match the number of rows. """ - if len(value) != self.shape[0]: + if len(__value) != self.shape[0]: raise ValueError( "Length of `value`, does not match the number of the rows," - f"need to be {self.shape[0]} but provided {len(value)}." + f"need to be {self.shape[0]} but provided {len(__value)}." ) - if name not in self.column_names: - self._column_names.append(name) + if __key not in self.column_names: + self._column_names.append(__key) self._number_of_columns += 1 # Dunno how to fix this one... - self._data[name] = value # type: ignore + self._data[__key] = __value # type: ignore # TODO: implement in-place or view def __delitem__(self, name: str): @@ -648,7 +632,11 @@ def __delitem__(self, name: str): if name not in self.column_names: raise ValueError(f"Column: '{name}' does not exist.") - del self._data[name] + try: + del self._data[name] # type: ignore + except Exception: + self._data = {k: v for k, v in self._data.items() if k != name} + self._column_names.remove(name) self._number_of_columns -= 1 @@ -670,15 +658,16 @@ def to_pandas(self) -> "DataFrame": Returns: DataFrame: a :py:class:`~pandas.DataFrame` object. """ - from pandas import DataFrame - return DataFrame( data=self._data, index=self._row_names, columns=self._column_names ) # TODO: very primitive implementation, needs very robust testing # TODO: implement in-place, view - def __array_ufunc__(self, func, method, *inputs, **kwargs) -> "BiocFrame": + + def __array_ufunc__( + self, ufunc: Any, method: str, *inputs: Any, **kwargs: Any + ) -> "BiocFrame": """Interface with NumPy array methods. Usage: @@ -696,16 +685,16 @@ def __array_ufunc__(self, func, method, *inputs, **kwargs) -> "BiocFrame": """ from pandas import Series - input = inputs[0] - if not isinstance(input, BiocFrame): + _input = inputs[0] + if not isinstance(_input, BiocFrame): raise TypeError("Input is not a `BiocFrame` object.") for col in self.column_names: - if is_numeric_dtype(Series(input.column(col))): - new_col = getattr(func, method)(input.column(col), **kwargs) - input[col] = new_col + if is_numeric_dtype(Series(_input.column(col))): # type: ignore + new_col = getattr(ufunc, method)(_input.column(col), **kwargs) + _input[col] = new_col - return input + return _input ########################################################################### # compatibility with Pandas diff --git a/src/biocframe/__init__.py b/src/biocframe/__init__.py index 08f9351..4d7b5f2 100644 --- a/src/biocframe/__init__.py +++ b/src/biocframe/__init__.py @@ -19,3 +19,4 @@ del version, PackageNotFoundError from .BiocFrame import BiocFrame as BiocFrame +from .io import from_pandas as from_pandas diff --git a/src/biocframe/_validators.py b/src/biocframe/_validators.py index b08efae..98e5cd5 100644 --- a/src/biocframe/_validators.py +++ b/src/biocframe/_validators.py @@ -32,16 +32,22 @@ def validate_rows( int: Validated number of rows in ``data``. """ lengths = {k: len(v) for k, v in data.items()} - mean_len = sum(lengths.values()) / len(lengths.values()) + mean_len = ( + sum(lengths.values()) / len(lengths.values()) + if len(lengths) > 0 + else 0 + ) int_mean_len = int(mean_len) - if int_mean_len != mean_len or ( + if int_mean_len == 0: + number_of_rows = number_of_rows if number_of_rows is not None else 0 + elif int_mean_len != mean_len or ( number_of_rows is not None and int_mean_len != number_of_rows ): - number_of_rows = ( + expected_num_rows = ( int_mean_len if number_of_rows is None else number_of_rows ) - bad_rows = [k for k, v in lengths.items() if v != number_of_rows] + bad_rows = [k for k, v in lengths.items() if v != expected_num_rows] raise ValueError( "`BiocFrame` expects all columns in ``data`` to be of equal" f"length, but these are not: {bad_rows}." diff --git a/src/biocframe/io/from_pandas.py b/src/biocframe/io/from_pandas.py index 665c956..2b13640 100644 --- a/src/biocframe/io/from_pandas.py +++ b/src/biocframe/io/from_pandas.py @@ -1,17 +1,24 @@ -from pandas import DataFrame +"""A function for converting from pandas.DataFrame to BiocFrame.""" + +from typing import Any, Dict, List, Optional from ..BiocFrame import BiocFrame +try: + from pandas import DataFrame, RangeIndex +except Exception: + pass + __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" -def from_pandas(input: "DataFrame") -> BiocFrame: +def from_pandas(df: "DataFrame") -> "BiocFrame": """Read a :py:class:`~biocframe.BiocFrame.BiocFrame` from :py:class:`~pandas.DataFrame` object. Args: - input (:py:class:`~pandas.DataFrame`): Input data. + df (:py:class:`~pandas.DataFrame`): Input data. Raises: TypeError: If ``input`` is not a :py:class:`~pandas.DataFrame`. @@ -19,17 +26,10 @@ def from_pandas(input: "DataFrame") -> BiocFrame: Returns: BiocFrame: A :py:class:`~biocframe.BiocFrame.BiocFrame` object. """ - from pandas import DataFrame - - if not isinstance(input, DataFrame): - raise TypeError("data is not a pandas `DataFrame` object.") - - rdata = input.to_dict("list") - rindex = None + r_data: Dict[str, List[Any]] = df.to_dict("list") # type: ignore + r_index: Optional[List[str]] = None - if input.index is not None: - rindex = input.index.to_list() + if df.index is not RangeIndex: # type: ignore + r_index = df.index.to_list() # type: ignore - return BiocFrame( - data=rdata, row_names=rindex, column_names=input.columns.to_list() - ) + return BiocFrame(data=r_data, row_names=r_index) diff --git a/src/biocframe/types.py b/src/biocframe/types.py index ebf5619..11e4ef6 100644 --- a/src/biocframe/types.py +++ b/src/biocframe/types.py @@ -2,8 +2,8 @@ from typing import ( Any, - Dict, List, + Mapping, Protocol, Sequence, Tuple, @@ -18,15 +18,12 @@ SimpleSlice = Union[slice, Sequence[int]] AtomicSlice = Union[int, str] -ListSlice = List[Union[AtomicSlice, bool]] -RangeSlice = Union[ - ListSlice, - slice, - Tuple[ - Union[ListSlice, slice], - Union[ListSlice, slice, None], - ], +SeqSlice = Sequence[Union[AtomicSlice, bool]] +TupleSlice = Tuple[ + Union[SeqSlice, slice], + Union[SeqSlice, slice, None], ] +RangeSlice = Union[SeqSlice, slice, TupleSlice] RowSlice = Tuple[AtomicSlice, "AllSlice"] ColSlice = Tuple["AllSlice", AtomicSlice] AllSlice = Union[RangeSlice, AtomicSlice, RowSlice, ColSlice] @@ -37,7 +34,7 @@ class BiocCol(Protocol): """The protocol for data types.""" @property - def shape(self) -> List[int]: + def shape(self) -> Sequence[int]: """Return the shape of the data.""" ... @@ -50,10 +47,11 @@ def __len__(self) -> int: ... -ColType = Union[Dict[str, Any], List[Any], BiocCol] +# Mapping is necessary as it is covariant which MutableMapping, etc. are not. +ColType = Union[Mapping[str, Any], List[Any], BiocCol] DataType = Union[ - Dict[str, ColType], - Dict[str, Dict[str, Any]], - Dict[str, List[Any]], - Dict[str, BiocCol], + Mapping[str, ColType], + Mapping[str, Mapping[str, Any]], + Mapping[str, List[Any]], + Mapping[str, BiocCol], ] diff --git a/src/biocframe/utils.py b/src/biocframe/utils.py index a49e14d..1d20123 100644 --- a/src/biocframe/utils.py +++ b/src/biocframe/utils.py @@ -1,16 +1,37 @@ """Utility functions for biocframe.""" -from typing import Any, List, Tuple, cast +from typing import Any, List, Sequence, Tuple, Union, cast, overload from warnings import warn from ._type_checks import is_list_of_type -from .types import AllSlice, SimpleSlice +from .types import ( + AllSlice, + AtomicSlice, + ColSlice, + RowSlice, + SeqSlice, + SimpleSlice, + TupleSlice, +) __author__ = "jkanche" __copyright__ = "jkanche" __license__ = "MIT" +@overload +def match_to_indices(data: List[Any], query: slice) -> Tuple[slice, bool]: + ... + + +@overload +def match_to_indices( + data: List[Any], + query: Union[SeqSlice, TupleSlice, AtomicSlice, RowSlice, ColSlice], +) -> Tuple[Sequence[int], bool]: + ... + + def match_to_indices( data: List[Any], query: AllSlice ) -> Tuple[SimpleSlice, bool]: diff --git a/tests/test_initialize.py b/tests/test_initialize.py index 3c9c5d4..264d721 100644 --- a/tests/test_initialize.py +++ b/tests/test_initialize.py @@ -2,7 +2,7 @@ import pytest import biocframe -from biocframe.BiocFrame import BiocFrame +from biocframe import BiocFrame, from_pandas __author__ = "jkanche" __copyright__ = "jkanche" @@ -42,7 +42,10 @@ def test_initialize_pandas(): { "ncol1": [4, 5, 6], "ncol2": ["a", "b", "c"], - "deep": {"dcol1": ["j", "k", "l"], "dcol2": ["a", "s", "l"]}, + "deep": { + "dcol1": ["j", "k", "l"], + "dcol2": ["a", "s", "l"], + }, }, { "ncol2": ["a"], @@ -57,7 +60,7 @@ def test_initialize_pandas(): } ) - bframe = biocframe.from_pandas(df_gr) + bframe = from_pandas(df_gr) assert bframe is not None @@ -80,7 +83,10 @@ def test_should_fail(): { "ncol1": [4, 5, 6], "ncol2": ["a", "b", "c"], - "deep": {"dcol1": ["j", "k", "l"], "dcol2": ["a", "s", "l"]}, + "deep": { + "dcol1": ["j", "k", "l"], + "dcol2": ["a", "s", "l"], + }, }, { "ncol2": ["a"], diff --git a/tests/test_methods.py b/tests/test_methods.py index 1447bab..da74481 100644 --- a/tests/test_methods.py +++ b/tests/test_methods.py @@ -200,7 +200,10 @@ def test_bframe_slice(): assert slice is not None assert len(slice.column_names) == 2 - assert len(list(set(slice.column_names).difference(["column1", "nested"]))) == 0 + assert ( + len(list(set(slice.column_names).difference(["column1", "nested"]))) + == 0 + ) assert len(slice.dims) == 2 assert slice.dims == (2, 2) @@ -210,7 +213,12 @@ def test_bframe_slice(): assert sliced_list is not None assert len(sliced_list.column_names) == 2 assert ( - len(list(set(sliced_list.column_names).difference(["column1", "nested"]))) == 0 + len( + list( + set(sliced_list.column_names).difference(["column1", "nested"]) + ) + ) + == 0 ) assert len(sliced_list.dims) == 2 @@ -336,7 +344,10 @@ def test_nested_biocFrame_slice(): assert slice is not None assert len(slice.column_names) == 2 - assert len(list(set(slice.column_names).difference(["column1", "nested"]))) == 0 + assert ( + len(list(set(slice.column_names).difference(["column1", "nested"]))) + == 0 + ) assert len(slice.dims) == 2 assert slice.dims == (2, 2) @@ -372,7 +383,7 @@ def test_bframe_iter(): assert bframe is not None iterCount = 0 - for k, v in bframe: + for _ in bframe: iterCount += 1 assert iterCount == bframe.dims[0] diff --git a/tests/test_readme.py b/tests/test_readme.py index 067db10..002d8de 100644 --- a/tests/test_readme.py +++ b/tests/test_readme.py @@ -22,10 +22,10 @@ def test_bframe(): "chr3", ] * 20, - "starts": range(100, 300), - "ends": range(110, 310), + "starts": list(range(100, 300)), + "ends": list(range(110, 310)), "strand": ["-", "+", "+", "*", "*", "+", "+", "+", "-", "-"] * 20, - "score": range(0, 200), + "score": list(range(0, 200)), "GC": [random() for _ in range(10)] * 20, } @@ -55,6 +55,12 @@ def test_bframe(): assert sliced_df is not None assert sliced_df.dims == (4, 3) assert ( - len(list(set(sliced_df.column_names).difference(["end", "strands", "scores"]))) + len( + list( + set(sliced_df.column_names).difference( + ["end", "strands", "scores"] + ) + ) + ) == 0 ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 29db24d..88e43ff 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,12 +8,12 @@ def test_match_to_indices(): obj = ["b", "n", "m"] - sliced_ind, is_unary = match_to_indices(obj, query=[0, 2]) + sliced_ind, _ = match_to_indices(obj, query=[0, 2]) assert sliced_ind is not None assert len(sliced_ind) == 2 assert sliced_ind == [0, 2] - sliced_ind, is_unary = match_to_indices(obj, query=["b", "n"]) + sliced_ind, _ = match_to_indices(obj, query=["b", "n"]) assert sliced_ind is not None assert sliced_ind == [0, 1] assert len(sliced_ind) == 2 From 9e6d4e47c7f7ab6bad249dae068f767cc9ebb190 Mon Sep 17 00:00:00 2001 From: Max Hargreaves Date: Wed, 20 Sep 2023 13:29:51 -0700 Subject: [PATCH 4/6] Fix: metadata --- src/biocframe/BiocFrame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/BiocFrame.py index 79e1640..9619848 100644 --- a/src/biocframe/BiocFrame.py +++ b/src/biocframe/BiocFrame.py @@ -153,7 +153,7 @@ def __init__( column_names, self._data ) self._number_of_columns = len(self._column_names) - self._metadata = metadata + self._metadata = {} if metadata is None else metadata def __repr__(self) -> str: """Get a machine-readable string representation of the object.""" @@ -284,7 +284,7 @@ def column_names(self, names: List[str]) -> None: } @property - def metadata(self) -> Optional[Dict[str, Any]]: + def metadata(self) -> Dict[str, Any]: """Get/set the metadata. Args: @@ -296,7 +296,7 @@ def metadata(self) -> Optional[Dict[str, Any]]: return self._metadata @metadata.setter - def metadata(self, metadata: Optional[Dict[str, Any]]): + def metadata(self, metadata: Dict[str, Any]): self._metadata = metadata def has_column(self, name: str) -> bool: From f7140991ab36454d3d61814c02eca3942762fde4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 20:59:56 +0000 Subject: [PATCH 5/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- setup.py | 2 +- src/biocframe/BiocFrame.py | 12 +++--------- src/biocframe/_validators.py | 13 +++---------- src/biocframe/utils.py | 8 ++------ tests/test_methods.py | 17 +++-------------- tests/test_readme.py | 8 +------- 6 files changed, 13 insertions(+), 47 deletions(-) diff --git a/setup.py b/setup.py index 79c4d8d..f627d76 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ if __name__ == "__main__": try: setup(use_scm_version={"version_scheme": "no-guess-dev"}) - except: # noqa + except: print( "\n\nAn error occurred while building the project, " "please ensure you have the most updated version of setuptools, " diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/BiocFrame.py index 9619848..4fab415 100644 --- a/src/biocframe/BiocFrame.py +++ b/src/biocframe/BiocFrame.py @@ -149,9 +149,7 @@ def __init__( self._number_of_rows = validate_rows( self._data, number_of_rows, self._row_names ) - self._column_names, self._data = validate_cols( - column_names, self._data - ) + self._column_names, self._data = validate_cols(column_names, self._data) self._number_of_columns = len(self._column_names) self._metadata = {} if metadata is None else metadata @@ -279,9 +277,7 @@ def column_names(self, names: List[str]) -> None: raise ValueError("Column names must be unique!") self._column_names = names - self._data = { - names[i]: v for i, (_, v) in enumerate(self.data.items()) - } + self._data = {names[i]: v for i, (_, v) in enumerate(self.data.items())} @property def metadata(self) -> Dict[str, Any]: @@ -414,9 +410,7 @@ def __getitem__(self, __key: Union[SeqSlice, slice, ColSlice]) -> ItemType: ... @overload - def __getitem__( - self, __key: Union[AtomicSlice, RowSlice] - ) -> Dict[str, Any]: + def __getitem__(self, __key: Union[AtomicSlice, RowSlice]) -> Dict[str, Any]: ... @overload diff --git a/src/biocframe/_validators.py b/src/biocframe/_validators.py index 98e5cd5..9c25c6a 100644 --- a/src/biocframe/_validators.py +++ b/src/biocframe/_validators.py @@ -32,11 +32,7 @@ def validate_rows( int: Validated number of rows in ``data``. """ lengths = {k: len(v) for k, v in data.items()} - mean_len = ( - sum(lengths.values()) / len(lengths.values()) - if len(lengths) > 0 - else 0 - ) + mean_len = sum(lengths.values()) / len(lengths.values()) if len(lengths) > 0 else 0 int_mean_len = int(mean_len) if int_mean_len == 0: @@ -44,9 +40,7 @@ def validate_rows( elif int_mean_len != mean_len or ( number_of_rows is not None and int_mean_len != number_of_rows ): - expected_num_rows = ( - int_mean_len if number_of_rows is None else number_of_rows - ) + expected_num_rows = int_mean_len if number_of_rows is None else number_of_rows bad_rows = [k for k, v in lengths.items() if v != expected_num_rows] raise ValueError( "`BiocFrame` expects all columns in ``data`` to be of equal" @@ -93,8 +87,7 @@ def validate_cols( else: if len(column_names) != len(data.keys()): raise ValueError( - "Mismatch in number of columns between 'column_names' and " - "'data`'." + "Mismatch in number of columns between 'column_names' and " "'data`'." ) if len(set(column_names).difference(data.keys())) > 0: diff --git a/src/biocframe/utils.py b/src/biocframe/utils.py index 1d20123..cca16c7 100644 --- a/src/biocframe/utils.py +++ b/src/biocframe/utils.py @@ -32,9 +32,7 @@ def match_to_indices( ... -def match_to_indices( - data: List[Any], query: AllSlice -) -> Tuple[SimpleSlice, bool]: +def match_to_indices(data: List[Any], query: AllSlice) -> Tuple[SimpleSlice, bool]: """Utility function to make slicer arguments more palatable. Args: @@ -68,9 +66,7 @@ def match_to_indices( "`indices` is a boolean vector, length should match the size of the data." ) - resolved_indices = [ - i for i in range(len(query)) if query[i] is True - ] + resolved_indices = [i for i in range(len(query)) if query[i] is True] elif is_list_of_type(query, int): resolved_indices = cast(List[int], query) elif is_list_of_type(query, str): diff --git a/tests/test_methods.py b/tests/test_methods.py index e516dcd..2951b7b 100644 --- a/tests/test_methods.py +++ b/tests/test_methods.py @@ -200,10 +200,7 @@ def test_bframe_slice(): assert slice is not None assert len(slice.column_names) == 2 - assert ( - len(list(set(slice.column_names).difference(["column1", "nested"]))) - == 0 - ) + assert len(list(set(slice.column_names).difference(["column1", "nested"]))) == 0 assert len(slice.dims) == 2 assert slice.dims == (2, 2) @@ -213,12 +210,7 @@ def test_bframe_slice(): assert sliced_list is not None assert len(sliced_list.column_names) == 2 assert ( - len( - list( - set(sliced_list.column_names).difference(["column1", "nested"]) - ) - ) - == 0 + len(list(set(sliced_list.column_names).difference(["column1", "nested"]))) == 0 ) assert len(sliced_list.dims) == 2 @@ -344,10 +336,7 @@ def test_nested_biocFrame_slice(): assert slice is not None assert len(slice.column_names) == 2 - assert ( - len(list(set(slice.column_names).difference(["column1", "nested"]))) - == 0 - ) + assert len(list(set(slice.column_names).difference(["column1", "nested"]))) == 0 assert len(slice.dims) == 2 assert slice.dims == (2, 2) diff --git a/tests/test_readme.py b/tests/test_readme.py index 002d8de..a915fb4 100644 --- a/tests/test_readme.py +++ b/tests/test_readme.py @@ -55,12 +55,6 @@ def test_bframe(): assert sliced_df is not None assert sliced_df.dims == (4, 3) assert ( - len( - list( - set(sliced_df.column_names).difference( - ["end", "strands", "scores"] - ) - ) - ) + len(list(set(sliced_df.column_names).difference(["end", "strands", "scores"]))) == 0 ) From 491a8663f0c6d9de7e6860419ec11570b54cb4ed Mon Sep 17 00:00:00 2001 From: Max Hargreaves Date: Thu, 21 Sep 2023 11:15:50 -0700 Subject: [PATCH 6/6] Changes: need for genomicranges --- src/biocframe/BiocFrame.py | 18 ++++++++++-------- src/biocframe/py.typed | 0 src/biocframe/types.py | 8 ++++++-- 3 files changed, 16 insertions(+), 10 deletions(-) create mode 100644 src/biocframe/py.typed diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/BiocFrame.py index 9619848..54391a3 100644 --- a/src/biocframe/BiocFrame.py +++ b/src/biocframe/BiocFrame.py @@ -146,6 +146,7 @@ def __init__( """ self._data: DataType = {} if data is None else data self._row_names = row_names + self._metadata = {} if metadata is None else metadata self._number_of_rows = validate_rows( self._data, number_of_rows, self._row_names ) @@ -153,10 +154,9 @@ def __init__( column_names, self._data ) self._number_of_columns = len(self._column_names) - self._metadata = {} if metadata is None else metadata - def __repr__(self) -> str: - """Get a machine-readable string representation of the object.""" + def _repr_table(self) -> str: + """Make the pretty table for the __repr__ method..""" table = PrettyTable(padding_width=1) table.field_names = [str(col) for col in self._column_names] @@ -189,14 +189,16 @@ def __repr__(self) -> str: table.add_rows(rows) # type: ignore - pattern = ( - f"BiocFrame with {num_rows} rows & {self.dims[1]} columns \n" + return table.get_string() + + def __repr__(self) -> str: + """Get a machine-readable string representation of the object.""" + return ( + f"BiocFrame with {self.shape[0]} rows & {self.dims[1]} columns \n" f"with row names: {self.row_names is not None} \n" - f"{table.get_string()}" # type: ignore + f"{self._repr_table()}" # type: ignore ) - return pattern - @property def shape(self) -> Tuple[int, int]: """Get shape of the data frame. diff --git a/src/biocframe/py.typed b/src/biocframe/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/biocframe/types.py b/src/biocframe/types.py index 11e4ef6..5fe99ce 100644 --- a/src/biocframe/types.py +++ b/src/biocframe/types.py @@ -46,12 +46,16 @@ def __len__(self) -> int: """Return the length of the data.""" ... + def __iter__(self) -> Any: + """Iterate over the data.""" + ... + # Mapping is necessary as it is covariant which MutableMapping, etc. are not. -ColType = Union[Mapping[str, Any], List[Any], BiocCol] +ColType = Union[Mapping[str, Any], Sequence[Any], BiocCol] DataType = Union[ Mapping[str, ColType], Mapping[str, Mapping[str, Any]], - Mapping[str, List[Any]], + Mapping[str, Sequence[Any]], Mapping[str, BiocCol], ]