From 1fa879c4947b60aec5a8fff19c19fe6337cc85f1 Mon Sep 17 00:00:00 2001 From: Aaron Lun Date: Mon, 30 Oct 2023 18:57:34 -0700 Subject: [PATCH] Improved the __repr__ and __str__ methods for BiocFrames. (#59) * Greatly streamline the print method for BiocFrames. --------- --- AUTHORS.md | 1 + README.md | 91 +++++++++--------- setup.cfg | 3 +- src/biocframe/BiocFrame.py | 184 +++++++++++++++++++++---------------- 4 files changed, 151 insertions(+), 128 deletions(-) diff --git a/AUTHORS.md b/AUTHORS.md index f21a024..76da8b8 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -1,3 +1,4 @@ # Contributors * Jayaram Kancherla [jayaram.kancherla@gmail.com](mailto:jayaram.kancherla@gmail.com) +* Aaron Lun [infinite.monkeys.with.keyboards@gmail.com](mailto:infinite.monkeys.with.keyboards@gmail.com) diff --git a/README.md b/README.md index f800a46..2eb1c21 100644 --- a/README.md +++ b/README.md @@ -44,14 +44,12 @@ print(bframe) ``` ## output - BiocFrame with 3 rows & 2 columns - ┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ - ┃ ensembl ┃ symbol ┃ - ┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ - │ ENS00001 │ MAP1A │ - │ ENS00002 │ BIN1 │ - │ ENS00003 │ ESR1 │ - └────────────────┴───────────────┘ + BiocFrame with 3 rows and 2 columns + ensembl symbol + + [0] ENS00001 MAP1A + [1] ENS00002 BIN1 + [2] ENS00003 ESR1 You can specify complex representations as columns, for example @@ -71,14 +69,12 @@ print(bframe2) ``` ## output - BiocFrame with 3 rows & 3 columns - ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ row_names ┃ ensembl ┃ symbol ┃ ranges ┃ - ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ - │ row1 │ ENS00001 │ MAP1A │ {'chr': 'chr1', 'start': 1000, 'end': 1100} │ - │ row2 │ ENS00002 │ BIN1 │ {'chr': 'chr2', 'start': 1100, 'end': 4000} │ - │ row3 │ ENS00002 │ ESR1 │ {'chr': 'chr3', 'start': 5000, 'end': 5500} │ - └───────────┴────────────────┴───────────────┴─────────────────────────────────────────────┘ + BiocFrame with 3 rows and 3 columns + ensembl symbol ranges + + row1 ENS00001 MAP1A chr1:1000:1100 + row2 ENS00002 BIN1 chr2:1100:4000 + row3 ENS00002 ESR1 chr3:5000:5500 ### Properties @@ -109,14 +105,12 @@ print(bframe) ``` ## output - BiocFrame with 3 rows & 2 columns - ┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓ - ┃ column1 ┃ column2 ┃ - ┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩ - │ ENS00001 │ MAP1A │ - │ ENS00002 │ BIN1 │ - │ ENS00003 │ ESR1 │ - └────────────────┴────────────────┘ + BiocFrame with 3 rows and 2 columns + column1 column2 + + [0] ENS00001 MAP1A + [1] ENS00002 BIN1 + [2] ENS00003 ESR1 To add new columns, @@ -126,14 +120,12 @@ print(bframe) ``` ## output - BiocFrame with 3 rows & 3 columns - ┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ - ┃ column1 ┃ column2 ┃ score ┃ - ┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ - │ ENS00001 │ MAP1A │ 2 │ - │ ENS00002 │ BIN1 │ 3 │ - │ ENS00003 │ ESR1 │ 4 │ - └────────────────┴────────────────┴───────────────┘ + BiocFrame with 3 rows and 3 columns + column1 column2 score + + [0] ENS00001 MAP1A 2 + [1] ENS00002 BIN1 3 + [2] ENS00003 ESR1 4 ### Subset `BiocFrame` @@ -145,12 +137,10 @@ print(sliced) ``` ## output - BiocFrame with 1 row & 1 column - ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓ - ┃ row_names ┃ column1 ┃ - ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩ - │ 1 │ ENS00002 │ - └───────────┴────────────────┘ + BiocFrame with 1 row and 1 column + column1 + + [0] ENS00002 This operation accepts different slice input types, you can either specify a boolean vector, a `slice` object, a list of indices, or row/column names to subset. @@ -179,20 +169,23 @@ combined = combine(bframe1, bframe2) # OR an object oriented approach -combined = bframe.combine(bframe2) +combined = bframe1.combine(bframe2) ``` ## output - BiocFrame with 10 rows & 2 - columns - ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ - ┃ odd ┃ even ┃ - ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ - │ 1 │ 0 │ - │ 3 │ 2 │ - │ ... │ ... │ - │ 99 │ 88 │ - └────────────┴─────────────┘ + BiocFrame with 10 rows and 2 columns + odd even + + [0] 1 0 + [1] 3 2 + [2] 5 4 + [3] 7 6 + [4] 9 8 + [5] 11 0 + [6] 33 22 + [7] 55 44 + [8] 77 66 + [9] 99 88 For more details, check out the BiocFrame class [reference](https://biocpy.github.io/BiocFrame/api/biocframe.html#biocframe.BiocFrame.BiocFrame). diff --git a/setup.cfg b/setup.cfg index bf5e4f9..9ff0c83 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,8 +49,7 @@ python_requires = >=3.8 # For more information, check out https://semver.org/. install_requires = importlib-metadata; python_version<"3.8" - rich - biocgenerics>=0.1.1 + biocgenerics>=0.1.3 biocutils>=0.0.6 [options.packages.find] diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/BiocFrame.py index 0d90f83..7fcda45 100644 --- a/src/biocframe/BiocFrame.py +++ b/src/biocframe/BiocFrame.py @@ -1,5 +1,5 @@ from collections import OrderedDict -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union, Sequence from warnings import warn from biocgenerics.colnames import colnames as colnames_generic @@ -9,7 +9,8 @@ from biocgenerics.combine_rows import combine_rows from biocgenerics.rownames import rownames as rownames_generic from biocgenerics.rownames import set_rownames -from biocutils import is_list_of_type, normalize_subscript +from biocgenerics import show_as_cell, format_table +from biocutils import is_list_of_type, normalize_subscript, print_truncated_list from ._validators import validate_cols, validate_rows, validate_unique_list from .Factor import Factor @@ -201,89 +202,101 @@ def _validate(self): self._number_of_rows = 0 def __repr__(self) -> str: - if self.row_names is None: - if self.shape[0] == 0: - return f"Empty BiocFrame with no rows & {self.shape[1]} column{'s' if self.shape[1] != 1 else ''}." + output = "BiocFrame(data={" + data_blobs = [] + for k, v in self._data.items(): + if isinstance(v, list): + data_blobs.append(repr(k) + ": " + print_truncated_list(v)) + else: + data_blobs.append(repr(k) + ": " + repr(v)) + output += ", ".join(data_blobs) + output += "}" - if self.shape[1] == 0: - return f"Empty BiocFrame with {self.shape[0]} row{'s' if self.shape[0] != 1 else ''} & no columns." + output += ", number_of_rows=" + str(self.shape[0]) + if self._row_names: + output += ", row_names=" + print_truncated_list(self._row_names) - return ( - f"BiocFrame with {self.dims[0]} row{'s' if self.shape[0] != 1 else ''}" - f" & {self.dims[1]} column{'s' if self.dims[1] != 1 else ''}" - ) + output += ", column_names=" + print_truncated_list(self._column_names) - def __str__(self) -> str: - if self.row_names is None: - if self.shape[0] == 0: - return f"Empty BiocFrame with no rows & {self.shape[1]} column{'s' if self.shape[1] != 1 else ''}." + if self._mcols is not None and self._mcols.shape[1] > 0: + # TODO: fix potential recursion here. + output += ", mcols=" + repr(self._mcols) - if self.shape[1] == 0: - return f"Empty BiocFrame with {self.shape[0]} row{'s' if self.shape[0] != 1 else ''} & no columns." + if len(self._metadata): + meta_blobs = [] + for k, v in self._metadata.items(): + if isinstance(v, list): + meta_blobs.append(repr(k) + ": " + print_truncated_list(v)) + else: + meta_blobs.append(repr(k) + ": " + repr(v)) + output += ", metadata={" + ", ".join(data_blobs) + "}" - from io import StringIO + output += ")" + return output - from rich.console import Console - from rich.table import Table + def __str__(self) -> str: + output = f"BiocFrame with {self.dims[0]} row{'s' if self.shape[0] != 1 else ''}" + output += f" and {self.dims[1]} column{'s' if self.dims[1] != 1 else ''}\n" + + nr = self.shape[0] + added_table = False + if nr and self.shape[1]: + if nr <= 10: + indices = range(nr) + insert_ellipsis = False + else: + indices = [0, 1, 2, nr - 3, nr - 2, nr - 1] + insert_ellipsis = True - table = Table( - title=( - f"BiocFrame with {self.dims[0]} row{'s' if self.shape[0] != 1 else ''}" - f" & {self.dims[1]} column{'s' if self.dims[1] != 1 else ''}" - ), - show_header=True, - ) - if self.row_names is not None: - table.add_column("row_names") + if self._row_names is not None: + raw_floating = _slice_or_index(self._row_names, indices) + else: + raw_floating = ["[" + str(i) + "]" for i in indices] + if insert_ellipsis: + raw_floating = raw_floating[:3] + [""] + raw_floating[3:] + floating = ["", ""] + raw_floating + + columns = [] + for col in self._column_names: + data = self._data[col] + showed = show_as_cell(data, indices) + header = [col, "<" + type(data).__name__ + ">"] + minwidth = max(40, len(header[0]), len(header[1])) + for i, y in enumerate(showed): + if len(y) > minwidth: + showed[i] = y[: minwidth - 3] + "..." + if insert_ellipsis: + showed = showed[:3] + ["..."] + showed[3:] + columns.append(header + showed) + + output += format_table(columns, floating_names=floating) + added_table = True + + footer = [] + if self.mcols is not None and self.mcols.shape[1]: + footer.append( + "mcols (" + + str(self.mcols.shape[1]) + + "): " + + print_truncated_list( + self.mcols.column_names, sep=" ", include_brackets=False + ) + ) + if len(self.metadata): + footer.append( + "metadata (" + + str(len(self.metadata)) + + "): " + + print_truncated_list( + list(self.metadata.keys()), sep=" ", include_brackets=False + ) + ) + if len(footer): + if added_table: + output += "\n------\n" + output += "\n".join(footer) - for col in self.column_names: - table.add_column(f"{str(col)} [italic]<{type(self.column(col)).__name__}>") - - _rows = [] - rows_to_show = 2 - _top = self.shape[0] - if _top > rows_to_show: - _top = rows_to_show - - # top two rows - for r in range(_top): - _row = self.row(r) - vals = list(_row.values()) - res = [str(v) for v in vals] - if self.row_names: - res = [str(self.row_names[r])] + res - _rows.append(res) - - if self.shape[0] > 2 * rows_to_show: - # add ... - _dots = [] - if self.row_names: - _dots = ["..."] - - _dots.extend(["..." for _ in range(len(self.column_names))]) - _rows.append(_dots) - - _last = self.shape[0] - _top - if _last < rows_to_show: - _last += 1 - - # last set of rows - for r in range(_last, len(self)): - _row = self.row(r) - vals = list(_row.values()) - res = [str(v) for v in vals] - if self.row_names: - res = [str(self.row_names[r])] + res - _rows.append(res) - - for _row in _rows: - table.add_row(*_row) - - console = Console(file=StringIO()) - with console.capture() as capture: - console.print(table) - - return capture.get() + return output @property def shape(self) -> Tuple[int, int]: @@ -1102,3 +1115,20 @@ def _colnames_bframe(x: BiocFrame): @set_colnames.register(BiocFrame) def _set_colnames_bframe(x: BiocFrame, names: List[str]): x.column_names = names + + +@show_as_cell.register(BiocFrame) +def _show_as_cell_BiocFrame(x: BiocFrame, indices: Sequence[int]) -> List[str]: + constructs = [] + for i in indices: + constructs.append([]) + + for k in x._column_names: + col = show_as_cell(x._data[k], indices) + for i, v in enumerate(col): + constructs[i].append(v) + + for i, x in enumerate(constructs): + constructs[i] = ":".join(x) + + return constructs