diff --git a/.coveragerc b/.coveragerc index 68ecdd763..f4afc3ce5 100644 --- a/.coveragerc +++ b/.coveragerc @@ -2,3 +2,4 @@ exclude_lines = pragma: no cover if\s+(typing\.)?TYPE_CHECKING: + \.\.\. diff --git a/.github/linters/.ruff.toml b/.github/linters/.ruff.toml index 10aeeccbd..03b79e181 100644 --- a/.github/linters/.ruff.toml +++ b/.github/linters/.ruff.toml @@ -61,6 +61,8 @@ ignore = [ "FBT002", # builtin-attribute-shadowing (not an issue) "A003", + # implicit-return (can add a return even though all cases are covered) + "RET503", # superfluous-else-return (sometimes it's more readable) "RET505", # superfluous-else-raise (sometimes it's more readable) diff --git a/src/safeds/data/tabular/containers/_column.py b/src/safeds/data/tabular/containers/_column.py index 963841049..0534546aa 100644 --- a/src/safeds/data/tabular/containers/_column.py +++ b/src/safeds/data/tabular/containers/_column.py @@ -1,8 +1,9 @@ from __future__ import annotations import io +from collections.abc import Sequence from numbers import Number -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, TypeVar, overload import matplotlib.pyplot as plt import numpy as np @@ -20,10 +21,12 @@ from safeds.data.tabular.typing import ColumnType if TYPE_CHECKING: - from collections.abc import Callable, Iterable, Iterator + from collections.abc import Callable, Iterator +_T = TypeVar("_T") -class Column: + +class Column(Sequence[_T]): """ A column is a named collection of values. @@ -31,21 +34,77 @@ class Column: ---------- name : str The name of the column. - data : Iterable + data : Sequence[_T] The data. - type_ : Optional[ColumnType] - The type of the column. If not specified, the type will be inferred from the data. + + Examples + -------- + >>> from safeds.data.tabular.containers import Column + >>> column = Column("test", [1, 2, 3]) """ + # ------------------------------------------------------------------------------------------------------------------ + # Creation + # ------------------------------------------------------------------------------------------------------------------ + + @staticmethod + def _from_pandas_series(data: pd.Series, type_: ColumnType | None = None) -> Column: + """ + Create a column from a `pandas.Series`. + + Parameters + ---------- + data : pd.Series + The data. + type_ : ColumnType | None + The type. If None, the type is inferred from the data. + + Returns + ------- + column : Column + The created column. + + Examples + -------- + >>> import pandas as pd + >>> from safeds.data.tabular.containers import Column + >>> column = Column._from_pandas_series(pd.Series([1, 2, 3], name="test")) + """ + result = object.__new__(Column) + result._name = data.name + result._data = data + # noinspection PyProtectedMember + result._type = type_ if type_ is not None else ColumnType._from_numpy_data_type(data.dtype) + + return result + # ------------------------------------------------------------------------------------------------------------------ # Dunder methods # ------------------------------------------------------------------------------------------------------------------ - def __init__(self, name: str, data: Iterable, type_: ColumnType | None = None) -> None: + def __init__(self, name: str, data: Sequence[_T]) -> None: + """ + Create a column. + + Parameters + ---------- + name : str + The name of the column. + data : Sequence[_T] + The data. + + Examples + -------- + >>> from safeds.data.tabular.containers import Column + >>> column = Column("test", [1, 2, 3]) + """ self._name: str = name self._data: pd.Series = data if isinstance(data, pd.Series) else pd.Series(data) # noinspection PyProtectedMember - self._type: ColumnType = type_ if type_ is not None else ColumnType._from_numpy_data_type(self._data.dtype) + self._type: ColumnType = ColumnType._from_numpy_data_type(self._data.dtype) + + def __contains__(self, item: Any) -> bool: + return item in self._data def __eq__(self, other: object) -> bool: if not isinstance(other, Column): @@ -54,10 +113,29 @@ def __eq__(self, other: object) -> bool: return True return self.name == other.name and self._data.equals(other._data) - def __getitem__(self, index: int) -> Any: - return self.get_value(index) - - def __iter__(self) -> Iterator[Any]: + @overload + def __getitem__(self, index: int) -> _T: + ... + + @overload + def __getitem__(self, index: slice) -> Column[_T]: + ... + + def __getitem__(self, index: int | slice) -> _T | Column[_T]: + if isinstance(index, int): + if index < 0 or index >= self._data.size: + raise IndexOutOfBoundsError(index) + return self._data[index] + + if isinstance(index, slice): + if index.start < 0 or index.start > self._data.size: + raise IndexOutOfBoundsError(index) + if index.stop < 0 or index.stop > self._data.size: + raise IndexOutOfBoundsError(index) + data = self._data[index].reset_index(drop=True).rename(self.name) + return Column._from_pandas_series(data, self._type) + + def __iter__(self) -> Iterator[_T]: return iter(self._data) def __len__(self) -> int: @@ -117,18 +195,18 @@ def type(self) -> ColumnType: # Getters # ------------------------------------------------------------------------------------------------------------------ - def get_unique_values(self) -> list[Any]: + def get_unique_values(self) -> list[_T]: """ Return a list of all unique values in the column. Returns ------- - unique_values : list[any] + unique_values : list[_T] List of unique values in the column. """ return list(self._data.unique()) - def get_value(self, index: int) -> Any: + def get_value(self, index: int) -> _T: """ Return column value at specified index, starting at 0. @@ -156,13 +234,13 @@ def get_value(self, index: int) -> Any: # Information # ------------------------------------------------------------------------------------------------------------------ - def all(self, predicate: Callable[[Any], bool]) -> bool: + def all(self, predicate: Callable[[_T], bool]) -> bool: """ Check if all values have a given property. Parameters ---------- - predicate : Callable[[Any], bool]) + predicate : Callable[[_T], bool]) Callable that is used to find matches. Returns @@ -173,13 +251,13 @@ def all(self, predicate: Callable[[Any], bool]) -> bool: """ return all(predicate(value) for value in self._data) - def any(self, predicate: Callable[[Any], bool]) -> bool: + def any(self, predicate: Callable[[_T], bool]) -> bool: """ Check if any value has a given property. Parameters ---------- - predicate : Callable[[Any], bool]) + predicate : Callable[[_T], bool]) Callable that is used to find matches. Returns @@ -190,13 +268,13 @@ def any(self, predicate: Callable[[Any], bool]) -> bool: """ return any(predicate(value) for value in self._data) - def none(self, predicate: Callable[[Any], bool]) -> bool: + def none(self, predicate: Callable[[_T], bool]) -> bool: """ Check if no values has a given property. Parameters ---------- - predicate : Callable[[Any], bool]) + predicate : Callable[[_T], bool]) Callable that is used to find matches. Returns @@ -236,7 +314,7 @@ def rename(self, new_name: str) -> Column: column : Column A new column with the new name. """ - return Column(new_name, self._data, self._type) + return Column._from_pandas_series(self._data.rename(new_name), self._type) # ------------------------------------------------------------------------------------------------------------------ # Statistics @@ -375,7 +453,7 @@ def missing_value_ratio(self) -> float: raise ColumnSizeError("> 0", "0") return self._count_missing_values() / self._data.size - def mode(self) -> Any: + def mode(self) -> list[_T]: """ Return the mode of the column. diff --git a/src/safeds/data/tabular/containers/_table.py b/src/safeds/data/tabular/containers/_table.py index 1f4cab0ee..aa2681eaf 100644 --- a/src/safeds/data/tabular/containers/_table.py +++ b/src/safeds/data/tabular/containers/_table.py @@ -338,15 +338,13 @@ def get_column(self, column_name: str) -> Column: UnknownColumnNameError If the specified target column name does not exist. """ - if self._schema.has_column(column_name): - output_column = Column( - column_name, - self._data.iloc[:, [self._schema._get_column_index(column_name)]].squeeze(), - self._schema.get_column_type(column_name), - ) - return output_column + if not self.has_column(column_name): + raise UnknownColumnNameError([column_name]) - raise UnknownColumnNameError([column_name]) + return Column._from_pandas_series( + self._data[column_name], + self.get_column_type(column_name), + ) def has_column(self, column_name: str) -> bool: """ @@ -980,7 +978,7 @@ def transform_column(self, name: str, transformer: Callable[[Row], Any]) -> Tabl """ if self.has_column(name): items: list = [transformer(item) for item in self.to_rows()] - result: Column = Column(name, pd.Series(items)) + result: Column = Column(name, items) return self.replace_column(name, result) raise UnknownColumnNameError([name]) diff --git a/src/safeds/data/tabular/exceptions/_exceptions.py b/src/safeds/data/tabular/exceptions/_exceptions.py index 2a6a5cf8b..f8189ac19 100644 --- a/src/safeds/data/tabular/exceptions/_exceptions.py +++ b/src/safeds/data/tabular/exceptions/_exceptions.py @@ -42,11 +42,11 @@ class IndexOutOfBoundsError(IndexError): Parameters ---------- - index : int + index : int | slice The wrongly used index. """ - def __init__(self, index: int): + def __init__(self, index: int | slice): super().__init__(f"There is no element at index '{index}'.") diff --git a/src/safeds/data/tabular/typing/_schema.py b/src/safeds/data/tabular/typing/_schema.py index 8ab03dacd..c9bd1996c 100644 --- a/src/safeds/data/tabular/typing/_schema.py +++ b/src/safeds/data/tabular/typing/_schema.py @@ -239,31 +239,3 @@ def _repr_markdown_(self) -> str: lines = (f"| {name} | {type_} |" for name, type_ in self._schema.items()) joined = "\n".join(lines) return f"| Column Name | Column Type |\n| --- | --- |\n{joined}" - - # ------------------------------------------------------------------------------------------------------------------ - # Other - # ------------------------------------------------------------------------------------------------------------------ - - def _get_column_index(self, column_name: str) -> int: - """ - Return the index of the column with specified column name. - - Parameters - ---------- - column_name : str - The name of the column. - - Returns - ------- - index : int - The index of the column. - - Raises - ------ - ColumnNameError - If the specified column name does not exist. - """ - if not self.has_column(column_name): - raise UnknownColumnNameError([column_name]) - - return list(self._schema.keys()).index(column_name) diff --git a/tests/safeds/data/tabular/containers/_column/test_getitem.py b/tests/safeds/data/tabular/containers/_column/test_getitem.py deleted file mode 100644 index 2ae9d3a2b..000000000 --- a/tests/safeds/data/tabular/containers/_column/test_getitem.py +++ /dev/null @@ -1,19 +0,0 @@ -import pytest -from safeds.data.tabular.containers import Column -from safeds.data.tabular.exceptions import IndexOutOfBoundsError - - -def test_getitem_valid() -> None: - column = Column("testColumn", [0, "1"]) - assert column[0] == 0 - assert column[1] == "1" - - -# noinspection PyStatementEffect -def test_getitem_invalid() -> None: - column = Column("testColumn", [0, "1"]) - with pytest.raises(IndexOutOfBoundsError): - column[-1] - - with pytest.raises(IndexOutOfBoundsError): - column[2] diff --git a/tests/safeds/data/tabular/containers/test_column.py b/tests/safeds/data/tabular/containers/test_column.py index 8e97c213d..43dd5ed83 100644 --- a/tests/safeds/data/tabular/containers/test_column.py +++ b/tests/safeds/data/tabular/containers/test_column.py @@ -1,6 +1,116 @@ +from typing import Any + +import pandas as pd import pytest import regex as re from safeds.data.tabular.containers import Column +from safeds.data.tabular.exceptions import IndexOutOfBoundsError +from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String + + +class TestFromPandasSeries: + @pytest.mark.parametrize( + ("series", "expected"), + [ + (pd.Series([]), []), + (pd.Series([True, False, True]), [True, False, True]), + (pd.Series([1, 2, 3]), [1, 2, 3]), + (pd.Series([1.0, 2.0, 3.0]), [1.0, 2.0, 3.0]), + (pd.Series(["a", "b", "c"]), ["a", "b", "c"]), + (pd.Series([1, 2.0, "a", True]), [1, 2.0, "a", True]), + ], + ids=["empty", "boolean", "integer", "real number", "string", "mixed"], + ) + def test_should_store_the_data(self, series: pd.Series, expected: Column) -> None: + assert list(Column._from_pandas_series(series)) == expected + + @pytest.mark.parametrize( + ("series", "type_"), + [ + (pd.Series([True, False, True]), Boolean()), + (pd.Series([1, 2, 3]), Boolean()), + ], + ids=["type is correct", "type is wrong"], + ) + def test_should_use_type_if_passed(self, series: pd.Series, type_: ColumnType) -> None: + assert Column._from_pandas_series(series, type_).type == type_ + + @pytest.mark.parametrize( + ("series", "expected"), + [ + (pd.Series([]), String()), + (pd.Series([True, False, True]), Boolean()), + (pd.Series([1, 2, 3]), Integer()), + (pd.Series([1.0, 2.0, 3.0]), RealNumber()), + (pd.Series(["a", "b", "c"]), String()), + (pd.Series([1, 2.0, "a", True]), String()), + ], + ids=["empty", "boolean", "integer", "real number", "string", "mixed"], + ) + def test_should_infer_type_if_not_passed(self, series: pd.Series, expected: ColumnType) -> None: + assert Column._from_pandas_series(series).type == expected + + +class TestGetItem: + @pytest.mark.parametrize( + ("column", "index", "expected"), + [ + (Column("a", [0, 1]), 0, 0), + (Column("a", [0, 1]), 1, 1), + ], + ids=["first item", "second item"], + ) + def test_should_get_the_item_at_index(self, column: Column, index: int, expected: Any) -> None: + assert column[index] == expected + + @pytest.mark.parametrize( + ("column", "index", "expected"), + [ + (Column("a", [0, 1, 2]), slice(0, 1), Column("a", [0])), + (Column("a", [0, 1, 2]), slice(2, 3), Column("a", [2])), + (Column("a", [0, 1, 2]), slice(0, 3), Column("a", [0, 1, 2])), + (Column("a", [0, 1, 2]), slice(0, 3, 2), Column("a", [0, 2])), + ], + ids=["first item", "last item", "all items", "every other item"], + ) + def test_should_get_the_items_at_slice(self, column: Column, index: slice, expected: Column) -> None: + assert column[index] == expected + + @pytest.mark.parametrize( + "index", + [-1, 2, slice(-1, 2), slice(0, 4), slice(-1, 4)], + ids=[ + "negative", + "out of bounds", + "slice with negative start", + "slice with out of bounds end", + "slice with negative start and out of bounds end", + ], + ) + def test_should_raise_if_index_is_out_of_bounds(self, index: int | slice) -> None: + column = Column("a", [0, "1"]) + + with pytest.raises(IndexOutOfBoundsError): + # noinspection PyStatementEffect + column[index] + + +class TestContains: + @pytest.mark.parametrize( + ("column", "value", "expected"), + [ + (Column("a", []), 1, False), + (Column("a", [1, 2, 3]), 1, True), + (Column("a", [1, 2, 3]), 4, False), + ], + ids=[ + "empty", + "value exists", + "value does not exist", + ], + ) + def test_should_check_whether_the_value_exists(self, column: Column, value: Any, expected: bool) -> None: + assert (value in column) == expected class TestToHtml: diff --git a/tests/safeds/data/tabular/typing/test_schema.py b/tests/safeds/data/tabular/typing/test_schema.py index ea262131c..dda39f66a 100644 --- a/tests/safeds/data/tabular/typing/test_schema.py +++ b/tests/safeds/data/tabular/typing/test_schema.py @@ -31,6 +31,10 @@ class TestFromPandasDataFrame: pd.DataFrame({"A": ["a", "b", "c"]}), Schema({"A": String()}), ), + ( + pd.DataFrame({"A": [1, 2.0, "a", True]}), + Schema({"A": String()}), + ), ( pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}), Schema({"A": Integer(), "B": String()}), @@ -41,6 +45,7 @@ class TestFromPandasDataFrame: "real number", "string", "boolean", + "mixed", "multiple columns", ], ) @@ -212,27 +217,6 @@ def test_should_return_column_names(self, schema: Schema, expected: list[str]) - assert schema.column_names == expected -class TestGetColumnIndex: - @pytest.mark.parametrize( - ("schema", "column_name", "expected"), - [ - (Schema({"A": Integer()}), "A", 0), - (Schema({"A": Integer(), "B": RealNumber()}), "B", 1), - ], - ids=[ - "single column", - "multiple columns", - ], - ) - def test_should_return_column_index(self, schema: Schema, column_name: str, expected: int) -> None: - assert schema._get_column_index(column_name) == expected - - def test_should_raise_if_column_does_not_exist(self) -> None: - schema = Schema({"A": Integer()}) - with pytest.raises(UnknownColumnNameError): - schema._get_column_index("B") - - class TestToDict: @pytest.mark.parametrize( ("schema", "expected"),