Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: detect the true columntypes when initializing column #403

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def _from_pandas_series(data: pd.Series, type_: ColumnType | None = None) -> Col
result._name = data.name
result._data = data
# noinspection PyProtectedMember
result._type = type_ if type_ is not None else ColumnType._from_numpy_data_type(data.dtype)
result._type = type_ if type_ is not None else ColumnType._data_type(data)

return result

Expand Down Expand Up @@ -105,7 +105,7 @@ def __init__(self, name: str, data: Sequence[T] | None = None) -> None:
self._name: str = name
self._data: pd.Series = data.rename(name) if isinstance(data, pd.Series) else pd.Series(data, name=name)
# noinspection PyProtectedMember
self._type: ColumnType = ColumnType._from_numpy_data_type(self._data.dtype)
self._type: ColumnType = ColumnType._data_type(data)

def __contains__(self, item: Any) -> bool:
return item in self._data
Expand Down Expand Up @@ -688,3 +688,4 @@ def _count_missing_values(self) -> int:
The number of null values.
"""
return self._data.isna().sum()

3 changes: 2 additions & 1 deletion src/safeds/data/tabular/typing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Types used to define the schema of a tabular dataset."""

from ._column_type import Anything, Boolean, ColumnType, Integer, RealNumber, String
from ._column_type import Anything, Boolean, ColumnType, Integer, RealNumber, String, Nothing
from ._imputer_strategy import ImputerStrategy
from ._schema import Schema

Expand All @@ -13,4 +13,5 @@
"RealNumber",
"Schema",
"String",
"Nothing",
]
96 changes: 79 additions & 17 deletions src/safeds/data/tabular/typing/_column_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,25 @@

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING
from types import NoneType
from typing import Any

if TYPE_CHECKING:
import numpy as np
import numpy as np
import pandas as pd


class ColumnType(ABC):
"""Abstract base class for column types."""

@staticmethod
def _from_numpy_data_type(data_type: np.dtype) -> ColumnType:
def _data_type(data: pd.Series) -> ColumnType:
"""
Return the column type for a given `numpy` data type.

Parameters
----------
data_type : numpy.dtype
The `numpy` data type.
data : pd.Series
The data to be checked.

Returns
-------
Expand All @@ -31,17 +32,40 @@ def _from_numpy_data_type(data_type: np.dtype) -> ColumnType:
NotImplementedError
If the given data type is not supported.
"""
if data_type.kind in ("u", "i"):
return Integer()
if data_type.kind == "b":
return Boolean()
if data_type.kind == "f":
return RealNumber()
if data_type.kind in ("S", "U", "O", "M", "m"):
return String()

message = f"Unsupported numpy data type '{data_type}'."
raise NotImplementedError(message)

def column_type_of_type(cell_type: Any) -> ColumnType:
if cell_type == int or cell_type == np.int64 or cell_type == np.int32:
return Integer(is_nullable)
if cell_type == bool:
return Boolean(is_nullable)
if cell_type == float or cell_type == np.float64 or cell_type == np.float32:
return RealNumber(is_nullable)
if cell_type == str:
return String(is_nullable)
if cell_type is NoneType:
return Nothing()
else:
message = f"Unsupported numpy data type '{cell_type}'."
raise NotImplementedError(message)

# Infer the column type by folding over all cells. `Nothing` is the starting point and means that no
# non-missing cell has been seen yet.
result = Nothing()
is_nullable = False
for cell in data:
    if type(cell) is NoneType:
        # A missing cell never changes the inferred type, it only makes the column nullable. The flag is
        # also read by `column_type_of_type` when it builds the type of later cells.
        is_nullable = True
        result._is_nullable = True
        continue

    cell_column_type = column_type_of_type(type(cell))
    if isinstance(result, Nothing):
        # The first non-missing cell decides the initial type.
        result = cell_column_type
    elif result != cell_column_type:
        # Use `isinstance` here: comparing the instance with the class itself (`result == Integer`) is
        # always False, so integers followed by floats would never be promoted to `RealNumber`.
        if isinstance(result, Integer) and isinstance(cell_column_type, RealNumber):
            result = RealNumber(is_nullable)
        else:
            result = Anything(is_nullable)

return result

@abstractmethod
def is_nullable(self) -> bool:
Expand Down Expand Up @@ -289,3 +313,41 @@ def is_numeric(self) -> bool:
True if the column is numeric.
"""
return False


@dataclass
class Nothing(ColumnType):
    """Type for a column that contains only `None` values."""

    # Always initialised to True by `__init__`; read by `__repr__` and compared by the generated `__eq__`.
    _is_nullable: bool

    def __init__(self) -> None:
        self._is_nullable = True

    def __repr__(self) -> str:
        suffix = "?" if self._is_nullable else ""
        return "Nothing" + suffix

    def is_nullable(self) -> bool:
        """
        Return whether the given column type is nullable.

        Returns
        -------
        is_nullable : bool
            Always True.
        """
        return True

    def is_numeric(self) -> bool:
        """
        Return whether the given column type is numeric.

        Returns
        -------
        is_numeric : bool
            Always False.
        """
        return False
4 changes: 3 additions & 1 deletion src/safeds/data/tabular/typing/_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ def _from_pandas_dataframe(dataframe: pd.DataFrame) -> Schema:
"""
names = dataframe.columns
# noinspection PyProtectedMember
types = (ColumnType._from_numpy_data_type(data_type) for data_type in dataframe.dtypes)
types = []
for col in dataframe:
types.append(ColumnType._data_type(dataframe[col]))

return Schema(dict(zip(names, types, strict=True)))

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import pytest
from safeds.data.tabular.containers import Column
from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String
from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String, Nothing, Anything


@pytest.mark.parametrize(
Expand Down Expand Up @@ -35,12 +35,12 @@ def test_should_use_type_if_passed(series: pd.Series, type_: ColumnType) -> None
@pytest.mark.parametrize(
("series", "expected"),
[
(pd.Series([]), String()),
(pd.Series([]), Nothing()),
(pd.Series([True, False, True]), Boolean()),
(pd.Series([1, 2, 3]), Integer()),
(pd.Series([1.0, 2.0, 3.0]), RealNumber()),
(pd.Series(["a", "b", "c"]), String()),
(pd.Series([1, 2.0, "a", True]), String()),
(pd.Series([1, 2.0, "a", True]), Anything(is_nullable=False)),
],
ids=["empty", "boolean", "integer", "real number", "string", "mixed"],
)
Expand Down
6 changes: 3 additions & 3 deletions tests/safeds/data/tabular/containers/_column/test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pandas as pd
import pytest
from safeds.data.tabular.containers import Column
from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String
from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String, Nothing, Anything


def test_should_store_the_name() -> None:
Expand Down Expand Up @@ -43,12 +43,12 @@ def test_should_store_the_data(column: Column, expected: list) -> None:
@pytest.mark.parametrize(
("column", "expected"),
[
(Column("A", []), String()),
(Column("A", []), Nothing()),
(Column("A", [True, False, True]), Boolean()),
(Column("A", [1, 2, 3]), Integer()),
(Column("A", [1.0, 2.0, 3.0]), RealNumber()),
(Column("A", ["a", "b", "c"]), String()),
(Column("A", [1, 2.0, "a", True]), String()),
(Column("A", [1, 2.0, "a", True]), Anything()),
],
ids=["empty", "boolean", "integer", "real number", "string", "mixed"],
)
Expand Down
4 changes: 2 additions & 2 deletions tests/safeds/data/tabular/containers/_table/test_split.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import pytest
from safeds.data.tabular.containers import Table
from safeds.data.tabular.typing import Integer, Schema
from safeds.data.tabular.typing import Integer, Schema, Nothing


@pytest.mark.parametrize(
Expand All @@ -15,7 +15,7 @@
),
(
Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}),
Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Integer(), "col2": Integer()})),
Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Nothing(), "col2": Nothing()})),
Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}),
0,
),
Expand Down
56 changes: 21 additions & 35 deletions tests/safeds/data/tabular/typing/test_column_type.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,39 @@
import numpy as np
from typing import Iterable

import pytest

from safeds.data.tabular.typing import (
Anything,
Boolean,
ColumnType,
Integer,
RealNumber,
String,
Nothing,
)


class TestFromNumpyDataType:
# Test cases taken from https://numpy.org/doc/stable/reference/arrays.scalars.html#scalars
class TestDataType:
@pytest.mark.parametrize(
("data_type", "expected"),
("data", "expected"),
[
# Boolean
(np.dtype(np.bool_), Boolean()),
# Number
(np.dtype(np.half), RealNumber()),
(np.dtype(np.single), RealNumber()),
(np.dtype(np.float_), RealNumber()),
(np.dtype(np.longfloat), RealNumber()),
# Int
(np.dtype(np.byte), Integer()),
(np.dtype(np.short), Integer()),
(np.dtype(np.intc), Integer()),
(np.dtype(np.int_), Integer()),
(np.dtype(np.longlong), Integer()),
(np.dtype(np.ubyte), Integer()),
(np.dtype(np.ushort), Integer()),
(np.dtype(np.uintc), Integer()),
(np.dtype(np.uint), Integer()),
(np.dtype(np.ulonglong), Integer()),
# String
(np.dtype(np.str_), String()),
(np.dtype(np.unicode_), String()),
(np.dtype(np.object_), String()),
(np.dtype(np.datetime64), String()),
(np.dtype(np.timedelta64), String()),
([1, 2, 3], Integer(is_nullable=False)),
([1.0, 2.0, 3.0], RealNumber(is_nullable=False)),
([True, False, True], Boolean(is_nullable=False)),
(["a", "b", "c"], String(is_nullable=False)),
(["a", 1, 2.0], Anything(is_nullable=False)),
([None, None, None], Nothing()),
([None, 1, 2], Integer(is_nullable=True)),
([1.0, 2.0, None], RealNumber(is_nullable=True)),
([True, False, None], Boolean(is_nullable=True)),
(["a", None, "b"], String(is_nullable=True)),

],
ids=repr,
ids=["Integer", "Real number", "Boolean", "String", "Mixed", "None", "Nullable integer",
"Nullable RealNumber", "Nullable Boolean", "Nullable String"],
)
def test_should_create_column_type_from_numpy_data_type(self, data_type: np.dtype, expected: ColumnType) -> None:
assert ColumnType._from_numpy_data_type(data_type) == expected

def test_should_raise_if_data_type_is_not_supported(self) -> None:
with pytest.raises(NotImplementedError):
ColumnType._from_numpy_data_type(np.dtype(np.void))
def test_should_return_the_data_type(self, data: Iterable, expected: ColumnType) -> None:
assert ColumnType._data_type(data) == expected


class TestRepr:
Expand Down
41 changes: 34 additions & 7 deletions tests/safeds/data/tabular/typing/test_schema.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from __future__ import annotations

from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Iterable

import pandas as pd
import pytest
from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String


from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String, Anything
from safeds.exceptions import UnknownColumnNameError

if TYPE_CHECKING:
Expand All @@ -13,7 +15,7 @@

class TestFromPandasDataFrame:
@pytest.mark.parametrize(
("dataframe", "expected"),
("columns", "expected"),
[
(
pd.DataFrame({"A": [True, False, True]}),
Expand All @@ -33,24 +35,49 @@ class TestFromPandasDataFrame:
),
(
pd.DataFrame({"A": [1, 2.0, "a", True]}),
Schema({"A": String()}),
Schema({"A": Anything()}),
),
(
pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}),
Schema({"A": Integer(), "B": String()}),
),
(
pd.DataFrame({"A": [True, False, None]}),
Schema({"A": Boolean(is_nullable=True)}),
),
(
pd.DataFrame({"A": [1, None, 3]}),
Schema({"A": RealNumber()}),
),
(
pd.DataFrame({"A": [1.0, None, 3.0]}),
Schema({"A": RealNumber()}),
),
(
pd.DataFrame({"A": ["a", None, "c"]}),
Schema({"A": String(is_nullable=True)}),
),
(
pd.DataFrame({"A": [1, 2.0, None, True]}),
Schema({"A": Anything(is_nullable=True)}),
),
],
ids=[
"boolean",
"integer",
"real number",
"string",
"boolean",
"mixed",
"multiple columns",
"boolean?",
"integer?",
"real number?",
"string?",
"Anything?",
],
)
def test_should_create_schema_from_pandas_dataframe(self, dataframe: pd.DataFrame, expected: Schema) -> None:
assert Schema._from_pandas_dataframe(dataframe) == expected
def test_should_create_schema_from_pandas_dataframe(self, columns: Iterable, expected: Schema) -> None:
assert Schema._from_pandas_dataframe(columns) == expected


class TestRepr:
Expand Down