Safe-DS · daniaHu · May 26, 2023 · Jun 2, 2023 · Jun 2, 2023 · Jun 2, 2023
@@ -75,7 +75,7 @@ def _from_pandas_series(data: pd.Series, type_: ColumnType | None = None) -> Col
         result._name = data.name
         result._data = data
         # noinspection PyProtectedMember
-        result._type = type_ if type_ is not None else ColumnType._from_numpy_data_type(data.dtype)
+        result._type = type_ if type_ is not None else ColumnType._data_type(data)
 
         return result
 
@@ -105,7 +105,7 @@ def __init__(self, name: str, data: Sequence[T] | None = None) -> None:
         self._name: str = name
         self._data: pd.Series = data.rename(name) if isinstance(data, pd.Series) else pd.Series(data, name=name)
+        # Infer the type from the Series built above rather than from the raw `data` argument: `data` may be
+        # `None` (the default), which cannot be iterated.
         # noinspection PyProtectedMember
-        self._type: ColumnType = ColumnType._from_numpy_data_type(self._data.dtype)
+        self._type: ColumnType = ColumnType._data_type(self._data)
 
     def __contains__(self, item: Any) -> bool:
         return item in self._data
@@ -688,3 +688,4 @@ def _count_missing_values(self) -> int:
             The number of null values.
         """
         return self._data.isna().sum()
+
@@ -1,6 +1,6 @@
 """Types used to define the schema of a tabular dataset."""
 
-from ._column_type import Anything, Boolean, ColumnType, Integer, RealNumber, String
+from ._column_type import Anything, Boolean, ColumnType, Integer, RealNumber, String, Nothing
 from ._imputer_strategy import ImputerStrategy
 from ._schema import Schema
 
@@ -13,4 +13,5 @@
     "RealNumber",
     "Schema",
     "String",
+    "Nothing",
 ]
@@ -2,24 +2,25 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import TYPE_CHECKING
+from types import NoneType
+from typing import Any
 
-if TYPE_CHECKING:
-    import numpy as np
+import numpy as np
+import pandas as pd
 
 
 class ColumnType(ABC):
     """Abstract base class for column types."""
 
     @staticmethod
-    def _from_numpy_data_type(data_type: np.dtype) -> ColumnType:
+    def _data_type(data: pd.Series) -> ColumnType:
         """
-        Return the column type for a given `numpy` data type.
+        Return the column type inferred from the cells of the given data.
 
         Parameters
         ----------
-        data_type : numpy.dtype
-            The `numpy` data type.
+        data : pd.Series
+            The data to be checked.
 
         Returns
         -------
@@ -31,17 +32,40 @@ def _from_numpy_data_type(data_type: np.dtype) -> ColumnType:
         NotImplementedError
             If the given data type is not supported.
         """
-        if data_type.kind in ("u", "i"):
-            return Integer()
-        if data_type.kind == "b":
-            return Boolean()
-        if data_type.kind == "f":
-            return RealNumber()
-        if data_type.kind in ("S", "U", "O", "M", "m"):
-            return String()
-
-        message = f"Unsupported numpy data type '{data_type}'."
-        raise NotImplementedError(message)
+
+        def column_type_of_type(cell_type: Any) -> ColumnType:
+            # Map the type of a single cell to a column type. `is_nullable` is read from the enclosing
+            # scope at call time, so types created after a missing value was seen are nullable.
+            if cell_type in (int, np.int64, np.int32):
+                return Integer(is_nullable)
+            if cell_type is bool:
+                return Boolean(is_nullable)
+            if cell_type in (float, np.float64, np.float32):
+                return RealNumber(is_nullable)
+            if cell_type is str:
+                return String(is_nullable)
+            if cell_type is NoneType:
+                return Nothing()
+
+            message = f"Unsupported numpy data type '{cell_type}'."
+            raise NotImplementedError(message)
+
+        numeric_types = (Integer, RealNumber)
+
+        # Start from `Nothing` (empty or all-`None` data) and refine the result cell by cell.
+        result = Nothing()
+        is_nullable = False
+        for cell in data:
+            cell_column_type = column_type_of_type(type(cell))
+
+            if isinstance(cell_column_type, Nothing):
+                # A missing value never changes the inferred kind; it only makes the result nullable.
+                is_nullable = True
+                result._is_nullable = True
+            elif isinstance(result, Nothing):
+                # First non-missing cell: adopt its type (already nullable if a `None` preceded it).
+                result = cell_column_type
+            elif result != cell_column_type:
+                if isinstance(result, numeric_types) and isinstance(cell_column_type, numeric_types):
+                    # Integers and real numbers mixed in either order widen to RealNumber.
+                    result = RealNumber(is_nullable)
+                else:
+                    # Any other combination has no common type more specific than Anything.
+                    result = Anything(is_nullable)
+
+        return result
 
     @abstractmethod
     def is_nullable(self) -> bool:
@@ -289,3 +313,41 @@ def is_numeric(self) -> bool:
             True if the column is numeric.
         """
         return False
+
+
+@dataclass
+class Nothing(ColumnType):
+    """Type for a column that contains only missing values (`None`)."""
+
+    _is_nullable: bool
+
+    def __init__(self) -> None:
+        # There is deliberately no `is_nullable` parameter: the flag is always initialised to True.
+        self._is_nullable = True
+
+    def __repr__(self) -> str:
+        # A trailing "?" marks the type as nullable.
+        return "Nothing?" if self._is_nullable else "Nothing"
+
+    def is_nullable(self) -> bool:
+        """
+        Return whether the given column type is nullable.
+
+        Returns
+        -------
+        is_nullable : bool
+            True if the column is nullable.
+        """
+        # Read the same flag that `__repr__` uses so that both always agree.
+        return self._is_nullable
+
+    def is_numeric(self) -> bool:
+        """
+        Return whether the given column type is numeric.
+
+        Returns
+        -------
+        is_numeric : bool
+            True if the column is numeric. Always False for this type.
+        """
+        return False
@@ -49,7 +49,9 @@ def _from_pandas_dataframe(dataframe: pd.DataFrame) -> Schema:
         """
         names = dataframe.columns
         # noinspection PyProtectedMember
-        types = (ColumnType._from_numpy_data_type(data_type) for data_type in dataframe.dtypes)
+        types = []
+        for col in dataframe:
+            types.append(ColumnType._data_type(dataframe[col]))
 
         return Schema(dict(zip(names, types, strict=True)))
 

@@ -1,7 +1,7 @@
 import pandas as pd
 import pytest
 from safeds.data.tabular.containers import Column
-from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String
+from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String, Nothing, Anything
 
 
 @pytest.mark.parametrize(
@@ -35,12 +35,12 @@ def test_should_use_type_if_passed(series: pd.Series, type_: ColumnType) -> None
 @pytest.mark.parametrize(
     ("series", "expected"),
     [
-        (pd.Series([]), String()),
+        (pd.Series([]), Nothing()),
         (pd.Series([True, False, True]), Boolean()),
         (pd.Series([1, 2, 3]), Integer()),
         (pd.Series([1.0, 2.0, 3.0]), RealNumber()),
         (pd.Series(["a", "b", "c"]), String()),
-        (pd.Series([1, 2.0, "a", True]), String()),
+        (pd.Series([1, 2.0, "a", True]), Anything(is_nullable=False)),
     ],
     ids=["empty", "boolean", "integer", "real number", "string", "mixed"],
 )

@@ -3,7 +3,7 @@
 import pandas as pd
 import pytest
 from safeds.data.tabular.containers import Column
-from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String
+from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, String, Nothing, Anything
 
 
 def test_should_store_the_name() -> None:
@@ -43,12 +43,12 @@ def test_should_store_the_data(column: Column, expected: list) -> None:
 @pytest.mark.parametrize(
     ("column", "expected"),
     [
-        (Column("A", []), String()),
+        (Column("A", []), Nothing()),
         (Column("A", [True, False, True]), Boolean()),
         (Column("A", [1, 2, 3]), Integer()),
         (Column("A", [1.0, 2.0, 3.0]), RealNumber()),
         (Column("A", ["a", "b", "c"]), String()),
-        (Column("A", [1, 2.0, "a", True]), String()),
+        (Column("A", [1, 2.0, "a", True]), Anything()),
     ],
     ids=["empty", "boolean", "integer", "real number", "string", "mixed"],
 )

@@ -1,7 +1,7 @@
 import pandas as pd
 import pytest
 from safeds.data.tabular.containers import Table
-from safeds.data.tabular.typing import Integer, Schema
+from safeds.data.tabular.typing import Integer, Schema, Nothing
 
 
 @pytest.mark.parametrize(
@@ -15,7 +15,7 @@
         ),
         (
             Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}),
-            Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Integer(), "col2": Integer()})),
+            Table._from_pandas_dataframe(pd.DataFrame(), Schema({"col1": Nothing(), "col2": Nothing()})),
             Table({"col1": [1, 2, 1], "col2": [1, 2, 4]}),
             0,
         ),

@@ -1,53 +1,39 @@
-import numpy as np
+from typing import Iterable
+
 import pytest
+
 from safeds.data.tabular.typing import (
     Anything,
     Boolean,
     ColumnType,
     Integer,
     RealNumber,
     String,
+    Nothing,
 )
 
 
-class TestFromNumpyDataType:
-    # Test cases taken from https://numpy.org/doc/stable/reference/arrays.scalars.html#scalars
+class TestDataType:
     @pytest.mark.parametrize(
-        ("data_type", "expected"),
+        ("data", "expected"),
         [
-            # Boolean
-            (np.dtype(np.bool_), Boolean()),
-            # Number
-            (np.dtype(np.half), RealNumber()),
-            (np.dtype(np.single), RealNumber()),
-            (np.dtype(np.float_), RealNumber()),
-            (np.dtype(np.longfloat), RealNumber()),
-            # Int
-            (np.dtype(np.byte), Integer()),
-            (np.dtype(np.short), Integer()),
-            (np.dtype(np.intc), Integer()),
-            (np.dtype(np.int_), Integer()),
-            (np.dtype(np.longlong), Integer()),
-            (np.dtype(np.ubyte), Integer()),
-            (np.dtype(np.ushort), Integer()),
-            (np.dtype(np.uintc), Integer()),
-            (np.dtype(np.uint), Integer()),
-            (np.dtype(np.ulonglong), Integer()),
-            # String
-            (np.dtype(np.str_), String()),
-            (np.dtype(np.unicode_), String()),
-            (np.dtype(np.object_), String()),
-            (np.dtype(np.datetime64), String()),
-            (np.dtype(np.timedelta64), String()),
+            ([1, 2, 3], Integer(is_nullable=False)),
+            ([1.0, 2.0, 3.0], RealNumber(is_nullable=False)),
+            ([True, False, True], Boolean(is_nullable=False)),
+            (["a", "b", "c"], String(is_nullable=False)),
+            (["a", 1, 2.0], Anything(is_nullable=False)),
+            ([None, None, None], Nothing()),
+            ([None, 1, 2], Integer(is_nullable=True)),
+            ([1.0, 2.0, None], RealNumber(is_nullable=True)),
+            ([True, False, None], Boolean(is_nullable=True)),
+            (["a", None, "b"], String(is_nullable=True)),
+
         ],
-        ids=repr,
+        ids=["Integer", "Real number", "Boolean", "String", "Mixed", "None", "Nullable integer",
+             "Nullable RealNumber", "Nullable Boolean", "Nullable String"],
     )
-    def test_should_create_column_type_from_numpy_data_type(self, data_type: np.dtype, expected: ColumnType) -> None:
-        assert ColumnType._from_numpy_data_type(data_type) == expected
-
-    def test_should_raise_if_data_type_is_not_supported(self) -> None:
-        with pytest.raises(NotImplementedError):
-            ColumnType._from_numpy_data_type(np.dtype(np.void))
+    def test_should_return_the_data_type(self, data: Iterable, expected: ColumnType) -> None:
+        assert ColumnType._data_type(data) == expected
 
 
 class TestRepr:

@@ -1,10 +1,12 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Iterable
 
 import pandas as pd
 import pytest
-from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String
+
+
+from safeds.data.tabular.typing import Boolean, ColumnType, Integer, RealNumber, Schema, String, Anything
 from safeds.exceptions import UnknownColumnNameError
 
 if TYPE_CHECKING:
@@ -13,7 +15,7 @@
 
 class TestFromPandasDataFrame:
     @pytest.mark.parametrize(
-        ("dataframe", "expected"),
+        ("columns", "expected"),
         [
             (
                 pd.DataFrame({"A": [True, False, True]}),
@@ -33,24 +35,49 @@ class TestFromPandasDataFrame:
             ),
             (
                 pd.DataFrame({"A": [1, 2.0, "a", True]}),
-                Schema({"A": String()}),
+                Schema({"A": Anything()}),
             ),
             (
                 pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}),
                 Schema({"A": Integer(), "B": String()}),
             ),
+            (
+                pd.DataFrame({"A": [True, False, None]}),
+                Schema({"A": Boolean(is_nullable=True)}),
+            ),
+            (
+                pd.DataFrame({"A": [1, None, 3]}),
+                Schema({"A": RealNumber()}),
+            ),
+            (
+                pd.DataFrame({"A": [1.0, None, 3.0]}),
+                Schema({"A": RealNumber()}),
+            ),
+            (
+                pd.DataFrame({"A": ["a", None, "c"]}),
+                Schema({"A": String(is_nullable=True)}),
+            ),
+            (
+                pd.DataFrame({"A": [1, 2.0, None, True]}),
+                Schema({"A": Anything(is_nullable=True)}),
+            ),
         ],
         ids=[
+            "boolean",
             "integer",
             "real number",
             "string",
-            "boolean",
             "mixed",
             "multiple columns",
+            "boolean?",
+            "integer?",
+            "real number?",
+            "string?",
+            "Anything?",
         ],
     )
-    def test_should_create_schema_from_pandas_dataframe(self, dataframe: pd.DataFrame, expected: Schema) -> None:
-        assert Schema._from_pandas_dataframe(dataframe) == expected
+    def test_should_create_schema_from_pandas_dataframe(self, columns: Iterable, expected: Schema) -> None:
+        assert Schema._from_pandas_dataframe(columns) == expected
 
 
 class TestRepr: