Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: __repr__ method to Schema, Row, Column, SupervisedDataset & Table; summary() to Column & Table #333

Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
751 changes: 525 additions & 226 deletions Runtime/safe-ds/poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Runtime/safe-ds/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pandas = "^1.5.2"
scikit-learn = "^1.2.0"
seaborn = "^0.12.2"
matplotlib = "^3.6.2"
ipython = "^8.8.0"

[tool.poetry.dev-dependencies]
pytest = "^7.2.0"
Expand Down
135 changes: 87 additions & 48 deletions Runtime/safe-ds/safe_ds/data/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import numpy as np
import pandas as pd
from IPython.core.display_functions import DisplayHandle, display
from safe_ds.exceptions import (
ColumnLengthMismatchError,
ColumnSizeError,
Expand Down Expand Up @@ -73,24 +74,6 @@ def get_value(self, index: int) -> Any:

return self._data[index]

def idness(self) -> float:
    """
    Compute the idness of this column: the count of unique values divided by the row count.

    Returns
    -------
    idness: float
        The ratio of unique values to rows

    Raises
    ------
    ColumnSizeError
        If the column contains no rows
    """
    row_count = self._data.size
    if row_count == 0:
        raise ColumnSizeError("> 0", "0")
    unique_count = self._data.nunique()
    return unique_count / row_count

@property
def statistics(self) -> ColumnStatistics:
    """
    Statistics accessor for this column.

    Returns
    -------
    statistics: ColumnStatistics
        A new ColumnStatistics object wrapping this column (created on every access)
    """
    return ColumnStatistics(self)
Expand Down Expand Up @@ -204,25 +187,6 @@ def has_missing_values(self) -> bool:
or (isinstance(value, Number) and np.isnan(value))
)

def stability(self) -> float:
    """
    Compute the stability of this column.

    Stability is the number of occurrences of the mode divided by the number of non-null values.

    Returns
    -------
    stability: float
        The stability of the column

    Raises
    ------
    ColumnSizeError
        If the column contains no rows
    """
    if self._data.size == 0:
        raise ColumnSizeError("> 0", "0")
    occurrences = self._data.value_counts()
    mode_value = self.statistics.mode()
    mode_count = occurrences[mode_value]
    return mode_count / self._data.count()

def correlation_with(self, other_column: Column) -> float:
"""
Calculates Pearson correlation between this and another column, if both are numerical
Expand Down Expand Up @@ -270,6 +234,33 @@ def __eq__(self, other: object) -> bool:
def __hash__(self) -> int:
    """
    Compute a hash from the values stored in this column.

    The underlying data is used as a pandas Series elsewhere in this class, and
    pandas Series objects are unhashable, so calling ``hash()`` on the data
    directly raises ``TypeError``. The per-element hashes provided by pandas are
    combined instead.

    Returns
    -------
    hash: int
        Hash value derived from the column's values, in order
    """
    # index=False: only the values contribute to the hash, not the row labels.
    element_hashes = pd.util.hash_pandas_object(self._data, index=False)
    return hash(tuple(element_hashes.tolist()))

def __str__(self) -> str:
    """Return the string form of a single-column frame labelled with this column's name."""
    frame = self._data.to_frame().set_axis([self.name], axis="columns")
    return str(frame)

def __repr__(self) -> str:
    """Return the repr of a single-column frame labelled with this column's name."""
    frame = self._data.to_frame().set_axis([self.name], axis="columns")
    return repr(frame)

def _ipython_display_(self) -> DisplayHandle:
    """
    Display this Column as a single-column frame, for use in Jupyter Notebooks.

    Returns
    -------
    output: DisplayHandle
        The value returned by IPython's ``display`` for the rendered frame
    """
    frame = self._data.to_frame()
    frame.columns = [self.name]
    row_count, column_count = frame.shape

    # Raise pandas' display limits to the frame's own size so nothing is truncated.
    with pd.option_context(
        "display.max_rows", row_count, "display.max_columns", column_count
    ):
        handle = display(frame)
    return handle


class ColumnStatistics:
def __init__(self, column: Column):
Expand All @@ -286,11 +277,13 @@ def max(self) -> float:

Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.max()

def min(self) -> float:
Expand All @@ -304,11 +297,13 @@ def min(self) -> float:

Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.min()

def mean(self) -> float:
Expand All @@ -322,11 +317,13 @@ def mean(self) -> float:

Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.mean()

def mode(self) -> Any:
Expand All @@ -351,11 +348,13 @@ def median(self) -> float:

Raises
------
TypeError
NonNumericColumnError
If the data contains non-numerical data.
"""
if not self.column._type.is_numeric():
raise TypeError("The column contains non numerical data.")
raise NonNumericColumnError(
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.median()

def sum(self) -> float:
Expand All @@ -369,7 +368,7 @@ def sum(self) -> float:

Raises
---
NonNumericalColumnError
NonNumericColumnError
If the data is non numerical

"""
Expand All @@ -391,7 +390,7 @@ def variance(self) -> float:

Raises
---
NonNumericalColumnError
NonNumericColumnError
If the data is non numerical

"""
Expand All @@ -414,7 +413,7 @@ def standard_deviation(self) -> float:

Raises
---
NonNumericalColumnError
NonNumericColumnError
If the data is non numerical

"""
Expand All @@ -423,3 +422,43 @@ def standard_deviation(self) -> float:
f"{self.column.name} is of type {self.column._type}."
)
return self.column._data.std()

def stability(self) -> float:
    """
    Compute the stability of the wrapped column.

    Stability is the number of occurrences of the mode divided by the number of non-null values.

    Returns
    -------
    stability: float
        The stability of the column

    Raises
    ------
    ColumnSizeError
        If the column contains no rows
    """
    data = self.column._data
    if data.size == 0:
        raise ColumnSizeError("> 0", "0")
    occurrences = data.value_counts()
    mode_value = self.column.statistics.mode()
    mode_count = occurrences[mode_value]
    return mode_count / data.count()

def idness(self) -> float:
    """
    Compute the idness of the wrapped column: the count of unique values divided by the row count.

    Returns
    -------
    idness: float
        The ratio of unique values to rows

    Raises
    ------
    ColumnSizeError
        If the column contains no rows
    """
    data = self.column._data
    row_count = data.size
    if row_count == 0:
        raise ColumnSizeError("> 0", "0")
    return data.nunique() / row_count
39 changes: 39 additions & 0 deletions Runtime/safe-ds/safe_ds/data/_row.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Any

import pandas as pd
from IPython.core.display_functions import DisplayHandle, display
from safe_ds.exceptions import UnknownColumnNameError

from ._table_schema import TableSchema
Expand Down Expand Up @@ -49,6 +50,17 @@ def has_column(self, column_name: str) -> bool:
"""
return self.schema.has_column(column_name)

def get_column_names(self) -> list[str]:
    """
    Return the column names of this row's schema, in schema order.

    Returns
    -------
    result: list[str]
        The ordered column names
    """
    schema_entries = self.schema._schema
    return [*schema_entries.keys()]

def __eq__(self, other: typing.Any) -> bool:
if not isinstance(other, Row):
return NotImplemented
Expand All @@ -58,3 +70,30 @@ def __eq__(self, other: typing.Any) -> bool:

def __hash__(self) -> int:
    """
    Compute a hash from the values stored in this row.

    The underlying data is used as a pandas Series elsewhere in this class, and
    pandas Series objects are unhashable, so calling ``hash()`` on the data
    directly raises ``TypeError``. The per-element hashes provided by pandas are
    combined instead.

    Returns
    -------
    hash: int
        Hash value derived from the row's values, in order
    """
    # index=False: only the values contribute to the hash, not the labels.
    element_hashes = pd.util.hash_pandas_object(self._data, index=False)
    return hash(tuple(element_hashes.tolist()))

def __str__(self) -> str:
    """Return the string form of a one-row frame whose columns carry the schema's column names."""
    frame = self._data.to_frame().transpose()
    frame.columns = self.get_column_names()
    return str(frame)

def __repr__(self) -> str:
    """Return the repr of a one-row frame whose columns carry the schema's column names."""
    frame = self._data.to_frame().transpose()
    frame.columns = self.get_column_names()
    return repr(frame)

def _ipython_display_(self) -> DisplayHandle:
    """
    Display this Row as a one-row frame, for use in Jupyter Notebooks.

    Returns
    -------
    output: DisplayHandle
        The value returned by IPython's ``display`` for the rendered frame
    """
    frame = self._data.to_frame().transpose()
    frame.columns = self.get_column_names()
    row_count, column_count = frame.shape

    # Raise pandas' display limits to the frame's own size so nothing is truncated.
    with pd.option_context(
        "display.max_rows", row_count, "display.max_columns", column_count
    ):
        handle = display(frame)
    return handle
26 changes: 26 additions & 0 deletions Runtime/safe-ds/safe_ds/data/_supervised_dataset.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from IPython.core.display_functions import DisplayHandle

from ._column import Column
from ._table import Table

Expand Down Expand Up @@ -26,3 +28,27 @@ def feature_vectors(self) -> Table:
@property
def target_values(self) -> Column:
    """
    The target column of this dataset.

    Returns
    -------
    target_values: Column
        The column stored as the dataset's target
    """
    return self._y

def __repr__(self) -> str:
    """Return a header line naming the target column, followed by the repr of features plus target."""
    combined = self._X.add_column(self._y)
    header = "".join(("Target Column is '", self._y.name, "'\n"))
    return header + repr(combined)

def __str__(self) -> str:
    """Return a header line naming the target column, followed by the string form of features plus target."""
    combined = self._X.add_column(self._y)
    header = "".join(("Target Column is '", self._y.name, "'\n"))
    return header + str(combined)

def _ipython_display_(self) -> DisplayHandle:
    """
    Display this SupervisedDataset in Jupyter Notebooks.

    Prints a header line naming the target column, then delegates to the display
    hook of the table formed by adding the target column to the features.

    Returns
    -------
    output: DisplayHandle
        The value returned by the combined table's ``_ipython_display_``
    """
    combined = self._X.add_column(self._y)
    header = "".join(("Target Column is '", self._y.name, "'\n"))
    print(header)
    return combined._ipython_display_()
Loading