Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: added Column.summarize_statistics() #715

Merged
merged 8 commits into from
May 4, 2024
33 changes: 33 additions & 0 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,39 @@ def transform(self, transformer: Callable[[T], R]) -> Column[R]:
# Statistics
# ------------------------------------------------------------------------------------------------------------------

def summarize_statistics(self) -> Table:
    """
    Summarize this column as a table of statistical key values.

    The column is wrapped in a single-column table, and that table computes the summary. The result has a
    "metric" column naming each statistic and one column, named after this column, holding the values.
    The original Column is not modified.

    Returns
    -------
    statistics:
        The table with statistics.

    Examples
    --------
    >>> from safeds.data.tabular.containers import Column
    >>> column = Column("a", [1, 3])
    >>> column.summarize_statistics()
                     metric                   a
    0               minimum                   1
    1               maximum                   3
    2                  mean                 2.0
    3                  mode              [1, 3]
    4                median                 2.0
    5              variance                 2.0
    6    standard deviation  1.4142135623730951
    7   missing value count                   0
    8   missing value ratio                 0.0
    9                idness                 1.0
    10            stability                 0.5
    """
    # Imported locally rather than at module level, exactly as in the original implementation.
    from safeds.data.tabular.containers import Table

    single_column_table = Table({self._name: self._data})
    return single_column_table.summarize_statistics()

def correlation_with(self, other_column: Column) -> float:
"""
Calculate Pearson correlation between this and another column. Both columns have to be numerical.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from statistics import stdev

import pytest
from safeds.data.tabular.containers import Column, Table


def _expected_summary(column_name: str, values: list[str]) -> Table:
    """
    Build the expected statistics table for a single column.

    The "metric" column lists the same eleven metric names in the same order for every test case;
    `values` supplies the matching stringified statistics, stored under `column_name`.
    """
    metric_names = [
        "minimum",
        "maximum",
        "mean",
        "mode",
        "median",
        "variance",
        "standard deviation",
        "missing value count",
        "missing value ratio",
        "idness",
        "stability",
    ]
    return Table({"metric": metric_names, column_name: values})


@pytest.mark.parametrize(
    ("column", "expected"),
    [
        pytest.param(
            Column("col1", [1, 2, 1]),
            _expected_summary(
                "col1",
                [
                    "1",
                    "2",
                    str(4.0 / 3),
                    "[1]",
                    "1.0",
                    str(1.0 / 3),
                    str(stdev([1, 2, 1])),
                    "0",
                    "0.0",
                    str(2.0 / 3),
                    str(2.0 / 3),
                ],
            ),
            id="Column of integers",
        ),
        pytest.param(
            Column("col1", ["a", "b", "c"]),
            _expected_summary(
                "col1",
                [
                    "-",
                    "-",
                    "-",
                    "['a', 'b', 'c']",
                    "-",
                    "-",
                    "-",
                    "0",
                    "0.0",
                    "1.0",
                    str(1.0 / 3),
                ],
            ),
            id="Column of characters",
        ),
        pytest.param(
            Column("col", [None, None]),
            _expected_summary(
                "col",
                ["-", "-", "-", "[]", "-", "-", "-", "2", "1.0", "0.0", "-"],
            ),
            id="Column of None",
        ),
    ],
)
def test_should_summarize_statistics(column: Column, expected: Table) -> None:
    """Check that a column's statistics summary matches the expected table in both schema and content."""
    # Compare schemas first so a type mismatch is reported separately from a value mismatch.
    summary_schema = column.summarize_statistics().schema
    assert summary_schema == expected.schema
    assert column.summarize_statistics() == expected