Skip to content

Commit

Permalink
feat: function to drop columns/rows with missing values (#97)
Browse files Browse the repository at this point in the history
Closes #10.

### Summary of Changes

Add methods to `Table` to handle missing values:
* `drop_columns_with_missing_values` returns a `Table` without the
columns that have missing values.
* `drop_rows_with_missing_values` returns a `Table` without the rows
that have missing values.

---------

Co-authored-by: lars-reimann <[email protected]>
  • Loading branch information
lars-reimann and lars-reimann authored Mar 27, 2023
1 parent 8f14d65 commit 05d771c
Show file tree
Hide file tree
Showing 3 changed files with 82 additions and 2 deletions.
30 changes: 28 additions & 2 deletions src/safeds/data/tabular/containers/_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,19 @@ def drop_columns(self, column_names: list[str]) -> Table:
)
return Table(transformed_data)

def drop_columns_with_missing_values(self) -> Table:
    """
    Build a new table that keeps only the columns free of missing values.

    Returns
    -------
    table : Table
        A table without the columns that contain missing values.
    """
    complete_columns = []
    for candidate in self.to_columns():
        # Skip every column that reports at least one missing value.
        if candidate.has_missing_values():
            continue
        complete_columns.append(candidate)
    return Table.from_columns(complete_columns)

def drop_columns_with_non_numerical_values(self) -> Table:
"""
Return a table without the columns that contain non-numerical values.
Expand All @@ -593,12 +606,24 @@ def drop_duplicate_rows(self) -> Table:
-------
result : Table
The table with the duplicate rows removed.
"""
df = self._data.drop_duplicates(ignore_index=True)
df.columns = self._schema.get_column_names()
return Table(df)

def drop_rows_with_missing_values(self) -> Table:
    """
    Return a table without the rows that contain missing values.

    The data of this table is not modified; the filtered rows are placed in a new table
    that reuses this table's schema.

    Returns
    -------
    table : Table
        A table without the rows that contain missing values.
    """
    # `dropna` is not in-place by default and already returns a new data frame, so a
    # defensive deep copy beforehand would only double the peak memory usage.
    result = self._data.dropna(axis="index")
    return Table(result, self._schema)

def drop_rows_with_outliers(self) -> Table:
"""
Remove all rows from the table that contain at least one outlier defined as having a value that has a distance
Expand Down Expand Up @@ -868,7 +893,8 @@ def split(self, percentage_in_first: float) -> typing.Tuple[Table, Table]:
Returns
-------
result : (Table, Table)
A tuple containing the two resulting tables. The first table has the specified size, the second table contains the rest of the data.
A tuple containing the two resulting tables. The first table has the specified size, the second table
contains the rest of the data.
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import numpy as np
import pandas as pd
from safeds.data.tabular.containers import Table
from safeds.data.tabular.typing import ColumnType, TableSchema


def test_drop_columns_with_missing_values_valid() -> None:
    """Columns with at least one missing value are removed; complete columns are kept."""
    frame = pd.DataFrame(
        data={
            "col1": [None, None, None, None],
            "col2": [1, 2, 3, None],
            "col3": [1, 2, 3, 4],
            "col4": [2, 3, 1, 4],
        }
    )
    remaining_names = Table(frame).drop_columns_with_missing_values().get_column_names()
    # col1 is entirely missing and col2 has a single gap, so only col3 and col4 survive.
    assert remaining_names == ["col3", "col4"]


def test_drop_columns_with_missing_values_empty() -> None:
    """A table without rows keeps its single column after the drop."""
    empty_table = Table(
        [],
        TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}),
    )
    remaining_names = empty_table.drop_columns_with_missing_values().get_column_names()
    assert remaining_names == ["col1"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import numpy as np
import pandas as pd
from safeds.data.tabular.containers import Table
from safeds.data.tabular.typing import ColumnType, TableSchema


def test_drop_rows_with_missing_values_valid() -> None:
    """Only the rows without any missing value remain after the drop."""
    frame = pd.DataFrame(
        data={
            "col1": [None, None, "C", "A"],
            "col2": [None, "Test1", "Test3", "Test1"],
            "col3": [None, 2, 3, 4],
            "col4": [None, 3, 1, 4],
        }
    )
    remaining_rows = Table(frame).drop_rows_with_missing_values().count_rows()
    # The first two rows each contain at least one None; the last two are complete.
    assert remaining_rows == 2


def test_drop_rows_with_missing_values_empty() -> None:
    """A table without rows keeps its single column after the drop."""
    empty_table = Table(
        [],
        TableSchema({"col1": ColumnType.from_numpy_dtype(np.dtype(float))}),
    )
    remaining_names = empty_table.drop_rows_with_missing_values().get_column_names()
    assert remaining_names == ["col1"]

0 comments on commit 05d771c

Please sign in to comment.