From e4e881be06e52ad9e1aa8da46d30fc4d41a3493b Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Thu, 16 Jan 2025 10:57:03 +0100 Subject: [PATCH] feat: add duckdb dataframe `drop_nulls` (#1811) * feat: add duckdb dataframe drop_nulls * rollback * test columns with spaces * single double quote * push list conversion to narwhals BaseFrame level --- narwhals/_arrow/dataframe.py | 3 +-- narwhals/_dask/dataframe.py | 3 +-- narwhals/_duckdb/dataframe.py | 9 +++++++++ narwhals/_pandas_like/dataframe.py | 3 +-- narwhals/_spark_like/dataframe.py | 2 +- narwhals/dataframe.py | 3 ++- tests/frame/drop_nulls_test.py | 21 ++++++++------------- 7 files changed, 23 insertions(+), 21 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index ed738647c..96a8ef717 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -395,10 +395,9 @@ def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 ) return self._from_native_frame(self._native_frame.drop(to_drop)) - def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: + def drop_nulls(self: Self, subset: list[str] | None) -> Self: if subset is None: return self._from_native_frame(self._native_frame.drop_null()) - subset = [subset] if isinstance(subset, str) else subset plx = self.__narwhals_namespace__() return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 89e1bc482..1a8efc446 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -149,10 +149,9 @@ def select( ) return self._from_native_frame(df) - def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: + def drop_nulls(self: Self, subset: list[str] | None) -> Self: if subset is None: return self._from_native_frame(self._native_frame.dropna()) - subset = [subset] if isinstance(subset, str) else subset plx = self.__narwhals_namespace__() return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index 98eca0bdb..a3c2798b1 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -321,3 +321,12 @@ def sort( ) ) return self._from_native_frame(result) + + def drop_nulls(self: Self, subset: list[str] | None) -> Self: + import duckdb + + rel = self._native_frame + subset_ = subset if subset is not None else rel.columns + keep_condition = " and ".join(f'"{col}" is not null' for col in subset_) + query = f"select * from rel where {keep_condition}" # noqa: S608 + return self._from_native_frame(duckdb.sql(query)) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 47c43a69a..fdd53b4a6 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -371,10 +371,9 @@ def select( ) return self._from_native_frame(df) - def drop_nulls(self, subset: str | list[str] | None) -> Self: + def drop_nulls(self, subset: list[str] | None) -> Self: if subset is None: return self._from_native_frame(self._native_frame.dropna(axis=0)) - subset = [subset] if isinstance(subset, str) else subset plx = self.__narwhals_namespace__() return self.filter(~plx.any_horizontal(plx.col(*subset).is_null())) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index 7e7f41ddb..34da43fde 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -183,7 +183,7 @@ def sort( sort_cols = [sort_f(col) for col, sort_f in zip(flat_by, sort_funcs)] return self._from_native_frame(self._native_frame.sort(*sort_cols)) - def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: + def drop_nulls(self: Self, subset: list[str] | None) -> Self: return self._from_native_frame(self._native_frame.dropna(subset=subset)) def rename(self: Self, mapping: dict[str, str]) -> Self: diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 1ae43028c..e45041679 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -107,7 +107,8 @@ def with_row_index(self, name: str = "index") -> Self: self._compliant_frame.with_row_index(name), ) - def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: + def drop_nulls(self: Self, subset: str | list[str] | None) -> Self: + subset = [subset] if isinstance(subset, str) else subset return self._from_compliant_dataframe( self._compliant_frame.drop_nulls(subset=subset), ) diff --git a/tests/frame/drop_nulls_test.py b/tests/frame/drop_nulls_test.py index 368ad6ba0..c49b17126 100644 --- a/tests/frame/drop_nulls_test.py +++ b/tests/frame/drop_nulls_test.py @@ -7,18 +7,16 @@ from tests.utils import assert_equal_data data = { - "a": [1.0, 2.0, None, 4.0], - "b": [None, 3.0, None, 5.0], + "alpha": [1.0, 2.0, None, 4.0], + "beta gamma": [None, 3.0, None, 5.0], } -def test_drop_nulls(constructor: Constructor, request: pytest.FixtureRequest) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) +def test_drop_nulls(constructor: Constructor) -> None: result = nw.from_native(constructor(data)).drop_nulls() expected = { - "a": [2.0, 4.0], - "b": [3.0, 5.0], + "alpha": [2.0, 4.0], + "beta gamma": [3.0, 5.0], } assert_equal_data(result, expected) @@ -26,18 +24,15 @@ def test_drop_nulls(constructor: Constructor, request: pytest.FixtureRequest) -> @pytest.mark.parametrize( ("subset", "expected"), [ - ("a", {"a": [1, 2.0, 4.0], "b": [None, 3.0, 5.0]}), - (["a"], {"a": [1, 2.0, 4.0], "b": [None, 3.0, 5.0]}), - (["a", "b"], {"a": [2.0, 4.0], "b": [3.0, 5.0]}), + ("alpha", {"alpha": [1, 2.0, 4.0], "beta gamma": [None, 3.0, 5.0]}), + (["alpha"], {"alpha": [1, 2.0, 4.0], "beta gamma": [None, 3.0, 5.0]}), + (["alpha", "beta gamma"], {"alpha": [2.0, 4.0], "beta gamma": [3.0, 5.0]}), ], ) def test_drop_nulls_subset( constructor: Constructor, subset: str | list[str], expected: dict[str, float], - request: pytest.FixtureRequest, ) -> None: - if "duckdb" in str(constructor): - request.applymarker(pytest.mark.xfail) result = nw.from_native(constructor(data)).drop_nulls(subset=subset) assert_equal_data(result, expected)