rapidsai · rapids-bot · Jun 6, 2024 · Jun 3, 2024 · May 22, 2024 · May 22, 2024
@@ -5,8 +5,8 @@
 
 from __future__ import annotations
 
-__all__: list[str] = ["DataFrame", "Column", "Scalar"]
+__all__: list[str] = ["DataFrame", "Column", "NamedColumn", "Scalar"]
 
-from cudf_polars.containers.column import Column
+from cudf_polars.containers.column import Column, NamedColumn
 from cudf_polars.containers.dataframe import DataFrame
 from cudf_polars.containers.scalar import Scalar
@@ -13,24 +13,29 @@
 if TYPE_CHECKING:
     from typing_extensions import Self
 
-__all__: list[str] = ["Column"]
+__all__: list[str] = ["Column", "NamedColumn"]
 
 
 class Column:
-    """A column, a name, and sortedness."""
+    """A column with sortedness metadata."""
 
     obj: plc.Column
-    name: str
     is_sorted: plc.types.Sorted
     order: plc.types.Order
     null_order: plc.types.NullOrder
 
-    def __init__(self, column: plc.Column, name: str):
+    def __init__(
+        self,
+        column: plc.Column,
+        *,  # sortedness metadata must be passed by keyword
+        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
+        order: plc.types.Order = plc.types.Order.ASCENDING,
+        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
+    ):
         self.obj = column
-        self.name = name
-        self.is_sorted = plc.types.Sorted.NO
-        self.order = plc.types.Order.ASCENDING
-        self.null_order = plc.types.NullOrder.BEFORE
+        self.is_sorted = is_sorted
+        self.order = order
+        self.null_order = null_order
 
     def sorted_like(self, like: Column, /) -> Self:
         """
@@ -81,22 +86,20 @@ def set_sorted(
         self.null_order = null_order
         return self
 
-    def copy(self, *, new_name: str | None = None) -> Self:
+    def copy(self) -> Self:
         """
-        Return a shallow copy of the column.
-
-        Parameters
-        ----------
-        new_name
-            Optional new name for the copied column.
+        Return a shallow copy of the column.
 
         Returns
         -------
         New column sharing data with self.
         """
         return type(self)(
-            self.obj, self.name if new_name is None else new_name
-        ).sorted_like(self)
+            self.obj,  # same underlying plc.Column, no data copy
+            is_sorted=self.is_sorted,
+            order=self.order,
+            null_order=self.null_order,
+        )
 
     def mask_nans(self) -> Self:
         """Return a copy of self with nans masked out."""
@@ -117,3 +120,44 @@ def nan_count(self) -> int:
                 plc.DataType(plc.TypeId.INT32),
             )
         ).as_py()
+
+
+class NamedColumn(Column):
+    """A column with sortedness metadata and a name."""
+
+    name: str
+
+    def __init__(
+        self,
+        column: plc.Column,
+        name: str,
+        *,  # sortedness metadata must be passed by keyword
+        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
+        order: plc.types.Order = plc.types.Order.ASCENDING,
+        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
+    ) -> None:
+        super().__init__(
+            column, is_sorted=is_sorted, order=order, null_order=null_order
+        )
+        self.name = name
+
+    def copy(self, *, new_name: str | None = None) -> Self:
+        """
+        Return a shallow copy of the column.
+
+        Parameters
+        ----------
+        new_name
+            Optional new name for the copy; if None, the current name is kept.
+
+        Returns
+        -------
+        New column sharing data with self.
+        """
+        return type(self)(
+            self.obj,
+            self.name if new_name is None else new_name,
+            is_sorted=self.is_sorted,
+            order=self.order,
+            null_order=self.null_order,
+        )
@@ -12,7 +12,7 @@
 
 import cudf._lib.pylibcudf as plc
 
-from cudf_polars.containers.column import Column
+from cudf_polars.containers.column import NamedColumn
 
 if TYPE_CHECKING:
     from collections.abc import Mapping, Sequence, Set
@@ -21,7 +21,7 @@
 
     import cudf
 
-    from cudf_polars.containers.scalar import Scalar
+    from cudf_polars.containers import Column
 
 
 __all__: list[str] = ["DataFrame"]
@@ -30,26 +30,20 @@
 class DataFrame:
     """A representation of a dataframe."""
 
-    columns: list[Column]
-    scalars: list[Scalar]
+    columns: list[NamedColumn]
     table: plc.Table | None
 
-    def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None:
+    def __init__(self, columns: Sequence[NamedColumn]) -> None:
         self.columns = list(columns)
         self._column_map = {c.name: c for c in self.columns}
-        self.scalars = list(scalars)
-        if len(scalars) == 0:
-            self.table = plc.Table([c.obj for c in columns])
-        else:
-            self.table = None
+        self.table = plc.Table([c.obj for c in columns])  # always built
 
     def copy(self) -> Self:
         """Return a shallow copy of self."""
-        return type(self)(self.columns, self.scalars)
+        return type(self)(self.columns)  # new list, same column objects
 
     def to_polars(self) -> pl.DataFrame:
         """Convert to a polars DataFrame."""
-        assert len(self.scalars) == 0
         return pl.from_arrow(
             plc.interop.to_arrow(
                 self.table,
@@ -83,8 +77,10 @@ def num_rows(self) -> int:
     def from_cudf(cls, df: cudf.DataFrame) -> Self:
         """Create from a cudf dataframe."""
         return cls(
-            [Column(c.to_pylibcudf(mode="read"), name) for name, c in df._data.items()],
-            [],
+            [
+                NamedColumn(c.to_pylibcudf(mode="read"), name)
+                for name, c in df._data.items()
+            ]
         )
 
     @classmethod
@@ -108,10 +104,12 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
         ValueError if the number of provided names does not match the
         number of columns in the table.
         """
-        # TODO: strict=True when we drop py39
         if table.num_columns() != len(names):
             raise ValueError("Mismatching name and table length.")
-        return cls([Column(c, name) for c, name in zip(table.columns(), names)], [])
+        return cls(
+            # TODO: strict=True when we drop py39
+            [NamedColumn(c, name) for c, name in zip(table.columns(), names)]
+        )
 
     def sorted_like(
         self, like: DataFrame, /, *, subset: Set[str] | None = None
@@ -139,11 +137,12 @@ def sorted_like(
         subset = self.column_names_set if subset is None else subset
         self.columns = [
             c.sorted_like(other) if c.name in subset else c
+            # TODO: strict=True when we drop py39
             for c, other in zip(self.columns, like.columns)
         ]
         return self
 
-    def with_columns(self, columns: Sequence[Column]) -> Self:
+    def with_columns(self, columns: Sequence[NamedColumn]) -> Self:
         """
         Return a new dataframe with extra columns.
 
@@ -160,35 +159,31 @@ def with_columns(self, columns: Sequence[Column]) -> Self:
         -----
         If column names overlap, newer names replace older ones.
         """
-        return type(self)([*self.columns, *columns], self.scalars)
+        return type(self)([*self.columns, *columns])
 
     def discard_columns(self, names: Set[str]) -> Self:
         """Drop columns by name."""
-        return type(self)(
-            [c for c in self.columns if c.name not in names], self.scalars
-        )
+        return type(self)([c for c in self.columns if c.name not in names])
 
     def select(self, names: Sequence[str]) -> Self:
         """Select columns by name returning DataFrame."""
         want = set(names)
         if not want.issubset(self.column_names_set):
             raise ValueError("Can't select missing names")
-        return type(self)([self._column_map[name] for name in names], self.scalars)
+        return type(self)([self._column_map[name] for name in names])
 
-    def replace_columns(self, *columns: Column) -> Self:
+    def replace_columns(self, *columns: NamedColumn) -> Self:
         """Return a new dataframe with columns replaced by name."""
         new = {c.name: c for c in columns}
         if not set(new).issubset(self.column_names_set):
             raise ValueError("Cannot replace with non-existing names")
-        return type(self)([new.get(c.name, c) for c in self.columns], self.scalars)
+        return type(self)([new.get(c.name, c) for c in self.columns])
 
     def rename_columns(self, mapping: Mapping[str, str]) -> Self:
         """Rename some columns."""
-        return type(self)(
-            [c.copy(new_name=mapping.get(c.name)) for c in self.columns], self.scalars
-        )
+        return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns])
 
-    def select_columns(self, names: Set[str]) -> list[Column]:
+    def select_columns(self, names: Set[str]) -> list[NamedColumn]:
         """Select columns by name."""
         return [c for c in self.columns if c.name in names]