Skip to content

Commit

Permalink
feat: string operations on cells (#791)
Browse files Browse the repository at this point in the history
### Summary of Changes

Add a first batch of string operations on cells:

* `contains`
* `ends_with`
* `index_of`
* `length`
* `replace`
* `starts_with`
* `substring`
* `to_date`
* `to_datetime`
* `to_float`
* `to_int`
* `to_lowercase`
* `to_uppercase`
* `trim`
* `trim_end`
* `trim_start`

---------

Co-authored-by: megalinter-bot <[email protected]>
  • Loading branch information
lars-reimann and megalinter-bot authored May 19, 2024
1 parent 4137131 commit 4a17f76
Show file tree
Hide file tree
Showing 72 changed files with 2,166 additions and 267 deletions.
7 changes: 6 additions & 1 deletion src/resources/from_json_file.json
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
{ "a": { "0": 1, "1": 2, "2": 3 }, "b": { "0": 4, "1": 5, "2": 6 } }
{
"columns": [
{ "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] },
{ "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] }
]
}
6 changes: 0 additions & 6 deletions src/resources/from_json_file_2.json

This file was deleted.

7 changes: 6 additions & 1 deletion src/resources/to_json_file.json
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
{ "a": { "0": 1, "1": 2, "2": 3 }, "b": { "0": 4, "1": 5, "2": 6 } }
{
"columns": [
{ "name": "a", "datatype": "Int64", "bit_settings": "", "values": [1, 2, 3] },
{ "name": "b", "datatype": "Int64", "bit_settings": "", "values": [4, 5, 6] }
]
}
6 changes: 0 additions & 6 deletions src/resources/to_json_file_2.json

This file was deleted.

3 changes: 3 additions & 0 deletions src/safeds/data/tabular/containers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ._cell import Cell
from ._column import Column
from ._row import Row
from ._string_cell import StringCell
from ._table import Table

apipkg.initpkg(
Expand All @@ -16,6 +17,7 @@
"Cell": "._cell:Cell",
"Column": "._column:Column",
"Row": "._row:Row",
"StringCell": "._string_cell:StringCell",
"Table": "._table:Table",
},
)
Expand All @@ -24,5 +26,6 @@
"Cell",
"Column",
"Row",
"StringCell",
"Table",
]
79 changes: 45 additions & 34 deletions src/safeds/data/tabular/containers/_cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
if TYPE_CHECKING:
import polars as pl

from ._string_cell import StringCell

T_co = TypeVar("T_co", covariant=True)
P = TypeVar("P")
P_contra = TypeVar("P_contra", contravariant=True)
R_co = TypeVar("R_co", covariant=True)


Expand Down Expand Up @@ -109,10 +111,10 @@ def __mul__(self, other: Any) -> Cell[R_co]: ...
def __rmul__(self, other: Any) -> Cell[R_co]: ...

@abstractmethod
def __pow__(self, other: float | Cell[P]) -> Cell[R_co]: ...
def __pow__(self, other: float | Cell[P_contra]) -> Cell[R_co]: ...

@abstractmethod
def __rpow__(self, other: float | Cell[P]) -> Cell[R_co]: ...
def __rpow__(self, other: float | Cell[P_contra]) -> Cell[R_co]: ...

@abstractmethod
def __sub__(self, other: Any) -> Cell[R_co]: ...
Expand All @@ -134,6 +136,15 @@ def __hash__(self) -> int: ...
@abstractmethod
def __sizeof__(self) -> int: ...

# ------------------------------------------------------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------------------------------------------------------

@property
@abstractmethod
def str(self) -> StringCell:
    """
    Namespace for operations on strings.

    Abstract: concrete cell implementations must provide this property.

    Returns
    -------
    string_cell:
        A `StringCell` exposing the string operations for this cell.
    """

# ------------------------------------------------------------------------------------------------------------------
# Boolean operations
# ------------------------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -372,6 +383,36 @@ def add(self, other: Any) -> Cell[R_co]:
"""
return self.__add__(other)

def div(self, other: Any) -> Cell[R_co]:
    """
    Divide this cell by a value.

    Calling this method is the named counterpart of the `/` operator; it forwards to `__truediv__`.

    Parameters
    ----------
    other:
        The divisor.

    Returns
    -------
    result:
        Whatever `__truediv__` produces for `other`.

    Examples
    --------
    >>> from safeds.data.tabular.containers import Column
    >>> column = Column("example", [6, 8])
    >>> column.transform(lambda cell: cell.div(2))
    +---------+
    | example |
    | ---     |
    | f64     |
    +=========+
    | 3.00000 |
    | 4.00000 |
    +---------+

    >>> column.transform(lambda cell: cell / 2)
    +---------+
    | example |
    | ---     |
    | f64     |
    +=========+
    | 3.00000 |
    | 4.00000 |
    +---------+
    """
    quotient = self.__truediv__(other)
    return quotient

def mod(self, other: Any) -> Cell[R_co]:
"""
Perform a modulo operation. This is equivalent to the `%` operator.
Expand Down Expand Up @@ -432,7 +473,7 @@ def mul(self, other: Any) -> Cell[R_co]:
"""
return self.__mul__(other)

def pow(self, other: float | Cell[P]) -> Cell[R_co]:
def pow(self, other: float | Cell[P_contra]) -> Cell[R_co]:
"""
Raise to a power. This is equivalent to the `**` operator.
Expand Down Expand Up @@ -492,36 +533,6 @@ def sub(self, other: Any) -> Cell[R_co]:
"""
return self.__sub__(other)

def div(self, other: Any) -> Cell[R_co]:
"""
Divide by a value. This is equivalent to the `/` operator.
Examples
--------
>>> from safeds.data.tabular.containers import Column
>>> column = Column("example", [6, 8])
>>> column.transform(lambda cell: cell.div(2))
+---------+
| example |
| --- |
| f64 |
+=========+
| 3.00000 |
| 4.00000 |
+---------+
>>> column.transform(lambda cell: cell / 2)
+---------+
| example |
| --- |
| f64 |
+=========+
| 3.00000 |
| 4.00000 |
+---------+
"""
return self.__truediv__(other)

# ------------------------------------------------------------------------------------------------------------------
# Comparison operations
# ------------------------------------------------------------------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions src/safeds/data/tabular/containers/_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,8 +756,8 @@ def correlation_with(self, other: Column) -> float:
>>> column1.correlation_with(column2)
1.0
>>> column4 = Column("test", [3, 2, 1])
>>> column1.correlation_with(column4)
>>> column3 = Column("test", [3, 2, 1])
>>> column1.correlation_with(column3)
-1.0
"""
import polars as pl
Expand Down
26 changes: 23 additions & 3 deletions src/safeds/data/tabular/containers/_lazy_cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
if TYPE_CHECKING:
import polars as pl

from ._string_cell import StringCell

T = TypeVar("T")
P = TypeVar("P")
R = TypeVar("R")
Expand All @@ -31,7 +33,9 @@ def __init__(self, expression: pl.Expr) -> None:
# "Boolean" operators (actually bitwise) -----------------------------------

def __invert__(self) -> Cell[bool]:
return _wrap(self._expression.__invert__())
import polars as pl

return _wrap(self._expression.cast(pl.Boolean).__invert__())

def __and__(self, other: bool | Cell[bool]) -> Cell[bool]:
return _wrap(self._expression.__and__(other))
Expand Down Expand Up @@ -83,10 +87,16 @@ def __abs__(self) -> Cell[R]:
return _wrap(self._expression.__abs__())

def __ceil__(self) -> Cell[R]:
return _wrap(self._expression.ceil())
import polars as pl

# polars does not yet implement ceil for integers
return _wrap(self._expression.cast(pl.Float64).ceil())

def __floor__(self) -> Cell[R]:
return _wrap(self._expression.floor())
import polars as pl

# polars does not yet implement floor for integers
return _wrap(self._expression.cast(pl.Float64).floor())

def __neg__(self) -> Cell[R]:
return _wrap(self._expression.__neg__())
Expand Down Expand Up @@ -166,6 +176,16 @@ def __hash__(self) -> int:
def __sizeof__(self) -> int:
return self._expression.__sizeof__()

# ------------------------------------------------------------------------------------------------------------------
# Properties
# ------------------------------------------------------------------------------------------------------------------

@property
def str(self) -> StringCell:
    """Namespace for string operations, built around this cell's expression."""
    # Imported inside the property because a module-level import would be circular.
    from ._lazy_string_cell import _LazyStringCell

    string_namespace = _LazyStringCell(self._expression)
    return string_namespace

# ------------------------------------------------------------------------------------------------------------------
# Internal
# ------------------------------------------------------------------------------------------------------------------
Expand Down
101 changes: 101 additions & 0 deletions src/safeds/data/tabular/containers/_lazy_string_cell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from safeds._utils import _structural_hash
from safeds._validation import _check_bounds, _ClosedBound

from ._lazy_cell import _LazyCell
from ._string_cell import StringCell

if TYPE_CHECKING:
import datetime

import polars as pl

from ._cell import Cell


class _LazyStringCell(StringCell):
    """
    String namespace of a cell, backed by a polars expression.

    Each operation applies the matching polars call to the wrapped expression and wraps the resulting expression in a
    `_LazyCell`.
    """

    # ------------------------------------------------------------------------------------------------------------------
    # Dunder methods
    # ------------------------------------------------------------------------------------------------------------------

    def __init__(self, expression: pl.Expr) -> None:
        # The polars expression that every string operation below is applied to.
        self._expression: pl.Expr = expression

    def __hash__(self) -> int:
        """Hash the serialized form of the wrapped expression."""
        serialized = self._expression.meta.serialize()
        return _structural_hash(serialized)

    def __sizeof__(self) -> int:
        """Report the size of the wrapped expression."""
        expression = self._expression
        return expression.__sizeof__()

    # ------------------------------------------------------------------------------------------------------------------
    # String operations
    # ------------------------------------------------------------------------------------------------------------------

    def contains(self, substring: str) -> Cell[bool]:
        """Check whether the string contains `substring` (passed to polars with `literal=True`)."""
        found = self._expression.str.contains(substring, literal=True)
        return _LazyCell(found)

    def ends_with(self, suffix: str) -> Cell[bool]:
        """Check whether the string ends with `suffix`."""
        matches = self._expression.str.ends_with(suffix)
        return _LazyCell(matches)

    def index_of(self, substring: str) -> Cell[int | None]:
        """Look up the position of `substring` (passed to polars with `literal=True`)."""
        position = self._expression.str.find(substring, literal=True)
        return _LazyCell(position)

    def length(self, optimize_for_ascii: bool = False) -> Cell[int]:
        """
        Get the length of the string.

        When `optimize_for_ascii` is truthy the byte length (`len_bytes`) is used, otherwise the character count
        (`len_chars`).
        """
        string_ops = self._expression.str
        measured = string_ops.len_bytes() if optimize_for_ascii else string_ops.len_chars()
        return _LazyCell(measured)

    def replace(self, old: str, new: str) -> Cell[str]:
        """Replace all occurrences of `old` with `new` (passed to polars with `literal=True`)."""
        replaced = self._expression.str.replace_all(old, new, literal=True)
        return _LazyCell(replaced)

    def starts_with(self, prefix: str) -> Cell[bool]:
        """Check whether the string starts with `prefix`."""
        matches = self._expression.str.starts_with(prefix)
        return _LazyCell(matches)

    def substring(self, start: int = 0, length: int | None = None) -> Cell[str]:
        """
        Slice the string, beginning at `start` and spanning `length`.

        `length` is validated against a closed lower bound of 0 before the slice expression is built.
        """
        _check_bounds("length", length, lower_bound=_ClosedBound(0))

        sliced = self._expression.str.slice(start, length)
        return _LazyCell(sliced)

    def to_date(self) -> Cell[datetime.date | None]:
        """Convert the string to a date using the format `%F`, with non-strict parsing."""
        # NOTE(review): the `None` in the return type suggests unparsable values become null - confirm against polars.
        parsed = self._expression.str.to_date(format="%F", strict=False)
        return _LazyCell(parsed)

    def to_datetime(self) -> Cell[datetime.datetime | None]:
        """Convert the string to a datetime using the format `%+`, with non-strict parsing."""
        parsed = self._expression.str.to_datetime(format="%+", strict=False)
        return _LazyCell(parsed)

    def to_float(self) -> Cell[float | None]:
        """Convert the string to a float via a non-strict cast to `Float64`."""
        import polars as pl

        as_float = self._expression.cast(pl.Float64, strict=False)
        return _LazyCell(as_float)

    def to_int(self, *, base: int = 10) -> Cell[int | None]:
        """Convert the string to an integer in the given `base`, with non-strict parsing."""
        as_int = self._expression.str.to_integer(base=base, strict=False)
        return _LazyCell(as_int)

    def to_lowercase(self) -> Cell[str]:
        """Convert the string to lowercase."""
        lowered = self._expression.str.to_lowercase()
        return _LazyCell(lowered)

    def to_uppercase(self) -> Cell[str]:
        """Convert the string to uppercase."""
        raised = self._expression.str.to_uppercase()
        return _LazyCell(raised)

    def trim(self) -> Cell[str]:
        """Strip characters from both ends of the string (polars `strip_chars` with its default argument)."""
        stripped = self._expression.str.strip_chars()
        return _LazyCell(stripped)

    def trim_end(self) -> Cell[str]:
        """Strip characters from the end of the string (polars `strip_chars_end` with its default argument)."""
        stripped = self._expression.str.strip_chars_end()
        return _LazyCell(stripped)

    def trim_start(self) -> Cell[str]:
        """Strip characters from the start of the string (polars `strip_chars_start` with its default argument)."""
        stripped = self._expression.str.strip_chars_start()
        return _LazyCell(stripped)

    # ------------------------------------------------------------------------------------------------------------------
    # Internal
    # ------------------------------------------------------------------------------------------------------------------

    def _equals(self, other: object) -> bool:
        """
        Compare the wrapped expressions of two lazy string cells.

        Returns `NotImplemented` when `other` is not a `_LazyStringCell`.
        """
        # Identity implies `other` is a `_LazyStringCell`, so checking it first does not change the outcome.
        if self is other:
            return True
        if isinstance(other, _LazyStringCell):
            return self._expression.meta.eq(other._expression.meta)
        return NotImplemented
Loading

0 comments on commit 4a17f76

Please sign in to comment.