Skip to content

Commit

Permalink
update column extension function names and desc in readme
Browse files Browse the repository at this point in the history
enable UP007
  • Loading branch information
Fateme Tardasti authored and fpgmaas committed Jul 15, 2024
1 parent 407f463 commit 615db18
Show file tree
Hide file tree
Showing 9 changed files with 30 additions and 27 deletions.
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -476,41 +476,41 @@ from quinn.extensions import *

### Column Extensions

**isFalsy()**
**is_falsy()**

Returns `True` if `has_stuff` is `None` or `False`.
Returns a Column indicating, for each row, whether the value is `False` or NULL: `True` if `has_stuff` is `None` or `False`.

```python
source_df.withColumn("is_stuff_falsy", F.col("has_stuff").isFalsy())
```

**isTruthy()**
**is_truthy()**

Returns `True` unless `has_stuff` is `None` or `False`.
Calculates a boolean expression that is the opposite of is_falsy for the given Column: `True` unless `has_stuff` is `None` or `False`.

```python
source_df.withColumn("is_stuff_truthy", F.col("has_stuff").isTruthy())
```

**isNullOrBlank()**
**is_null_or_blank()**

Returns `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace).
Returns a Boolean value which expresses whether a given column is NULL or contains only blank characters: `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace).

```python
source_df.withColumn("is_blah_null_or_blank", F.col("blah").isNullOrBlank())
```

**isNotIn()**
**is_not_in()**

Returns `True` if `fun_thing` is not included in the `bobs_hobbies` list.
Checks whether a value is not in a list of values: returns `True` if `fun_thing` is not included in the `bobs_hobbies` list.

```python
source_df.withColumn("is_not_bobs_hobby", F.col("fun_thing").isNotIn(bobs_hobbies))
```

**nullBetween()**
**null_between()**

Returns `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populate, it will return `True` if `age` is lower than or equal to `upper_age`.
Checks whether a value is between two values in a null-friendly way: returns `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populated, it will return `True` if `age` is lower than or equal to `upper_age`.

```python
source_df.withColumn("is_between", F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age")))
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/create_benchmark_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from __future__ import annotations

import random
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING

from pyspark.sql import SparkSession
from pyspark.sql import functions as F # noqa: N812
Expand All @@ -38,7 +38,7 @@ def save_benchmark_df(
spark: SparkSession,
n: int,
data_label: str,
repartition_n: Optional[int] = None,
repartition_n: int | None = None,
) -> None:
"""Save a benchmark dataframe to disk."""
print(f"Generating benchmark df for n={n}")
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ ignore = [
"D205", # It is broken
"TCH003", # I have no idea what is it about
"PLC1901", # Strange thing
"UP007", # Not supported in py3.6
"UP038", # Not supported in all py versions
"SIM108", # Don't create long ternary operators
"PTH123", # Don't force use of Pathlib
"PTH207", # Don't force use of Pathlib
Expand All @@ -109,3 +107,6 @@ ignore = [
"quinn/__init__.py" = ["F401", "F403"]
"quinn/functions.py" = ["FBT003"]
"quinn/keyword_finder.py" = ["A002"]

[tool.ruff.isort]
required-imports = ["from __future__ import annotations"]
1 change: 1 addition & 0 deletions quinn/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# limitations under the License.

"""quinn API."""
from __future__ import annotations

from quinn.append_if_schema_identical import append_if_schema_identical
from quinn.dataframe_helpers import (
Expand Down
6 changes: 5 additions & 1 deletion quinn/append_if_schema_identical.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations

from pyspark.sql import DataFrame
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from pyspark.sql import DataFrame


class SchemaMismatchError(ValueError):
Expand Down
4 changes: 2 additions & 2 deletions quinn/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,9 +278,9 @@ def is_falsy(col: Column) -> Column:


def is_truthy(col: Column) -> Column:
"""Calculates a boolean expression that is the opposite of isFalsy for the given ``Column`` col.
"""Calculates a boolean expression that is the opposite of is_falsy for the given ``Column`` col.
:param Column col: The ``Column`` to calculate the opposite of isFalsy for.
:param Column col: The ``Column`` to calculate the opposite of is_falsy for.
:returns: A ``Column`` with the results of the calculation.
:rtype: Column
"""
Expand Down
10 changes: 4 additions & 6 deletions quinn/math.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,14 @@

from __future__ import annotations

from typing import Optional, Union

from pyspark.sql import Column
from pyspark.sql import functions as F # noqa: N812


def rand_laplace(
mu: Union[float, Column],
beta: Union[float, Column],
seed: Optional[int] = None,
mu: float | Column,
beta: float | Column,
seed: int | None = None,
) -> Column:
"""Generate random numbers from Laplace(mu, beta).
Expand All @@ -47,7 +45,7 @@ def rand_laplace(
def div_or_else(
cola: Column,
colb: Column,
default: Union[float, Column] = 0.0,
default: float | Column = 0.0,
) -> Column:
"""Return result of division of cola by colb or default if colb is zero.
Expand Down
3 changes: 1 addition & 2 deletions quinn/schema_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from __future__ import annotations

import json
from typing import Optional

from pyspark.sql import SparkSession
from pyspark.sql import types as T # noqa: N812
Expand Down Expand Up @@ -100,7 +99,7 @@ def schema_from_csv(spark: SparkSession, file_path: str) -> T.StructType: # noq
:rtype: pyspark.sql.types.StructType
"""

def _validate_json(metadata: Optional[str]) -> dict:
def _validate_json(metadata: str | None) -> dict:
if metadata is None:
return {}

Expand Down
4 changes: 2 additions & 2 deletions quinn/split_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING

from pyspark.sql.functions import length, split, trim, udf, when
from pyspark.sql.types import IntegerType
Expand All @@ -28,7 +28,7 @@ def split_col( # noqa: PLR0913
delimiter: str,
new_col_names: list[str],
mode: str = "permissive",
default: Optional[str] = None,
default: str | None = None,
) -> DataFrame:
"""Splits the given column based on the delimiter and creates new columns with the split values.
Expand Down

0 comments on commit 615db18

Please sign in to comment.