
Commit 16c64c9
feat: add spark support
vorel99 committed Dec 16, 2023
1 parent c0d29b4 commit 16c64c9
Showing 17 changed files with 105 additions and 194 deletions.
4 changes: 0 additions & 4 deletions src/ydata_profiling/model/pandas/__init__.py
@@ -3,10 +3,8 @@
     dataframe_pandas,
     describe_boolean_pandas,
     describe_categorical_pandas,
-    describe_counts_pandas,
     describe_date_pandas,
     describe_file_pandas,
-    describe_generic_pandas,
     describe_image_pandas,
     describe_numeric_pandas,
     describe_path_pandas,
@@ -27,10 +25,8 @@
     "dataframe_pandas",
     "describe_boolean_pandas",
     "describe_categorical_pandas",
-    "describe_counts_pandas",
     "describe_date_pandas",
     "describe_file_pandas",
-    "describe_generic_pandas",
     "describe_image_pandas",
     "describe_numeric_pandas",
     "describe_path_pandas",
64 changes: 0 additions & 64 deletions src/ydata_profiling/model/pandas/describe_counts_pandas.py

This file was deleted.

37 changes: 0 additions & 37 deletions src/ydata_profiling/model/pandas/describe_generic_pandas.py

This file was deleted.

4 changes: 0 additions & 4 deletions src/ydata_profiling/model/spark/__init__.py
@@ -3,9 +3,7 @@
     dataframe_spark,
     describe_boolean_spark,
     describe_categorical_spark,
-    describe_counts_spark,
     describe_date_spark,
-    describe_generic_spark,
     describe_numeric_spark,
     describe_supported_spark,
     duplicates_spark,
@@ -21,9 +19,7 @@
     "dataframe_spark",
     "describe_boolean_spark",
     "describe_categorical_spark",
-    "describe_counts_spark",
     "describe_date_spark",
-    "describe_generic_spark",
     "describe_numeric_spark",
     "describe_supported_spark",
     "duplicates_spark",
1 change: 1 addition & 0 deletions src/ydata_profiling/model/spark/correlations_spark.py
@@ -1,4 +1,5 @@
 """Correlations between variables."""
+
 from typing import Optional
 
 import pandas as pd
7 changes: 4 additions & 3 deletions src/ydata_profiling/model/spark/describe_boolean_spark.py
@@ -4,12 +4,13 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_boolean_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_boolean_1d.register
 def describe_boolean_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a boolean series.
 
     Args:
@@ -20,7 +21,7 @@ def describe_boolean_1d_spark(
         A dict containing calculated series description values.
     """
 
-    value_counts = summary["value_counts"]
+    value_counts = summary.value_counts
 
     # get the most common boolean value and its frequency
     top = value_counts.first()
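The recurring change across these describe_*_spark modules is that summary is no longer a plain dict but a VarDescription object: shared counts are read as attributes (summary.value_counts, summary.n) while per-type statistics are still written by key. A minimal sketch of such a hybrid container, assuming the real class in ydata_profiling.model.var_description.default differs in its details:

from dataclasses import dataclass, field
from typing import Any, Dict

@dataclass
class VarDescriptionSketch:
    """Hypothetical stand-in for VarDescription (illustrative only)."""

    n: int  # total number of observations
    value_counts: Any  # Spark DataFrame of (value, count) rows
    value_counts_without_nan: Any  # same, with nulls dropped
    extras: Dict[str, Any] = field(default_factory=dict)

    def __getitem__(self, key: str) -> Any:
        return self.extras[key]  # keeps summary["sum"] reads working

    def __setitem__(self, key: str, value: Any) -> None:
        self.extras[key] = value  # keeps summary["sum"] = ... writes working

    def update(self, other: Dict[str, Any]) -> None:
        self.extras.update(other)  # used by summary.update(histogram_compute(...))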
5 changes: 3 additions & 2 deletions src/ydata_profiling/model/spark/describe_categorical_spark.py
@@ -4,12 +4,13 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_categorical_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_categorical_1d.register
 def describe_categorical_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a categorical series.
 
     Args:
5 changes: 3 additions & 2 deletions src/ydata_profiling/model/spark/describe_date_spark.py
@@ -6,6 +6,7 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_date_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 def date_stats_spark(df: DataFrame, summary: dict) -> dict:
@@ -21,8 +22,8 @@ def date_stats_spark(df: DataFrame, summary: dict) -> dict:
 
 @describe_date_1d.register
 def describe_date_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a date series.
 
     Args:
32 changes: 0 additions & 32 deletions src/ydata_profiling/model/spark/describe_generic_spark.py

This file was deleted.

21 changes: 11 additions & 10 deletions src/ydata_profiling/model/spark/describe_numeric_spark.py
@@ -9,9 +9,10 @@
     describe_numeric_1d,
     histogram_compute,
 )
+from ydata_profiling.model.var_description.default import VarDescription
 
 
-def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
+def numeric_stats_spark(df: DataFrame, summary: VarDescription) -> dict:
     column = df.columns[0]
 
     expr = [
@@ -29,8 +30,8 @@ def numeric_stats_spark(df: DataFrame, summary: dict) -> dict:
 
 @describe_numeric_1d.register
 def describe_numeric_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
"""Describe a boolean series.
Args:
@@ -51,7 +52,7 @@ def describe_numeric_1d_spark(
     summary["kurtosis"] = stats["kurtosis"]
     summary["sum"] = stats["sum"]
 
-    value_counts = summary["value_counts"]
+    value_counts = summary.value_counts
 
     n_infinite = (
         value_counts.where(F.col(df.columns[0]).isin([np.inf, -np.inf]))
@@ -106,12 +107,12 @@ def describe_numeric_1d_spark(
     ).stat.approxQuantile("abs_dev", [0.5], quantile_threshold)[0]
 
     # FIXME: move to fmt
-    summary["p_negative"] = summary["n_negative"] / summary["n"]
+    summary["p_negative"] = summary["n_negative"] / summary.n
     summary["range"] = summary["max"] - summary["min"]
     summary["iqr"] = summary["75%"] - summary["25%"]
     summary["cv"] = summary["std"] / summary["mean"] if summary["mean"] else np.NaN
-    summary["p_zeros"] = summary["n_zeros"] / summary["n"]
-    summary["p_infinite"] = summary["n_infinite"] / summary["n"]
+    summary["p_zeros"] = summary["n_zeros"] / summary.n
+    summary["p_infinite"] = summary["n_infinite"] / summary.n
 
     # TODO - enable this feature
     # because spark doesn't have an indexing system, there isn't really the idea of monotonic increase/decrease
@@ -124,14 +125,14 @@
     # display in pandas display
     # the alternative is to do this in spark natively, but it is not trivial
     infinity_values = [np.inf, -np.inf]
-    infinity_index = summary["value_counts_without_nan"].index.isin(infinity_values)
+    infinity_index = summary.value_counts_without_nan.index.isin(infinity_values)
 
     summary.update(
         histogram_compute(
             config,
-            summary["value_counts_without_nan"][~infinity_index].index.values,
+            summary.value_counts_without_nan[~infinity_index].index.values,
             summary["n_distinct"],
-            weights=summary["value_counts_without_nan"][~infinity_index].values,
+            weights=summary.value_counts_without_nan[~infinity_index].values,
         )
     )
 
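Only the shared counts moved onto the VarDescription object; the ratios in the FIXME block above remain plain arithmetic over aggregates the Spark pass already produced. Restated as a self-contained helper (hypothetical, for checking expected values; not part of the codebase):

import numpy as np

def derived_numeric_stats(s: dict) -> dict:
    """Recompute the ratio block from describe_numeric_1d_spark.

    Assumes the Spark pass already produced: n, n_negative, n_zeros,
    n_infinite, min, max, mean, std, 25%, 75%.
    """
    return {
        "p_negative": s["n_negative"] / s["n"],
        "range": s["max"] - s["min"],
        "iqr": s["75%"] - s["25%"],
        # the coefficient of variation is undefined for a zero mean
        "cv": s["std"] / s["mean"] if s["mean"] else np.nan,
        "p_zeros": s["n_zeros"] / s["n"],
        "p_infinite": s["n_infinite"] / s["n"],
    }

For example, n=100, n_negative=10, n_zeros=5, n_infinite=0, min=-3.0, max=7.0, mean=1.2, std=0.6, 25%=0.8, 75%=1.9 yields p_negative=0.1, range=10.0, iqr≈1.1, cv=0.5, p_zeros=0.05, p_infinite=0.0.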
20 changes: 7 additions & 13 deletions src/ydata_profiling/model/spark/describe_supported_spark.py
@@ -3,13 +3,17 @@
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings
+from ydata_profiling.model.spark.var_description.default_spark import (
+    get_default_spark_description,
+)
 from ydata_profiling.model.summary_algorithms import describe_supported
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_supported.register
 def describe_supported_spark(
     config: Settings, series: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+) -> Tuple[Settings, DataFrame, VarDescription]:
     """Describe a supported series.
     Args:
         series: The Series to describe.
@@ -18,16 +22,6 @@ def describe_supported_spark(
         A dict containing calculated series description values.
     """
 
-    # number of non-NaN observations in the Series
-    count = summary["count"]
-    n_distinct = summary["value_counts"].count()
+    series_description = get_default_spark_description(config, series, summary)
 
-    summary["n_distinct"] = n_distinct
-    summary["p_distinct"] = n_distinct / count if count > 0 else 0
-
-    n_unique = summary["value_counts"].where("count == 1").count()
-    summary["is_unique"] = n_unique == count
-    summary["n_unique"] = n_unique
-    summary["p_unique"] = n_unique / count
-
-    return config, series, summary
+    return config, series, series_description
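get_default_spark_description presumably centralizes the distinct/unique bookkeeping this function used to do inline. The deleted logic, restated as a self-contained sketch (the function name and return shape are illustrative, not the library's API):

from pyspark.sql import DataFrame

def distinct_unique_stats(value_counts: DataFrame, count: int) -> dict:
    """Distinct/unique stats formerly computed inline in describe_supported_spark.

    value_counts: Spark DataFrame with one row per distinct value and a
    `count` column; count: number of non-NaN observations in the series.
    """
    n_distinct = value_counts.count()  # number of rows == number of distinct values
    n_unique = value_counts.where("count == 1").count()  # values seen exactly once
    return {
        "n_distinct": n_distinct,
        "p_distinct": n_distinct / count if count > 0 else 0,
        "n_unique": n_unique,
        "is_unique": n_unique == count,
        # zero guard added here; the original divided unconditionally
        "p_unique": n_unique / count if count > 0 else 0,
    }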
5 changes: 3 additions & 2 deletions src/ydata_profiling/model/spark/describe_text_spark.py
@@ -4,12 +4,13 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.summary_algorithms import describe_text_1d
+from ydata_profiling.model.var_description.default import VarDescription
 
 
 @describe_text_1d.register
 def describe_text_1d_spark(
-    config: Settings, df: DataFrame, summary: dict
-) -> Tuple[Settings, DataFrame, dict]:
+    config: Settings, df: DataFrame, summary: VarDescription
+) -> Tuple[Settings, DataFrame, VarDescription]:
"""Describe a categorical series.
Args:
Expand Down
1 change: 1 addition & 0 deletions src/ydata_profiling/model/spark/summary_spark.py
@@ -1,4 +1,5 @@
 """Compute statistical description of datasets."""
+
 import multiprocessing
 from typing import Tuple
 
1 change: 1 addition & 0 deletions src/ydata_profiling/model/spark/timeseries_index_spark.py
@@ -1,4 +1,5 @@
 """Compute statistical description of datasets."""
+
 from pyspark.sql import DataFrame
 
 from ydata_profiling.config import Settings