Skip to content

Commit

Permalink
feat: Add UNIQUE data type (#70)
Browse files Browse the repository at this point in the history
Resolves #33
  • Loading branch information
mbelak-dtml authored Aug 11, 2023
1 parent e637aa3 commit c5956ca
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 20 deletions.
19 changes: 19 additions & 0 deletions edvart/data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class DataType(IntEnum):
DATE = 4
UNKNOWN = 5
MISSING = 6
UNIQUE = 7

def __str__(self):
return self.name.lower()
Expand Down Expand Up @@ -43,6 +44,8 @@ def infer_data_type(series: pd.Series, string_representation: bool = False) -> U
ret = DataType.BOOLEAN
elif is_date(series):
ret = DataType.DATE
elif is_unique(series):
ret = DataType.UNIQUE
elif is_categorical(series):
ret = DataType.CATEGORICAL
elif is_numeric(series):
Expand All @@ -53,6 +56,22 @@ def infer_data_type(series: pd.Series, string_representation: bool = False) -> U
return str(ret) if string_representation else ret


def is_unique(series: pd.Series) -> bool:
    """Heuristic to tell if a series is categorical and has no repeated values.

    Parameters
    ----------
    series : pd.Series
        Series from which to infer data type.

    Returns
    -------
    bool
        True when `is_categorical` accepts the series and the number of
        distinct values equals the number of rows, False otherwise.

    Notes
    -----
    `Series.nunique` ignores missing values by default, so a series that
    contains missing values has fewer distinct values than rows and is not
    reported as unique.
    """
    # Bail out before counting distinct values when the series is not
    # categorical in the first place.
    if not is_categorical(series):
        return False

    distinct_count = series.nunique()
    row_count = len(series)
    return distinct_count == row_count


def is_numeric(series: pd.Series) -> bool:
"""
Heuristic to tell if a series contains numbers only.
Expand Down
52 changes: 32 additions & 20 deletions edvart/report_sections/univariate_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,8 @@ def univariate_analysis(df: pd.DataFrame, columns: Optional[List[str]] = None) -
if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
UnivariateAnalysis.top_most_frequent(df[col])
UnivariateAnalysis.bar_plot(df[col])
elif data_type == DataType.UNIQUE:
display(Markdown("Each value in the column is unique."))
else:
UnivariateAnalysis.numeric_statistics(df[col])
UnivariateAnalysis.histogram(df[col])
Expand Down Expand Up @@ -378,29 +380,37 @@ def add_cells(self, cells: List[Dict[str, Any]]) -> None:
column_header = nbfv4.new_markdown_cell(f"## *{col} - {data_type_name}*")
cells.append(column_header)
if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
code = code_dedent(
f"""
top_most_frequent(df['{col}'])
bar_plot(df['{col}'])"""
)
elif self.verbosity == Verbosity.MEDIUM:
code = code_dedent(
f"""
numeric_statistics(df['{col}'])
histogram(df['{col}'])"""
cell = nbfv4.new_code_cell(
code_dedent(
f"""
top_most_frequent(df['{col}'])
bar_plot(df['{col}'])"""
)
)
elif data_type == DataType.UNIQUE:
cell = nbfv4.new_markdown_cell("Each value in the column is unique.")
else:
code = code_dedent(
f"""
numeric_statistics(
df['{col}'],
descriptive_stats=default_descriptive_statistics(),
quantile_stats=default_quantile_statistics()
if self.verbosity == Verbosity.MEDIUM:
cell = nbfv4.new_code_cell(
code_dedent(
f"""
numeric_statistics(df['{col}'])
histogram(df['{col}'])"""
)
histogram(df['{col}'])"""
)
code_cell = nbfv4.new_code_cell(code)
cells.append(code_cell)
)
else:
cell = nbfv4.new_code_cell(
code_dedent(
f"""
numeric_statistics(
df['{col}'],
descriptive_stats=default_descriptive_statistics(),
quantile_stats=default_quantile_statistics()
)
histogram(df['{col}'])"""
)
)
cells.append(cell)

def show(self, df: pd.DataFrame) -> None:
"""Generates univariate analysis cell output in the calling notebook.
Expand All @@ -425,6 +435,8 @@ def show(self, df: pd.DataFrame) -> None:
if data_type in (DataType.CATEGORICAL, DataType.BOOLEAN):
UnivariateAnalysis.top_most_frequent(df[col])
UnivariateAnalysis.bar_plot(df[col])
elif data_type == DataType.UNIQUE:
display(Markdown("Each value in the column is unique."))
else:
UnivariateAnalysis.numeric_statistics(df[col])
UnivariateAnalysis.histogram(df[col])
Expand Down
12 changes: 12 additions & 0 deletions tests/test_data_type_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,18 @@ def test_inference():
# NOTE: the expected DataType has to be compared with the *result* of
# infer_data_type(...). Placing `== DataType.X` inside the call compares the
# Series itself with the enum member (element-wise), passes the resulting
# boolean Series to infer_data_type, and then only asserts the truthiness of
# whatever enum member comes back -- such an assertion can never catch a
# wrong inference.
assert (
    data_types.infer_data_type(pd.Series([None, None, np.nan, float("nan")]))
    == data_types.DataType.MISSING
), "Should be missing"
assert (
    data_types.infer_data_type(pd.Series(list(range(10)))) == data_types.DataType.UNIQUE
), "Should be unique"
# A single repeated value is enough to stop the series from being unique.
assert (
    data_types.infer_data_type(pd.Series([1] + list(range(100)))) == data_types.DataType.NUMERIC
), "Should be numeric"
assert (
    data_types.infer_data_type(pd.Series()) == data_types.DataType.UNKNOWN
), "Should be unknown"
assert (
    data_types.infer_data_type(pd.Series([True, False])) == data_types.DataType.BOOLEAN
), "Should be boolean"


def test_missing_series():
Expand Down

0 comments on commit c5956ca

Please sign in to comment.