-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathsequence_length_similarity.py
53 lines (42 loc) · 1.88 KB
/
sequence_length_similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""SequenceLengthSimilarity module."""
import pandas as pd
from sdmetrics.goal import Goal
from sdmetrics.single_column.statistical.kscomplement import KSComplement
class SequenceLengthSimilarity:
    """Metric comparing sequence-length distributions of real and synthetic data.

    Attributes:
        name (str):
            Label used when this metric is shown in reports.
        goal (sdmetrics.goal.Goal):
            Optimization direction of the metric; set to ``Goal.MAXIMIZE``.
        min_value (float):
            Lowest score declared for this metric.
        max_value (float):
            Highest score declared for this metric.
    """

    name = 'Sequence Length Similarity'
    goal = Goal.MAXIMIZE
    min_value = 0.0
    max_value = 1.0

    @staticmethod
    def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float:
        """Score how similar the sequence lengths are in real vs. synthetic data.

        A sequence's length is the number of rows that share its sequence key:
        a key such as ``id_09231`` that occurs 150 times denotes a sequence of
        length 150.

        Steps:
            1. Count the occurrences of every key in the real data.
            2. Count the occurrences of every key in the synthetic data.
            3. Pass both sets of counts to ``KSComplement.compute`` and return
               its result unchanged.

        Args:
            real_data (pandas.Series):
                Sequence key column of the real dataset.
            synthetic_data (pandas.Series):
                Sequence key column of the synthetic dataset.

        Returns:
            float:
                The KSComplement score of the two length distributions.
        """
        # One entry per distinct key; the entry's value is that sequence's length.
        real_lengths = real_data.value_counts()
        synthetic_lengths = synthetic_data.value_counts()

        return KSComplement.compute(real_lengths, synthetic_lengths)