sdv-dev · gsheni · Oct 29, 2024 · Oct 29, 2024 · Oct 29, 2024 · Oct 29, 2024
@@ -16,6 +16,7 @@ jobs:
       run: |
         python -m pip install .[dev]
         make check-deps OUTPUT_FILEPATH=latest_requirements.txt
+        make fix-lint
     - name: Create pull request
       id: cpr
       uses: peter-evans/create-pull-request@v4

@@ -54,12 +54,12 @@ def fit_sequences(self, sequences, context_types, data_types):
                 For example, a sequence might look something like::
 
                     {
-                        "context": [1],
-                        "data": [
+                        'context': [1],
+                        'data': [
                             [1, 3, 4, 5, 11, 3, 4],
-                            [2, 2, 3, 4,  5, 1, 2],
-                            [1, 3, 4, 5,  2, 3, 1]
-                        ]
+                            [2, 2, 3, 4, 5, 1, 2],
+                            [1, 3, 4, 5, 2, 3, 1],
+                        ],
                     }
 
                 The "context" attribute maps to a list of variables which

@@ -271,7 +271,7 @@ def _analyze_data(self, sequences, context_types, data_types):
     def _normalize(tensor, value, properties):
         """Normalize the value between 0 and 1 and flag nans."""
         value_idx, missing_idx = properties['indices']
-        if pd.isnull(value):
+        if pd.isna(value):
             tensor[value_idx] = 0.0
             tensor[missing_idx] = 1.0
         else:
@@ -493,12 +493,12 @@ def fit_sequences(self, sequences, context_types, data_types):
                 For example, a sequence might look something like::
 
                     {
-                        "context": [1],
-                        "data": [
+                        'context': [1],
+                        'data': [
                             [1, 3, 4, 5, 11, 3, 4],
-                            [2, 2, 3, 4,  5, 1, 2],
-                            [1, 3, 4, 5,  2, 3, 1]
-                        ]
+                            [2, 2, 3, 4, 5, 1, 2],
+                            [1, 3, 4, 5, 2, 3, 1],
+                        ],
                     }
 
                 The "context" attribute maps to a list of variables which

@@ -131,7 +131,7 @@ def _idx_map(self, x, t):
                     'type': t,
                     'mu': np.nanmean(x[i]),
                     'std': np.nanstd(x[i]),
-                    'nulls': pd.isnull(x[i]).any(),
+                    'nulls': pd.isna(x[i]).any(),
                     'indices': (idx, idx + 1, idx + 2),
                 }
                 idx += 3
@@ -141,7 +141,7 @@ def _idx_map(self, x, t):
                     'type': t,
                     'min': np.nanmin(x[i]),
                     'range': np.nanmax(x[i]) - np.nanmin(x[i]),
-                    'nulls': pd.isnull(x[i]).any(),
+                    'nulls': pd.isna(x[i]).any(),
                     'indices': (idx, idx + 1, idx + 2),
                 }
                 idx += 3
@@ -150,7 +150,7 @@ def _idx_map(self, x, t):
                 idx_map[i] = {'type': t, 'indices': {}}
                 idx += 1
                 for v in set(x[i]):
-                    if pd.isnull(v):
+                    if pd.isna(v):
                         v = None
 
                     idx_map[i]['indices'][v] = idx
@@ -210,30 +210,30 @@ def _data_to_tensor(self, data):
 
                 elif props['type'] in ['continuous', 'timestamp']:
                     mu_idx, sigma_idx, missing_idx = props['indices']
-                    if pd.isnull(data[key][i]) or props['std'] == 0:
+                    if pd.isna(data[key][i]) or props['std'] == 0:
                         x[mu_idx] = 0.0
                     else:
                         x[mu_idx] = (data[key][i] - props['mu']) / props['std']
 
                     x[sigma_idx] = 0.0
-                    x[missing_idx] = 1.0 if pd.isnull(data[key][i]) else 0.0
+                    x[missing_idx] = 1.0 if pd.isna(data[key][i]) else 0.0
 
                 elif props['type'] in ['count']:
                     r_idx, p_idx, missing_idx = props['indices']
-                    if pd.isnull(data[key][i]) or props['range'] == 0:
+                    if pd.isna(data[key][i]) or props['range'] == 0:
                         x[r_idx] = 0.0
                     else:
                         x[r_idx] = (data[key][i] - props['min']) / props['range']
 
                     x[p_idx] = 0.0
-                    x[missing_idx] = 1.0 if pd.isnull(data[key][i]) else 0.0
+                    x[missing_idx] = 1.0 if pd.isna(data[key][i]) else 0.0
 
                 elif props['type'] in [
                     'categorical',
                     'ordinal',
                 ]:  # categorical
                     value = data[key][i]
-                    if pd.isnull(value):
+                    if pd.isna(value):
                         value = None
                     x[props['indices'][value]] = 1.0
 
@@ -258,25 +258,25 @@ def _context_to_tensor(self, context):
                 mu_idx, sigma_idx, missing_idx = props['indices']
                 x[mu_idx] = (
                     0.0
-                    if (pd.isnull(context[key]) or props['std'] == 0)
+                    if (pd.isna(context[key]) or props['std'] == 0)
                     else (context[key] - props['mu']) / props['std']
                 )
                 x[sigma_idx] = 0.0
-                x[missing_idx] = 1.0 if pd.isnull(context[key]) else 0.0
+                x[missing_idx] = 1.0 if pd.isna(context[key]) else 0.0
 
             elif props['type'] in ['count']:
                 r_idx, p_idx, missing_idx = props['indices']
                 x[r_idx] = (
                     0.0
-                    if (pd.isnull(context[key]) or props['range'] == 0)
+                    if (pd.isna(context[key]) or props['range'] == 0)
                     else (context[key] - props['min']) / props['range']
                 )
                 x[p_idx] = 0.0
-                x[missing_idx] = 1.0 if pd.isnull(context[key]) else 0.0
+                x[missing_idx] = 1.0 if pd.isna(context[key]) else 0.0
 
             elif props['type'] in ['categorical', 'ordinal']:
                 value = context[key]
-                if pd.isnull(value):
+                if pd.isna(value):
                     value = None
                 x[props['indices'][value]] = 1.0
 
@@ -295,12 +295,12 @@ def fit_sequences(self, sequences, context_types, data_types):
                 For example, a sequence might look something like::
 
                     {
-                        "context": [1],
-                        "data": [
+                        'context': [1],
+                        'data': [
                             [1, 3, 4, 5, 11, 3, 4],
-                            [2, 2, 3, 4,  5, 1, 2],
-                            [1, 3, 4, 5,  2, 3, 1]
-                        ]
+                            [2, 2, 3, 4, 5, 1, 2],
+                            [1, 3, 4, 5, 2, 3, 1],
+                        ],
                     }
 
                 The "context" attribute maps to a list of variables which
@@ -406,9 +406,7 @@ def _compute_loss(self, X_padded, Y_padded, seq_len):
                     p_true = X_padded[: seq_len[i], i, missing_idx]
                     p_pred = missing[: seq_len[i], i]
                     log_likelihood += torch.sum(p_true * p_pred)
-                    log_likelihood += torch.sum(
-                        (1.0 - p_true) * torch.log(1.0 - torch.exp(p_pred))
-                    )
+                    log_likelihood += torch.sum((1.0 - p_true) * torch.log(1.0 - torch.exp(p_pred)))
 
             elif props['type'] in ['count']:
                 r_idx, p_idx, missing_idx = props['indices']
@@ -428,9 +426,7 @@ def _compute_loss(self, X_padded, Y_padded, seq_len):
                     p_true = X_padded[: seq_len[i], i, missing_idx]
                     p_pred = missing[: seq_len[i], i]
                     log_likelihood += torch.sum(p_true * p_pred)
-                    log_likelihood += torch.sum(
-                        (1.0 - p_true) * torch.log(1.0 - torch.exp(p_pred))
-                    )
+                    log_likelihood += torch.sum((1.0 - p_true) * torch.log(1.0 - torch.exp(p_pred)))
 
             elif props['type'] in ['categorical', 'ordinal']:
                 idx = list(props['indices'].values())

@@ -59,7 +59,7 @@ def segment_by_time(sequence, segment_size, sequence_index):
     while start <= max_time:
         end = start + segment_size
         selected = (start <= sequence_index) & (sequence_index < end)
-        sequences.append(sequence[selected.values].reset_index(drop=True))
+        sequences.append(sequence[selected.to_numpy()].reset_index(drop=True))
         start = end
 
     return sequences
@@ -112,7 +112,7 @@ def _convert_to_dicts(segments, context_columns):
             if len(context.drop_duplicates()) > 1:
                 raise ValueError('Context columns are not constant within each segment.')
 
-            context = context.iloc[0].values
+            context = context.iloc[0].to_numpy()
             segment = segment.drop(context_columns, axis=1)
         else:
             context = []
@@ -180,7 +180,7 @@ def assemble_sequences(
         segments = []
         groupby_columns = entity_columns[0] if len(entity_columns) == 1 else entity_columns
         for _, sequence in data.groupby(groupby_columns):
-            sequence.drop(entity_columns, axis=1, inplace=True)
+            sequence = sequence.drop(entity_columns, axis=1)
             if context_columns:
                 if len(sequence[context_columns].drop_duplicates()) > 1:
                     raise ValueError('Context columns are not constant within each entity.')

@@ -61,7 +61,7 @@ dev = [
     'watchdog>=1.0.1,<5',
 
     # style check
-    'ruff>=0.3.2,<0.7.2',
+    'ruff>=0.3.2,<1',
 
     # distribute on PyPI
     'twine>=1.10.0,<4',
@@ -160,15 +160,18 @@ build-backend = 'setuptools.build_meta'
 
 [tool.ruff]
 preview = true
-line-length = 99
+line-length = 100
+indent-width = 4
 src = ["deepecho"]
 target-version = "py312"
 exclude = [
     "docs",
     ".tox",
     ".git",
     "__pycache__",
-    ".ipynb_checkpoints"
+    "*.ipynb",
+    ".ipynb_checkpoints",
+    "tasks.py",
 ]
 
 [tool.ruff.lint]
@@ -178,20 +181,31 @@ select = [
     # Pycodestyle
     "E",
     "W",
+    # pydocstyle
+    "D",
     # isort
-    "I001"
+    "I001",
+    # print statements
+    "T201",
+    # pandas-vet
+    "PD",
+    # numpy 2.0
+    "NPY201"
 ]
 ignore = [
-    "E501",
+    # pydocstyle
     "D107",  # Missing docstring in __init__
     "D417",   # Missing argument descriptions in the docstring, this is a bug from pydocstyle: https://github.com/PyCQA/pydocstyle/issues/449
+    "PD901",  # pandas-vet: avoid using the generic variable name `df` for DataFrames
+    "PD101",  # pandas-vet: using `series.nunique()` to check that a series is constant is inefficient
 ]
 
 [tool.ruff.lint.pep8-naming]
 extend-ignore-names = ["X", "C", "X_padded", "Y", "Y_padded"]
 
 [tool.ruff.lint.isort]
 known-first-party = ["deepecho"]
+lines-between-types = 0
 
 [tool.ruff.lint.per-file-ignores]
 "__init__.py" = ["F401", "E402", "F403", "F405", "E501", "I001"]
@@ -200,6 +214,12 @@ known-first-party = ["deepecho"]
 quote-style = "single"
 indent-style = "space"
 preview = true
+docstring-code-format = true
+docstring-code-line-length = "dynamic"
 
 [tool.ruff.lint.pydocstyle]
 convention = "google"
+
+[tool.ruff.lint.pycodestyle]
+max-doc-length = 100
+max-line-length = 100