
Commit

Speed up columns slices (#900)
Mr-Geekman authored Sep 7, 2022
1 parent 18a064a commit 533fce8
Showing 13 changed files with 286 additions and 51 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/test.yml
@@ -107,3 +107,42 @@ jobs:
- name: Upload coverage
uses: codecov/codecov-action@v2

test-pandas-versions:
runs-on: ubuntu-latest
strategy:
matrix:
pandas-version:
- ">=1.1,<1.2"
- ">=1.2,<1.3"
- ">=1.3,<1.4"
- ">=1.4"
fail-fast: false

steps:
- uses: actions/checkout@v2

- name: Set up Python
id: setup-python
uses: actions/setup-python@v2
with:
python-version: 3.8

- name: Install Poetry
uses: snok/install-poetry@v1
with:
virtualenvs-create: true
virtualenvs-in-project: true

- name: Install dependencies
run: |
poetry install -E "all tests" -vv
pip install "pandas${{ matrix.pandas-version }}"
- name: PyTest ("tsdataset transforms")
run: |
poetry run pytest tests/test_datasets -v --cov=etna --cov-report=xml
poetry run pytest tests/test_transforms -v --cov=etna --cov-report=xml
- name: Upload coverage
uses: codecov/codecov-action@v2
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -16,7 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
-
-
-
- Make slicing faster in `TSDataset._merge_exog`, `FilterFeaturesTransform`, `AddConstTransform`, `LambdaTransform`, `LagTransform`, `LogTransform`, `SklearnTransform`, `WindowStatisticsTransform`; make CICD test different pandas versions ([#900](https://github.com/tinkoff-ai/etna/pull/900))
-
-
### Fixed
1 change: 1 addition & 0 deletions etna/datasets/__init__.py
@@ -4,3 +4,4 @@
from etna.datasets.datasets_generation import generate_periodic_df
from etna.datasets.tsdataset import TSDataset
from etna.datasets.utils import duplicate_data
from etna.datasets.utils import set_columns_wide
3 changes: 1 addition & 2 deletions etna/datasets/tsdataset.py
@@ -406,8 +406,7 @@ def _check_regressors(df: pd.DataFrame, df_regressors: pd.DataFrame):
def _merge_exog(self, df: pd.DataFrame) -> pd.DataFrame:
if self.df_exog is None:
raise ValueError("Something went wrong, Trying to merge df_exog which is None!")
-segments = sorted(set(df.columns.get_level_values("segment")))
-df_regressors = self.df_exog.loc[:, pd.IndexSlice[segments, self.known_future]]
+df_regressors = self.df_exog.loc[:, pd.IndexSlice[:, self.known_future]]
self._check_regressors(df=df, df_regressors=df_regressors)
df = pd.concat((df, self.df_exog), axis=1).loc[df.index].sort_index(axis=1, level=(0, 1))
return df
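The speedup in `_merge_exog` (and in the transforms changed below) comes from slicing all segments with `:` instead of first materializing a sorted segment list. Below is a rough, self-contained micro-benchmark sketch of the two access patterns; it is not part of the commit, and the frame shape, segment names, and feature names are invented for illustration.

```python
import timeit

import numpy as np
import pandas as pd

# Invented wide frame in the etna layout: MultiIndex columns (segment, feature).
timestamps = pd.date_range("2020-01-01", periods=300, freq="D")
segments = [f"segment_{i}" for i in range(200)]
features = ["target", "exog_1", "exog_2"]
columns = pd.MultiIndex.from_product([segments, features], names=["segment", "feature"])
df = pd.DataFrame(
    np.random.rand(len(timestamps), len(columns)), index=timestamps, columns=columns
).sort_index(axis=1)


def old_style(df: pd.DataFrame) -> pd.DataFrame:
    # old pattern: enumerate the segments explicitly before slicing
    segments = sorted(set(df.columns.get_level_values("segment")))
    return df.loc[:, pd.IndexSlice[segments, ["exog_1"]]]


def new_style(df: pd.DataFrame) -> pd.DataFrame:
    # new pattern: keep all segments with ":" and select only on the feature level
    return df.loc[:, pd.IndexSlice[:, ["exog_1"]]]


print("explicit segments:", timeit.timeit(lambda: old_style(df), number=100))
print("colon slice:      ", timeit.timeit(lambda: new_style(df), number=100))
```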
55 changes: 55 additions & 0 deletions etna/datasets/utils.py
@@ -1,5 +1,6 @@
from enum import Enum
from typing import List
from typing import Optional
from typing import Sequence

import pandas as pd
@@ -120,3 +121,57 @@ def __getitem__(self, index):

def __len__(self):
return len(self.ts_samples)


def set_columns_wide(
df_left: pd.DataFrame,
df_right: pd.DataFrame,
timestamps_left: Optional[Sequence[pd.Timestamp]] = None,
timestamps_right: Optional[Sequence[pd.Timestamp]] = None,
segments_left: Optional[Sequence[str]] = None,
features_right: Optional[Sequence[str]] = None,
features_left: Optional[Sequence[str]] = None,
segments_right: Optional[Sequence[str]] = None,
) -> pd.DataFrame:
"""Set columns in a left dataframe with values from the right dataframe.
Parameters
----------
df_left:
dataframe to set columns in
df_right:
dataframe to set columns from
timestamps_left:
timestamps to select in ``df_left``
timestamps_right:
timestamps to select in ``df_right``
segments_left:
segments to select in ``df_left``
segments_right:
segments to select in ``df_right``
features_left:
features to select in ``df_left``
features_right:
features to select in ``df_right``

Returns
-------
:
a new dataframe with changed columns
"""
# sort columns
df_left = df_left.sort_index(axis=1)
df_right = df_right.sort_index(axis=1)

# prepare indexing
timestamps_left_index = slice(None) if timestamps_left is None else timestamps_left
timestamps_right_index = slice(None) if timestamps_right is None else timestamps_right
segments_left_index = slice(None) if segments_left is None else segments_left
segments_right_index = slice(None) if segments_right is None else segments_right
features_left_index = slice(None) if features_left is None else features_left
features_right_index = slice(None) if features_right is None else features_right

right_value = df_right.loc[timestamps_right_index, (segments_right_index, features_right_index)]
df_left.loc[timestamps_left_index, (segments_left_index, features_left_index)] = right_value.values

return df_left
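A minimal usage sketch of the new `set_columns_wide` helper, assuming the usual etna wide format with `segment`/`feature` column levels; the frames and values below are invented for illustration.

```python
import numpy as np
import pandas as pd

from etna.datasets import set_columns_wide

timestamps = pd.date_range("2021-01-01", periods=5, freq="D")
columns = pd.MultiIndex.from_product(
    [["segment_a", "segment_b"], ["target", "exog"]], names=["segment", "feature"]
)
df_left = pd.DataFrame(np.zeros((5, 4)), index=timestamps, columns=columns)
df_right = pd.DataFrame(np.ones((5, 4)), index=timestamps, columns=columns)

# Copy the "exog" feature from df_right into df_left for all timestamps and segments;
# selectors that are omitted default to "take everything" via slice(None).
result = set_columns_wide(df_left, df_right, features_left=["exog"], features_right=["exog"])
print(result.loc[:, pd.IndexSlice[:, "exog"]])  # now filled with 1.0 from df_right
```

Both frames are sorted before the positional `.values` assignment, so the selected blocks must cover matching timestamps, segments, and features on each side.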
5 changes: 3 additions & 2 deletions etna/transforms/feature_selection/filter.py
@@ -73,14 +73,15 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
if self.include is not None:
if not set(self.include).issubset(features):
raise ValueError(f"Features {set(self.include) - set(features)} are not present in the dataset.")
-segments = sorted(set(df.columns.get_level_values("segment")))
-result = result.loc[:, pd.IndexSlice[segments, self.include]]
+result = result.loc[:, pd.IndexSlice[:, self.include]]
if self.exclude is not None and self.exclude:
if not set(self.exclude).issubset(features):
raise ValueError(f"Features {set(self.exclude) - set(features)} are not present in the dataset.")
result = result.drop(columns=self.exclude, level="feature")
if self.return_features:
self._df_removed = df.drop(result.columns, axis=1)

+result = result.sort_index(axis=1)
return result

def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
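The `sort_index(axis=1)` added before the return appears to restore the usual lexicographic column order now that the slice no longer goes through an explicitly sorted segment list. A small pandas-only sketch (invented columns, hypothetical `include` list) of what that final sort guarantees:

```python
import numpy as np
import pandas as pd

columns = pd.MultiIndex.from_product(
    [["segment_a", "segment_b"], ["exog_1", "exog_2", "target"]], names=["segment", "feature"]
)
df = pd.DataFrame(np.random.rand(3, 6), columns=columns)

include = ["target", "exog_1"]  # hypothetical include list, deliberately not sorted
result = df.loc[:, pd.IndexSlice[:, include]].sort_index(axis=1)
print(result.columns.tolist())
# [('segment_a', 'exog_1'), ('segment_a', 'target'),
#  ('segment_b', 'exog_1'), ('segment_b', 'target')]
```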
23 changes: 16 additions & 7 deletions etna/transforms/math/add_constant.py
@@ -3,6 +3,7 @@

import pandas as pd

from etna.datasets import set_columns_wide
from etna.transforms.base import Transform
from etna.transforms.utils import match_target_quantiles

@@ -75,10 +76,12 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
segments = sorted(set(df.columns.get_level_values("segment")))

result = df.copy()
-features = df.loc[:, pd.IndexSlice[segments, self.in_column]]
+features = df.loc[:, pd.IndexSlice[:, self.in_column]]
transformed_features = features + self.value
if self.inplace:
-result.loc[:, pd.IndexSlice[segments, self.in_column]] = transformed_features
+result = set_columns_wide(
+result, transformed_features, features_left=[self.in_column], features_right=[self.in_column]
+)
else:
column_name = self._get_column_name()
transformed_features.columns = pd.MultiIndex.from_product([segments, [column_name]])
@@ -101,17 +104,23 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
result = df.copy()
if self.inplace:
-segments = sorted(set(df.columns.get_level_values("segment")))
-features = df.loc[:, pd.IndexSlice[segments, self.in_column]]
+features = df.loc[:, pd.IndexSlice[:, self.in_column]]
transformed_features = features - self.value
-result.loc[:, pd.IndexSlice[segments, self.in_column]] = transformed_features
+result = set_columns_wide(
+result, transformed_features, features_left=[self.in_column], features_right=[self.in_column]
+)
if self.in_column == "target":
segment_columns = result.columns.get_level_values("feature").tolist()
quantiles = match_target_quantiles(set(segment_columns))
for quantile_column_nm in quantiles:
-features = df.loc[:, pd.IndexSlice[segments, quantile_column_nm]]
+features = df.loc[:, pd.IndexSlice[:, quantile_column_nm]]
transformed_features = features - self.value
-result.loc[:, pd.IndexSlice[segments, quantile_column_nm]] = transformed_features
+result = set_columns_wide(
+result,
+transformed_features,
+features_left=[quantile_column_nm],
+features_right=[quantile_column_nm],
+)

return result

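A hypothetical round trip with `AddConstTransform` on a wide frame (data invented): the in-place branch now writes values back through `set_columns_wide` instead of label-based `.loc` assignment, but the observable behavior should be unchanged.

```python
import numpy as np
import pandas as pd

from etna.transforms import AddConstTransform

timestamps = pd.date_range("2021-01-01", periods=4, freq="D")
columns = pd.MultiIndex.from_product(
    [["segment_a", "segment_b"], ["target"]], names=["segment", "feature"]
)
df = pd.DataFrame(np.arange(8, dtype=float).reshape(4, 2), index=timestamps, columns=columns)

transform = AddConstTransform(in_column="target", value=10, inplace=True)
shifted = transform.fit_transform(df)
restored = transform.inverse_transform(shifted)
assert np.allclose(restored.values, df.values)  # round trip recovers the original values
```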
23 changes: 16 additions & 7 deletions etna/transforms/math/apply_lambda.py
@@ -4,6 +4,7 @@

import pandas as pd

from etna.datasets import set_columns_wide
from etna.transforms.base import Transform
from etna.transforms.utils import match_target_quantiles

@@ -94,10 +95,12 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
result = df.copy()
segments = sorted(set(df.columns.get_level_values("segment")))
-features = df.loc[:, pd.IndexSlice[segments, self.in_column]]
+features = df.loc[:, pd.IndexSlice[:, self.in_column]].sort_index(axis=1)
transformed_features = self.transform_func(features)
if self.inplace:
-result.loc[:, pd.IndexSlice[segments, self.in_column]] = transformed_features
+result = set_columns_wide(
+result, transformed_features, features_left=[self.in_column], features_right=[self.in_column]
+)
else:
transformed_features.columns = pd.MultiIndex.from_product([segments, [self.change_column]])
result = pd.concat([result] + [transformed_features], axis=1)
@@ -119,15 +122,21 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
result_df = df.copy()
if self.inverse_transform_func:
-segments = sorted(set(df.columns.get_level_values("segment")))
-features = df.loc[:, pd.IndexSlice[segments, self.in_column]]
+features = df.loc[:, pd.IndexSlice[:, self.in_column]].sort_index(axis=1)
transformed_features = self.inverse_transform_func(features)
-result_df.loc[:, pd.IndexSlice[segments, self.in_column]] = transformed_features
+result_df = set_columns_wide(
+result_df, transformed_features, features_left=[self.in_column], features_right=[self.in_column]
+)
if self.in_column == "target":
segment_columns = result_df.columns.get_level_values("feature").tolist()
quantiles = match_target_quantiles(set(segment_columns))
for quantile_column_nm in quantiles:
-features = df.loc[:, pd.IndexSlice[segments, quantile_column_nm]]
+features = df.loc[:, pd.IndexSlice[:, quantile_column_nm]].sort_index(axis=1)
transformed_features = self.inverse_transform_func(features)
-result_df.loc[:, pd.IndexSlice[segments, quantile_column_nm]] = transformed_features
+result_df = set_columns_wide(
+result_df,
+transformed_features,
+features_left=[quantile_column_nm],
+features_right=[quantile_column_nm],
+)
return result_df
2 changes: 1 addition & 1 deletion etna/transforms/math/lags.py
@@ -82,7 +82,7 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
result = df.copy()
segments = sorted(set(df.columns.get_level_values("segment")))
all_transformed_features = []
-features = df.loc[:, pd.IndexSlice[segments, self.in_column]]
+features = df.loc[:, pd.IndexSlice[:, self.in_column]]
for lag in self.lags:
column_name = self._get_column_name(lag)
transformed_features = features.shift(lag)
23 changes: 16 additions & 7 deletions etna/transforms/math/log.py
@@ -4,6 +4,7 @@
import numpy as np
import pandas as pd

from etna.datasets import set_columns_wide
from etna.transforms.base import Transform
from etna.transforms.utils import match_target_quantiles

@@ -72,14 +73,16 @@ def transform(self, df: pd.DataFrame) -> pd.DataFrame:
transformed dataframe
"""
segments = sorted(set(df.columns.get_level_values("segment")))
-features = df.loc[:, pd.IndexSlice[segments, self.in_column]]
+features = df.loc[:, pd.IndexSlice[:, self.in_column]]
if (features < 0).any().any():
raise ValueError("LogPreprocess can be applied only to non-negative series")

result = df.copy()
transformed_features = np.log1p(features) / np.log(self.base)
if self.inplace:
-result.loc[:, pd.IndexSlice[segments, self.in_column]] = transformed_features
+result = set_columns_wide(
+result, transformed_features, features_left=[self.in_column], features_right=[self.in_column]
+)
else:
column_name = self._get_column_name()
transformed_features.columns = pd.MultiIndex.from_product([segments, [column_name]])
@@ -102,17 +105,23 @@ def inverse_transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
result = df.copy()
if self.inplace:
-segments = sorted(set(df.columns.get_level_values("segment")))
-features = df.loc[:, pd.IndexSlice[segments, self.in_column]]
+features = df.loc[:, pd.IndexSlice[:, self.in_column]]
transformed_features = np.expm1(features * np.log(self.base))
-result.loc[:, pd.IndexSlice[segments, self.in_column]] = transformed_features
+result = set_columns_wide(
+result, transformed_features, features_left=[self.in_column], features_right=[self.in_column]
+)
if self.in_column == "target":
segment_columns = result.columns.get_level_values("feature").tolist()
quantiles = match_target_quantiles(set(segment_columns))
for quantile_column_nm in quantiles:
-features = df.loc[:, pd.IndexSlice[segments, quantile_column_nm]]
+features = df.loc[:, pd.IndexSlice[:, quantile_column_nm]]
transformed_features = np.expm1(features * np.log(self.base))
-result.loc[:, pd.IndexSlice[segments, quantile_column_nm]] = transformed_features
+result = set_columns_wide(
+result,
+transformed_features,
+features_left=[quantile_column_nm],
+features_right=[quantile_column_nm],
+)

return result

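When `in_column == "target"`, the inverse transforms above also back-transform prediction-interval columns; the loop relies on `match_target_quantiles` to find them. A tiny sketch (assumed column naming like `target_0.025`) of that lookup:

```python
from etna.transforms.utils import match_target_quantiles

feature_names = {"target", "target_0.025", "target_0.975", "exog_1"}
quantiles = match_target_quantiles(feature_names)
print(sorted(quantiles))  # expected: the two interval columns, e.g. ['target_0.025', 'target_0.975']
```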
