Skip to content

Add regressors property to TSDataset #82

Merged
merged 10 commits into from
Oct 4, 2021
Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Sequence anomalies ([#96](https://github.com/tinkoff-ai/etna-ts/pull/96))
- 'is_weekend' feature in DateFlagsTransform ([#101](https://github.com/tinkoff-ai/etna-ts/pull/101))
- Documentation example for models and note about inplace nature of forecast ([#112](https://github.com/tinkoff-ai/etna-ts/pull/112))
- `regressors` property in TSDataset ([#82](https://github.com/tinkoff-ai/etna-ts/pull/82))

### Changed
- SklearnTransform out column names ([#99](https://github.com/tinkoff-ai/etna-ts/pull/99))
Expand Down
44 changes: 40 additions & 4 deletions etna/datasets/tsdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ def __init__(self, df: pd.DataFrame, freq: str, df_exog: Optional[pd.DataFrame]
self.df = self._merge_exog(self.df)

self.transforms = None
self._update_regressors()

def transform(self, transforms: Iterable["Transform"]):
"""Apply given transform to the data."""
Expand All @@ -121,6 +122,7 @@ def transform(self, transforms: Iterable["Transform"]):
for transform in self.transforms:
tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
self.df = transform.transform(self.df)
self._update_regressors()

def fit_transform(self, transforms: Iterable["Transform"]):
"""Fit and apply given transforms to the data."""
Expand All @@ -129,6 +131,7 @@ def fit_transform(self, transforms: Iterable["Transform"]):
for transform in self.transforms:
tslogger.log(f"Transform {transform.__class__.__name__} is applied to dataset")
self.df = transform.fit_transform(self.df)
self._update_regressors()

def __repr__(self):
    """Return the string representation of the underlying dataframe ``self.df``."""
    return repr(self.df)
Expand Down Expand Up @@ -198,10 +201,9 @@ def make_future(self, future_steps: int) -> "TSDataset":
df = self._merge_exog(df)

# check if we have enough values in regressors
for segment in self.segments:
regressors_columns = [x for x in self.df_exog[segment].columns if x.startswith("regressor")]
if regressors_columns:
regressors_index = self.df_exog.loc[:, pd.IndexSlice[segment, regressors_columns]].index
if self.regressors:
for segment in self.segments:
regressors_index = self.df_exog.loc[:, pd.IndexSlice[segment, self.regressors]].index
if not np.all(future_dates.isin(regressors_index)):
warnings.warn(
f"Some regressors don't have enough values in segment {segment}, "
Expand Down Expand Up @@ -278,6 +280,40 @@ def segments(self) -> List[str]:
"""
return self.df.columns.get_level_values("segment").unique().tolist()

def _update_regressors(self):
    """Recompute the cached list of regressor names from the current columns.

    Scans the "feature" level of ``self.columns`` and stores in
    ``self._regressors`` every distinct name that starts with ``"regressor"``.

    Names are kept in the order of their first appearance in the columns, so
    the resulting list is reproducible between runs (iterating a ``set`` of
    strings gives no stable order).
    """
    feature_names = self.columns.get_level_values("feature")
    # A feature name can show up more than once on this level (e.g. under
    # several segments); dict.fromkeys drops the repeats while preserving
    # first-seen order (dicts keep insertion order on Python 3.7+).
    unique_regressors = dict.fromkeys(name for name in feature_names if name.startswith("regressor"))
    self._regressors = list(unique_regressors)

@property
def regressors(self) -> List[str]:
    """Get list of all regressors across all segments in dataset.

    A regressor is any feature column whose name starts with ``"regressor"``.
    The value is read from the ``_regressors`` cache maintained by
    ``_update_regressors``.

    Returns
    -------
    List[str]
        A new list with the regressor names; modifying it does not affect
        the dataset.

    Examples
    --------
    >>> from etna.datasets import generate_const_df
    >>> df = generate_const_df(
    ...    periods=30, start_time="2021-06-01",
    ...    n_segments=2, scale=1
    ... )
    >>> df_ts_format = TSDataset.to_dataset(df)
    >>> regressors_timestamp = pd.date_range(start="2021-06-01", periods=50)
    >>> df_regressors_1 = pd.DataFrame(
    ...     {"timestamp": regressors_timestamp, "regressor_1": 1, "segment": "segment_0"}
    ... )
    >>> df_regressors_2 = pd.DataFrame(
    ...     {"timestamp": regressors_timestamp, "regressor_1": 2, "segment": "segment_1"}
    ... )
    >>> df_exog = pd.concat([df_regressors_1, df_regressors_2], ignore_index=True)
    >>> df_exog_ts_format = TSDataset.to_dataset(df_exog)
    >>> ts = TSDataset(df_ts_format, df_exog=df_exog_ts_format, freq="D")
    >>> ts.regressors
    ['regressor_1']
    """
    # Hand out a copy: returning the cached list itself would let a caller's
    # in-place edit (append/remove/sort) silently corrupt the dataset's state.
    return list(self._regressors)

def plot(self, n_segments: int = 10, column: str = "target", segments: Optional[Sequence] = None):
""" Plot of random or chosen segments.

Expand Down
32 changes: 30 additions & 2 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from etna.datasets import generate_ar_df
from etna.datasets.tsdataset import TSDataset
from etna.transforms import DateFlagsTransform


@pytest.fixture()
Expand Down Expand Up @@ -43,8 +44,8 @@ def df_and_regressors() -> Tuple[pd.DataFrame, pd.DataFrame]:
df = TSDataset.to_dataset(df)

timestamp = pd.date_range("2021-01-01", "2021-02-11")
df_1 = pd.DataFrame({"timestamp": timestamp, "regressor_aaa": 1, "segment": "1"})
df_2 = pd.DataFrame({"timestamp": timestamp[5:], "regressor_aaa": 2, "segment": "2"})
df_1 = pd.DataFrame({"timestamp": timestamp, "regressor_1": 1, "regressor_2": 2, "segment": "1"})
df_2 = pd.DataFrame({"timestamp": timestamp[5:], "regressor_1": 3, "regressor_2": 4, "segment": "2"})
df_exog = pd.concat([df_1, df_2], ignore_index=True)
df_exog = TSDataset.to_dataset(df_exog)

Expand Down Expand Up @@ -197,3 +198,30 @@ def test_getitem_all_indexes(tsdf_with_exog):
df_slice = tsdf_with_exog[:, :, :]
df_expected = tsdf_with_exog.df
pd.testing.assert_frame_equal(df_expected, df_slice)


def test_finding_regressors(df_and_regressors):
    """Check that ts.regressors lists the regressor columns supplied via df_exog."""
    target_df, exog_df = df_and_regressors
    dataset = TSDataset(df=target_df, df_exog=exog_df, freq="D")
    expected = ["regressor_1", "regressor_2"]
    # Sort before comparing so the check does not depend on the order of the list.
    found = sorted(dataset.regressors)
    assert found == expected


def test_updating_regressors_fit_transform(df_and_regressors):
    """Check that ts.regressors is refreshed after calling ts.fit_transform()."""
    target_df, exog_df = df_and_regressors
    dataset = TSDataset(df=target_df, df_exog=exog_df, freq="D")
    # Only two date flags are switched on; the final assertion expects exactly
    # the two matching regressor columns to be added.
    flags_transform = DateFlagsTransform(
        day_number_in_week=True,
        day_number_in_month=False,
        week_number_in_month=False,
        week_number_in_year=False,
        month_number_in_year=False,
        year_number=False,
        is_weekend=True,
    )
    regressors_before = set(dataset.regressors)
    dataset.fit_transform(transforms=[flags_transform])
    regressors_after = set(dataset.regressors)
    # Regressors present before the transform must still be there afterwards.
    assert regressors_before <= regressors_after
    added = regressors_after - regressors_before
    assert added == {"regressor_day_number_in_week", "regressor_is_weekend"}