You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Since MLForecast can handle static features, aggregate should create the one-hot-encoded versions of the hierarchical variables and add dummies for aggregated levels. Here's an updated aggregate function and an example that adds those dummies.
import os
import sys
import warnings
import numpy as np
import pandas as pd
import hierarchicalforecast.methods as hfm
from datetime import datetime
from typing import Optional
from hierarchicalforecast.utils import aggregate, _to_upper_hierarchy
from hierarchicalforecast.core import HierarchicalReconciliation
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse
from typing import Callable, Dict, List, Optional, Iterable
# Load the Australian tourism dataset and keep a small subset of it:
# two states and three travel purposes.
Y_df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/tourism.csv')
Y_df = Y_df.query("State in ['Victoria', 'Tasmania']")
Y_df = Y_df.query("Purpose in ['Business', 'Holiday', 'Visiting']")

# Use the `ds` / `y` column names that `aggregate2` groups and sums on.
Y_df = Y_df.rename(columns={'Trips': 'y', 'Quarter': 'ds'})

# Add a constant top-level column so the whole dataset rolls up to one series.
Y_df.insert(0, 'Country', 'Australia')
Y_df = Y_df[['Country', 'Region', 'State', 'Purpose', 'ds', 'y']]

# Turn quarter labels such as "1998 Q1" into "1998-Q1" before parsing them as dates.
Y_df['ds'] = pd.to_datetime(
    Y_df['ds'].str.replace(r'(\d+) (Q\d)', r'\1-\2', regex=True)
)
Y_df.head()
# Aggregation levels, from the grand total down to the bottom-level series.
# Each entry lists the columns that are grouped together to form one level.
spec = [
    ['Country'],
    ['Country', 'State'],
    ['Country', 'Purpose'],
    ['Country', 'State', 'Region'],
    ['Country', 'State', 'Purpose'],
    ['Country', 'State', 'Region', 'Purpose'],
]
def aggregate2(
    df: pd.DataFrame,
    spec: List[List[str]],
    is_balanced: bool = False,
    sparse_s: bool = False,
    add_dummies: bool = False
):
    """Utils Aggregation Function.

    Aggregates bottom level series contained in the pandas DataFrame `df` according
    to levels defined in the `spec` list.

    Parameters
    ----------
    df : pandas DataFrame
        Dataframe with columns `['ds', 'y']` and columns to aggregate.
    spec : list of list of str
        List of levels. Each element of the list should contain a list of columns of `df` to aggregate.
        The levels are sorted by length and the longest one is used as the bottom level.
    is_balanced : bool (default=False)
        Deprecated.
    sparse_s : bool (default=False)
        Return `S_df` as a sparse dataframe.
    add_dummies : bool (default=False)
        Add dummy features for hierarchical variables. For every column `c` used in `spec`,
        `Y_df` gains the column `c` itself (NaN on levels that do not group by `c`), one
        0/1 column `c_<value>` per unique value of `df[c]`, and a 0/1 column `agg_c` that
        is 1 on levels that do not group by `c`.

    Returns
    -------
    Y_df : pandas DataFrame
        Hierarchically structured series, indexed by `unique_id`, with columns
        `['ds', 'y']` followed by the dummy columns when `add_dummies` is set.
    S_df : pandas DataFrame
        Summing dataframe. Rows are the ids of every level, columns are the bottom-level ids.
    tags : dict
        Aggregation indices. Maps each level key (level columns joined by '/') to the
        array of ids belonging to that level.

    Raises
    ------
    ValueError
        If `df` contains null values.
    """
    # Checks
    if df.isnull().values.any():
        raise ValueError('`df` contains null values')
    if is_balanced:
        warnings.warn(
            "`is_balanced` is deprecated and will be removed in a future version. "
            "Don't set this argument to suppress this warning.",
            category=DeprecationWarning,
        )

    # Shorter specs come first; the longest spec defines the bottom level.
    spec = sorted(spec, key=len)
    bottom = spec[-1]

    if add_dummies:
        # Unique hierarchical columns, in order of first appearance in the sorted spec.
        all_spec = list(dict.fromkeys(col for levels in spec for col in levels))
        # Names follow the `<prefix>_<value>` convention of `pd.get_dummies` used below.
        all_dummies = [f'{col}_{value}' for col in all_spec for value in df[col].unique()]
        all_agg = [f'agg_{col}' for col in all_spec]
        dummies_cols = all_spec + all_dummies + all_agg
    else:
        dummies_cols = []

    # compute aggregations and tags
    aggs = []
    tags = {}
    for levels in spec:
        agg = df.groupby(levels + ['ds'])['y'].sum().reset_index('ds')
        # Build the series id by joining the level values with '/'. A '/' inside a
        # value (other than the first level's) is replaced by '_'.
        group = agg.index.get_level_values(0)
        for level in levels[1:]:
            group = group + '/' + agg.index.get_level_values(level).str.replace('/', '_')
        # Add the variable and the one-hot encoded version of the variable
        if add_dummies:
            for level in levels:
                agg[level] = agg.index.get_level_values(level).values
                agg = pd.concat([agg, pd.get_dummies(agg[level], prefix=level, dtype=int)], axis=1)
        agg.index = group
        agg.index.name = 'unique_id'
        tags['/'.join(levels)] = group.unique().values
        # Add the aggregated levels for each hierarchical feature and fill in the one-hot encoded variables
        if add_dummies:
            for col in all_spec:
                if col not in agg:
                    # `np.nan`, not `np.NaN`: the capitalised alias was removed in NumPy 2.0.
                    agg[col] = np.nan
                # `df` has no nulls (checked above), so NaN here means "aggregated over `col`".
                agg[f'agg_{col}'] = agg[col].isna().astype(int)
            for col in all_dummies:
                if col not in agg:
                    agg[col] = 0
        aggs.append(agg)
    Y_df = pd.concat(aggs)[['ds', 'y'] + dummies_cols]

    # construct S
    bottom_key = '/'.join(bottom)
    bottom_levels = tags[bottom_key]
    # One column per level; each row holds the ids a bottom-level series belongs to.
    S = np.empty((len(bottom_levels), len(spec)), dtype=object)
    for j, levels in enumerate(spec[:-1]):
        # NOTE(review): `_to_upper_hierarchy` presumably maps each bottom id to its
        # ancestor id at the given level -- confirm against hierarchicalforecast.utils.
        S[:, j] = _to_upper_hierarchy(bottom, bottom_levels, '/'.join(levels))
    S[:, -1] = tags[bottom_key]
    categories = list(tags.values())
    try:
        encoder = OneHotEncoder(categories=categories, sparse_output=sparse_s, dtype=np.float32)
    except TypeError:  # sklearn < 1.2
        encoder = OneHotEncoder(categories=categories, sparse=sparse_s, dtype=np.float32)
    S = encoder.fit_transform(S).T
    if sparse_s:
        df_constructor = pd.DataFrame.sparse.from_spmatrix
    else:
        df_constructor = pd.DataFrame
    S_df = df_constructor(S, index=np.hstack(categories), columns=bottom_levels)
    return Y_df, S_df, tags
# Example 1: plain aggregation -- the result has only the `ds` and `y` columns.
Y_df2, S_df, tags = aggregate2(df=Y_df, spec=spec, add_dummies=False)
Y_df2 = Y_df2.reset_index()  # move `unique_id` from the index into a column
Y_df2.head()

# Example 2: same aggregation, plus the dummy columns for the hierarchical variables.
Y_df2, S_df, tags = aggregate2(df=Y_df, spec=spec, add_dummies=True)
Y_df2 = Y_df2.reset_index()  # move `unique_id` from the index into a column
Y_df2.head()
Use case
Improve MLForecast model performance by allowing users to include static features made up of the hierarchical variables.
The text was updated successfully, but these errors were encountered:
Description
Since MLForecast can handle static features, `aggregate` should create the one-hot-encoded versions of the hierarchical variables and add dummies for aggregated levels. Here's an updated `aggregate` function and an example that adds those dummies.
Use case
Improve MLForecast model performance by allowing users to include static features made up of the hierarchical variables.
The text was updated successfully, but these errors were encountered: