tvdboom · tvdboom · Jan 26, 2024 · Jan 25, 2024 · Jan 25, 2024
diff --git a/atom/atom.py b/atom/atom.py
@@ -23,7 +23,7 @@
 import pandas as pd
 from beartype import beartype
 from joblib.memory import Memory
 from pandas._typing import DtypeObj
 from scipy import stats
 from sklearn.pipeline import Pipeline as SkPipeline
 from sklearn.utils.metaestimators import available_if
@@ -56,9 +56,9 @@
     FloatZeroToOneInc, Index, IndexSelector, Int, IntLargerEqualZero,
     IntLargerTwo, IntLargerZero, MetricConstructor, ModelsConstructor, NItems,
     NJobs, NormalizerStrats, NumericalStrats, Operators, Pandas, Predictor,
-    PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality, Sequence,
-    Series, TargetSelector, Transformer, VectorizerStarts, Verbose, Warnings,
-    XSelector, YSelector, sequence_t,
+    PrunerStrats, RowSelector, Scalar, ScalerStrats, Seasonality,
+    SeasonalityMode, Sequence, Series, TargetSelector, Transformer,
+    VectorizerStarts, Verbose, Warnings, XSelector, YSelector, sequence_t,
 )
 from atom.utils.utils import (
     ClassMap, DataConfig, DataContainer, Goal, adjust_verbosity, bk,
@@ -288,7 +288,7 @@
    @ignore.setter
    def ignore(self, value: ColumnSelector | None):
        """Store the selection of ignored columns in the configuration.

        Parameters
        ----------
        value: ColumnSelector or None
            Columns to ignore. If None, the stored selection is reset
            to an empty tuple. Otherwise, the selection is resolved
            through the current branch (with the target column
            excluded from the selection) and stored as a tuple.

        """
        # Guard clause: no selection means there is nothing to ignore
        if value is None:
            self._config.ignore = ()
            return

        selected = self.branch._get_columns(value, include_target=False)
        self._config.ignore = tuple(selected)

@@ -481,7 +481,7 @@
                - **p_value:** Corresponding p-value.

        """
        columns_c = self.branch._get_columns(columns, only_numerical=True)

        df = pd.DataFrame(
            index=pd.MultiIndex.from_product(
@@ -493,7 +493,7 @@

        for col in columns_c:
            # Drop missing values from the column before testing
            X = replace_missing(self[col], self.missing).dropna().to_numpy(dtype=float)

            for test in ("adf", "kpss", "lb"):
                if test == "adf":
@@ -505,7 +505,7 @@
                    stat = l_jung.loc[l_jung["lb_pvalue"].idxmin()]

                # Add as column to the dataframe
                df.loc[(test, "score"), col] = round(stat[0], 4)
                df.loc[(test, "p_value"), col] = round(stat[1], 4)

        return df
@@ -567,7 +567,7 @@
        else:
            distributions_c = lst(distributions)

        columns_c = self.branch._get_columns(columns, only_numerical=True)

        df = pd.DataFrame(
            index=pd.MultiIndex.from_product(
@@ -579,7 +579,7 @@

        for col in columns_c:
            # Drop missing values from the column before testing
            X = replace_missing(self[col], self.missing).dropna().to_numpy(dtype=float)

            for dist in distributions_c:
                # Get KS-statistic with fitted distribution parameters
@@ -634,22 +634,22 @@
        self._log("Creating EDA report...", 1)

        if isinstance(rows, str):
            rows_c = [(self.branch._get_rows(rows), rows)]
        elif isinstance(rows, sequence_t):
            rows_c = [(self.branch._get_rows(r), r) for r in rows]
        elif isinstance(rows, dict):
            rows_c = [(self.branch._get_rows(v), k) for k, v in rows.items()]

        if len(rows_c) == 1:
            self.report = sv.analyze(
                source=rows_c[0],
                target_feat=self.branch._get_target(target, only_columns=True),
            )
        elif len(rows_c) == 2:
            self.report = sv.compare(
                source=rows_c[0],
                compare=rows_c[1],
                target_feat=self.branch._get_target(target, only_columns=True),
            )
        else:
            raise ValueError(
@@ -661,12 +661,12 @@
            if (path := Path(filename)).suffix != ".html":
                path = path.with_suffix(".html")

        self.report.show_notebook(filepath=path if filename else None)

    @composed(crash, method_to_log)
    def inverse_transform(
        self,
        X: XSelector | None = None,
        y: YSelector | None = None,
        *,
        verbose: Verbose | None = None,
@@ -709,9 +709,9 @@
            Original target column. Only returned if provided.

        """
        X, y = self._check_input(X, y, columns=self.branch.features, name=self.branch.target)

        with adjust_verbosity(self.pipeline, verbose) as pipeline:
            return pipeline.inverse_transform(X, y)

    @classmethod
@@ -786,14 +786,14 @@

        if data is not None:
            # Prepare the provided data
            container, holdout = atom._get_data(data)

            # Assign the data to the original branch
            if atom._branches._og is not None:
                atom._branches._og._container = container

            # Apply transformations per branch
            for branch in atom._branches:
                if branch._container is None:
                    branch._container = deepcopy(container)
                    branch._holdout = holdout
@@ -803,22 +803,22 @@
                        f"already contains data in branch {branch.name}."
                    )

                if len(atom._branches) > 2 and branch.pipeline:
                    atom._log(f"Transforming data for branch {branch.name}:", 1)

                X_train, y_train = branch.pipeline.transform(
                    X=branch.X_train,
                    y=branch.y_train,
                    filter_train_only=False,
                )
                X_test, y_test = branch.pipeline.transform(branch.X_test, branch.y_test)

                # Update complete dataset
                branch._container.data = bk.concat(
                    [merge(X_train, y_train), merge(X_test, y_test)]
                )

                if atom._config.index is False:
                    branch._container = DataContainer(
                        data=(dataset := branch._container.data.reset_index(drop=True)),
                        train_idx=dataset.index[:len(branch._container.train_idx)],
@@ -830,7 +830,7 @@
                if branch is not atom.branch:
                    branch.store()

        atom._log(f"{atom.__class__.__name__} successfully loaded.", 1)

        return atom

@@ -883,7 +883,7 @@
            else:
                path = path.with_name(f"{self.__class__.__name__}.csv")

        self.branch._get_rows(rows).to_csv(path, **kwargs)
        self._log("Data set successfully saved.", 1)

    @composed(crash, method_to_log)
@@ -922,7 +922,7 @@
             Whether to convert all features to sparse format. The value
             that is compressed is the most frequent value in the column.
 
-        columns: int, str, segment, sequence or None, default=None
+        columns: int, str, segment, sequence, dataframe or None, default=None
             [Selection of columns][row-and-column-selection] to shrink. If
             None, transform all columns.
 
@@ -972,7 +972,7 @@
            "float": [(x.name, np.finfo(x.type).min, np.finfo(x.type).max) for x in t3],
        }

        data = self.branch.dataset[self.branch._get_columns(columns)]

        # Convert back since convert_dtypes doesn't work properly for pyarrow dtypes
        data = data.astype({n: to_pyarrow(c, inverse=True) for n, c in data.items()})
@@ -986,20 +986,20 @@

            if old_t.name.startswith("string"):
                if str2cat and column.nunique() <= int(len(column) * 0.3):
                    self.branch._data.data[name] = get_data(pd.CategoricalDtype())
                    continue

            try:
                # Get the types to look at
                t = next(v for k, v in types.items() if old_t.name.lower().startswith(k))
            except StopIteration:
                self.branch._data.data[name] = get_data(column.dtype)
                continue

            # Use bool if all values are in {0, 1} or all values are in {-1, 1}
            if int2bool and (t == types["int"] or t == types["uint"]):
                if column.isin([0, 1]).all() or column.isin([-1, 1]).all():
                    self.branch._data.data[name] = get_data(pd.BooleanDtype())
                    continue

            # Use uint if values are strictly positive
@@ -1007,7 +1007,7 @@
                t = types["uint"]

            # Find the smallest type that fits
            self.branch._data.data[name] = next(
                get_data(r[0]) for r in t if r[1] <= column.min() and r[2] >= column.max()
            )

@@ -1098,7 +1098,7 @@
    @composed(crash, method_to_log)
    def transform(
        self,
        X: XSelector | None = None,
        y: YSelector | None = None,
        *,
        verbose: Verbose | None = None,
@@ -1141,9 +1141,9 @@
            Transformed target column. Only returned if provided.

        """
        X, y = self._check_input(X, y, columns=self.og.features, name=self.og.target)

        with adjust_verbosity(self.pipeline, verbose) as pipeline:
            return pipeline.transform(X, y)

    # Base transformers ============================================ >>
@@ -1201,7 +1201,7 @@
             has the `n_jobs` and/or `random_state` parameters, it
             adopts atom's values.
 
-        columns: int, str, segment, sequence or None, default=None
+        columns: int, str, segment, sequence, dataframe or None, default=None
             Columns in the dataset to transform. If None, transform
             all features.
 
@@ -1234,7 +1234,7 @@
            transformer_c._train_only = train_only

        if columns is not None:
            cols = self.branch._get_columns(columns)
        else:
            cols = list(self.branch.features)

@@ -1271,7 +1271,7 @@
            # Check whether the fitted estimator is retrieved from cache to inform
            # the user; otherwise, the lack of printed messages could be confusing
            if self.memory.location is not None:
                if fit._is_in_cache_and_valid([*fit._get_output_identifiers(**kwargs)]):
                    self._log(
                        f"Retrieving cached results for {transformer_c.__class__.__name__}...", 1
                    )
@@ -1283,33 +1283,33 @@
            self._branches.add("og")

        if transformer_c._train_only:
            X, y = self.pipeline._mem_transform(transformer_c, self.X_train, self.y_train)
            self.train = merge(
                self.X_train if X is None else X,
                self.y_train if y is None else y,
            )
        else:
            X, y = self.pipeline._mem_transform(transformer_c, self.X, self.y)
            data = merge(self.X if X is None else X, self.y if y is None else y)

            # y can change the number of columns or remove rows -> reassign index
            self.branch._container = DataContainer(
                data=data,
                train_idx=self.branch._data.train_idx.intersection(data.index),
                test_idx=self.branch._data.test_idx.intersection(data.index),
                n_cols=self.branch._data.n_cols if y is None else len(get_cols(y)),
            )

        if self._config.index is False:
            self.branch._container = DataContainer(
                data=(data := self.dataset.reset_index(drop=True)),
                train_idx=data.index[: len(self.branch._data.train_idx)],
                test_idx=data.index[-len(self.branch._data.test_idx):],
                n_cols=self.branch._data.n_cols,
            )
            if self.branch._holdout is not None:
                self.branch._holdout.index = range(
                    len(data), len(data) + len(self.branch._holdout)
                )
        elif self.dataset.index.duplicated().any():
            raise ValueError(
@@ -1388,7 +1388,7 @@
             instance), and it has the `n_jobs` and/or `random_state`
             parameters, it adopts atom's values.
 
-        columns: int, str, segment, sequence or None, default=None
+        columns: int, str, segment, sequence, dataframe or None, default=None
             [Selection of columns][row-and-column-selection] to
             transform. Only select features or the target column, not
             both at the same time (if that happens, the target column
@@ -1463,7 +1463,7 @@
            Additional keyword arguments for the inverse function.

        """
        FunctionTransformer = self._get_est_class("FunctionTransformer", "preprocessing")

        columns = kwargs.pop("columns", None)
        transformer = FunctionTransformer(
@@ -1557,14 +1557,14 @@
        cleaner.missing_ = self.missing

        cleaner = self._add_transformer(cleaner, columns=columns)
        self.branch._mapping.update(cleaner.mapping_)

    @composed(crash, method_to_log)
    def decompose(
         self,
         *,
         model: str | Predictor | None = None,
-        mode: Literal["additive", "multiplicative"] = "additive",
+        mode: SeasonalityMode = "additive",
         **kwargs,
     ):
         """Detrend and deseasonalize the time series.
@@ -1584,9 +1584,7 @@
             * Use the `columns` parameter to only decompose the target
               column, e.g., `atom.decompose(columns=atom.target)`.
             * Use the [plot_decomposition][] method to visualize the
-              trend, seasonality and residuals of the time series. This
-              can help to determine if the data follows an additive or
-              multiplicative trend.
+              trend, seasonality and residuals of the time series.
 
         """
         columns = kwargs.pop("columns", None)
@@ -1679,7 +1677,7 @@
        )

        encoder = self._add_transformer(encoder, columns=columns)
        self.branch._mapping.update(encoder.mapping_)

    @composed(crash, method_to_log)
    def impute(
@@ -2053,7 +2051,7 @@
        columns = kwargs.pop("columns", None)
        feature_grouper = FeatureGrouper(
            groups={
                name: self.branch._get_columns(fxs, include_target=False)
                for name, fxs in groups.items()
            },
            operators=operators,
@@ -2182,7 +2180,7 @@
        trainer.run()

        # Overwrite models with the same name as new ones
        for model in trainer._models:
            if model.name in self._models:
                self._delete_models(model.name)
                self._log(
@@ -2190,7 +2188,7 @@
                    "The former model has been overwritten.", 3,
                )

        self._models.extend(trainer._models)
        self._metric = trainer._metric

    @composed(crash, method_to_log)

diff --git a/atom/data_cleaning.py b/atom/data_cleaning.py
@@ -34,7 +34,7 @@
    TomekLinks,
 )
 from scipy.stats import zscore
 from sklearn.base import BaseEstimator, _clone_parametrized
 from sklearn.compose import ColumnTransformer
 from sklearn.experimental import enable_iterative_imputer  # noqa: F401
 from sklearn.impute import IterativeImputer, KNNImputer
@@ -47,8 +47,9 @@
     Bins, Bool, CategoricalStrats, DataFrame, DiscretizerStrats, Engine,
     Estimator, FloatLargerZero, IntLargerEqualZero, IntLargerTwo,
     IntLargerZero, NJobs, NormalizerStrats, NumericalStrats, Pandas, Predictor,
-    PrunerStrats, Scalar, ScalerStrats, Sequence, Series, Transformer, Verbose,
-    XSelector, YSelector, dataframe_t, sequence_t, series_t,
+    PrunerStrats, Scalar, ScalerStrats, SeasonalityMode, Sequence, Series,
+    Transformer, Verbose, XSelector, YSelector, dataframe_t, sequence_t,
+    series_t,
 )
 from atom.utils.utils import (
     Goal, bk, composed, crash, get_col_order, get_cols, it, lst, merge,
@@ -90,7 +91,7 @@

    def fit(
        self,
        X: DataFrame | None = None,
        y: Pandas | None = None,
        **fit_params,
    ) -> Self:
@@ -133,7 +134,7 @@
    @composed(crash, method_to_log)
    def fit_transform(
        self,
        X: XSelector | None = None,
        y: YSelector | None = None,
        **fit_params,
    ) -> Pandas | tuple[DataFrame, Pandas]:
@@ -175,7 +176,7 @@
    @composed(crash, method_to_log)
    def inverse_transform(
        self,
        X: DataFrame | None = None,
        y: Pandas | None = None,
    ) -> Pandas | tuple[DataFrame, Pandas]:
        """Do nothing.
@@ -346,7 +347,7 @@
        self.kwargs = kwargs

    @composed(crash, method_to_log)
    def fit(self, X: DataFrame, y: Pandas = -1) -> Self:
        """Fit to data.

        Parameters
@@ -373,7 +374,7 @@

        """
        if isinstance(y, series_t):
            self.target_names_in_ = np.array([y.name])
        else:
            raise ValueError("The Balancer class does not support multioutput tasks.")

@@ -421,13 +422,13 @@

        # Create dict of class counts in y
        if not hasattr(self, "mapping_"):
            self.mapping_ = {str(v): v for v in y.sort_values().unique()}

        self._counts = {}
        for key, value in self.mapping_.items():
            self._counts[key] = np.sum(y == value)

        self._estimator = estimator.fit(X, y)

        # Add the estimator as attribute to the instance
        setattr(self, f"{estimator.__class__.__name__.lower()}_", self._estimator)
@@ -435,7 +436,7 @@
        return self

    @composed(crash, method_to_log)
    def transform(self, X: DataFrame, y: Pandas = -1) -> tuple[DataFrame, Series]:
        """Balance the data.

        Parameters
@@ -460,10 +461,10 @@

        """

        def log_changes(y):
            """Print the changes per target class."""
            for key, value in self.mapping_.items():
                diff = self._counts[key] - np.sum(y == value)
                if diff > 0:
                    self._log(f" --> Removing {diff} samples from class {key}.", 2)
                elif diff < 0:
@@ -473,7 +474,7 @@
            self._log(f"Oversampling with {self._estimator.__class__.__name__}...", 1)

            index = X.index  # Save indices for later reassignment
            X, y = self._estimator.fit_resample(X, y)

            # Create indices for the new samples
            n_idx: list[int | str]
@@ -498,7 +499,7 @@

            # Select chosen rows (imblearn doesn't return them in order)
            samples = sorted(self._estimator.sample_indices_)
            X, y = X.iloc[samples], y.iloc[samples]  # type: ignore[call-overload]

            log_changes(y)

@@ -506,7 +507,7 @@
            self._log(f"Balancing with {self._estimator.__class__.__name__}...", 1)

            index = X.index
            X_new, y_new = self._estimator.fit_resample(X, y)

            # Select rows kept by the undersampler
            if self._estimator.__class__.__name__ == "SMOTEENN":
@@ -515,8 +516,8 @@
                samples = sorted(self._estimator.tomek_.sample_indices_)

            # Select the remaining samples from the old dataframe
            o_samples = [s for s in samples if s < len(X)]
            X, y = X.iloc[o_samples], y.iloc[o_samples]  # type: ignore[call-overload]

            # Create indices for the new samples
            if index.dtype.kind in "ifu":
@@ -528,7 +529,7 @@
                ]

            # Select the new samples and assign the new indices
            X_new = X_new.iloc[-len(X_new) + len(o_samples):]
            X_new.index = n_idx
            y_new = y_new.iloc[-len(y_new) + len(o_samples):]
            y_new.index = n_idx
@@ -544,7 +545,7 @@
                    self._log(f" --> Removing {diff} samples from class: {key}.", 2)

            # Add the new samples to the old dataframe
            X, y = bk.concat([X, X_new]), bk.concat([y, y_new])

        return X, y

@@ -720,7 +721,7 @@
        self.encode_target = encode_target

    @composed(crash, method_to_log)
    def fit(self, X: DataFrame | None = None, y: Pandas | None = None) -> Self:
        """Fit to data.

        Parameters
@@ -747,11 +748,11 @@
            Estimator instance.

        """
        self.mapping_: dict[str, Any] = {}
        self._estimators = {}

        if not hasattr(self, "missing_"):
            self.missing_ = DEFAULT_MISSING

        self._log("Fitting Cleaner...", 1)

@@ -759,7 +760,7 @@
            if isinstance(y, series_t):
                self.target_names_in_ = np.array([y.name])
            else:
                self.target_names_in_ = y.columns.to_numpy()

            if self.drop_chars:
                if isinstance(y, series_t):
@@ -773,13 +774,13 @@
            if self.encode_target:
                for col in get_cols(y):
                    if isinstance(col.iloc[0], sequence_t):  # Multilabel
                        MultiLabelBinarizer = self._get_est_class(
                            name="MultiLabelBinarizer",
                            module="preprocessing",
                        )
                        self._estimators[col.name] = MultiLabelBinarizer().fit(col)
                    elif list(uq := np.unique(col)) != list(range(col.nunique())):
                        LabelEncoder = self._get_est_class("LabelEncoder", "preprocessing")
                        self._estimators[col.name] = LabelEncoder().fit(col)
                        self.mapping_.update({col.name: {str(it(v)): i for i, v in enumerate(uq)}})

@@ -788,7 +789,7 @@
    @composed(crash, method_to_log)
    def transform(
        self,
        X: DataFrame | None = None,
        y: Pandas | None = None,
    ) -> Pandas | tuple[DataFrame, Pandas]:
        """Apply the data cleaning steps to the data.
@@ -824,7 +825,7 @@

        if X is not None:
            # Unify all missing values
            X = replace_missing(X, self.missing_)

            for name, column in X.items():
                dtype = column.dtype.name
@@ -835,7 +836,7 @@
                        f" --> Dropping feature {name} for having a prohibited type: {dtype}.",
                        2,
                    )
                    X = X.drop(columns=name)
                    continue

                elif dtype in CAT_TYPES:
@@ -847,14 +848,14 @@

            # Drop prohibited chars from column names
            if self.drop_chars:
                X = X.rename(columns=lambda x: re.sub(self.drop_chars, "", str(x)))

            # Drop duplicate samples
            if self.drop_duplicates:
                X = X.drop_duplicates(ignore_index=True)

            if self.convert_dtypes:
                X = X.convert_dtypes()

        if y is not None:
            if self.drop_chars:
@@ -869,7 +870,7 @@
                y = replace_missing(y, self.missing_).dropna()

                if X is not None:
                    X = X[X.index.isin(y.index)]  # Select only indices that remain

                if (d := length - len(y)) > 0:
                    self._log(f" --> Dropping {d} rows with missing values in target.", 2)
@@ -909,7 +910,7 @@
    @composed(crash, method_to_log)
    def inverse_transform(
        self,
        X: DataFrame | None = None,
        y: Pandas | None = None,
    ) -> Pandas | tuple[DataFrame, Pandas]:
        """Inversely transform the label encoding.
@@ -962,7 +963,7 @@

                    # Replace encoded columns with target column
                    if isinstance(y, series_t):
                        yt = to_series(out, y.index, col)
                    else:
                        yt = merge(yt, to_series(out, y.index, col))

@@ -1083,7 +1084,7 @@
         *,
         model: str | Predictor | None = None,
         sp: IntLargerZero | None = None,
-        mode: Literal["additive", "multiplicative"] = "additive",
+        mode: SeasonalityMode = "additive",
         n_jobs: NJobs = 1,
         verbose: Verbose = 0,
         logger: str | Path | Logger | None = None,
@@ -1100,7 +1101,7 @@
        self.mode = mode

    @composed(crash, method_to_log)
    def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
        """Fit to data.

        Parameters
@@ -1126,7 +1127,7 @@
                    **{x: getattr(self, x) for x in BaseTransformer.attrs if hasattr(self, x)},
                )
                model.task = Goal.forecast.infer_task(y)
                forecaster = model._get_est({})
            else:
                raise ValueError(
                    "Invalid value for the model parameter. Unknown "
@@ -1135,7 +1136,7 @@
                        [
                            f" --> {m.__name__} ({m.acronym})"
                            for m in MODELS
                            if "forecast" in m._estimators
                        ]
                    )
                )
@@ -1146,7 +1147,7 @@

        self._log("Fitting Decomposer...", 1)

        self._estimators: dict[Hashable, tuple[Transformer, Transformer]] = {}
        for name, column in X.select_dtypes(include="number").items():
            trend = Detrender(
                forecaster=forecaster,
@@ -1163,7 +1164,7 @@
        return self

    @composed(crash, method_to_log)
    def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
        """Decompose the data.

        Parameters
@@ -1188,7 +1189,7 @@
        return X

    @composed(crash, method_to_log)
    def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
        """Inversely transform the data.

        Parameters
@@ -1389,7 +1390,7 @@
        self.labels = labels

    @composed(crash, method_to_log)
    def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
        """Fit to data.

        Parameters
@@ -1407,7 +1408,7 @@

        """

        def get_labels(col: str, bins: Sequence[Scalar]) -> tuple[str, ...]:
            """Get labels for the specified bins.

            Parameters
@@ -1426,7 +1427,7 @@
            """
            default = [
                f"({np.round(bins[i], 2)}, {np.round(bins[i + 1], 1)}]"
                for i in range(len(bins[:-1]))
            ]

            if self.labels is None:
@@ -1445,12 +1446,12 @@

            return labels

        Xt, yt = self._check_input(X, y)
        self._check_feature_names(Xt, reset=True)
        self._check_n_features(Xt, reset=True)

        self._estimators: dict[str, Estimator] = {}
        self._labels: dict[str, Sequence[str]] = {}

        self._log("Fitting Discretizer...", 1)

@@ -1467,7 +1468,7 @@
            if self.strategy != "custom":
                if isinstance(bins_c, sequence_t):
                    try:
                        bins_x = bins_c[i]  # Fetch the i-th bin for the i-th column
                    except IndexError:
                        raise ValueError(
                            "Invalid value for the bins parameter. The length of the "
@@ -1477,7 +1478,7 @@
                else:
                    bins_x = bins_c

                KBinsDiscretizer = self._get_est_class("KBinsDiscretizer", "preprocessing")

                # cuML implementation has no subsample and random_state
                kwargs: dict[str, Any] = {}
@@ -1507,7 +1508,7 @@
                else:
                    bins_c = [-np.inf, *bins_c, np.inf]

                FunctionTransformer = self._get_est_class(
                    name="FunctionTransformer",
                    module="preprocessing",
                )
@@ -1521,7 +1522,7 @@
        return self

    @composed(crash, method_to_log)
    def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
        """Bin the data into intervals.

        Parameters
@@ -1716,7 +1717,7 @@
        self.kwargs = kwargs

    @composed(crash, method_to_log)
    def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
        """Fit to data.

        Note that leaving y=None can lead to errors if the `strategy`
@@ -1746,9 +1747,9 @@
            Estimator instance.

        """
        self.mapping_ = {}
        self._to_value = {}
        self._categories = {}

        strategies = {
            "backwarddifference": BackwardDifferenceEncoder,
@@ -1801,7 +1802,7 @@
            # Replace infrequent classes with the string in `value`
            if self.infrequent_to_value:
                values = column.value_counts()
                self._to_value[name] = values[values <= infrequent_to_value].index.tolist()
                X[name] = column.replace(self._to_value[name], self.value)

            # Get the unique categories before fitting
@@ -1846,14 +1847,14 @@
            handle_unknown="value",
        )

        rest_enc = estimator(
            cols=encoders["rest"],
            handle_missing="return_nan",
            handle_unknown="value",
            **self.kwargs,
        )

        self._estimator = ColumnTransformer(
            transformers=[
                ("ordinal", ordinal_enc, encoders["ordinal"]),
                ("onehot", onehot_enc, encoders["onehot"]),
@@ -1867,7 +1868,7 @@
        return self

    @composed(crash, method_to_log)
    def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
        """Encode the data.

        Parameters
@@ -1887,7 +1888,7 @@
        self._log("Encoding categorical columns...", 1)

        # Convert infrequent classes to value
        X = X.replace(self._to_value, self.value)

        for name, categories in self._categories.items():
            if name in self._estimator.transformers_[0][2]:
@@ -1911,10 +1912,10 @@
            if uc := len(X[name].dropna()[~X[name].isin(categories)]):
                self._log(f"   --> Handling {uc} unknown classes.", 2)

        Xt = self._estimator.transform(X)

        # Drop _nan columns (since missing values are propagated)
        Xt = Xt.loc[:, ~Xt.columns.str.endswith("_nan")]

        return Xt[get_col_order(Xt, X.columns.tolist(), self._estimator.feature_names_in_)]

@@ -2101,7 +2102,7 @@
        self.max_nan_cols = max_nan_cols

    @composed(crash, method_to_log)
    def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
        """Fit to data.

        Parameters
@@ -2119,20 +2120,20 @@

        """
        if not hasattr(self, "missing_"):
            self.missing_ = DEFAULT_MISSING

        self._log("Fitting Imputer...", 1)

        # Unify all values to impute
        X = replace_missing(X, self.missing_)

        if self.max_nan_rows is not None:
            if self.max_nan_rows <= 1:
                self._max_nan_rows = int(X.shape[1] * self.max_nan_rows)
            else:
                self._max_nan_rows = int(self.max_nan_rows)

            X = X.dropna(axis=0, thresh=X.shape[1] - self._max_nan_rows)
            if X.empty:
                raise ValueError(
                    "Invalid value for the max_nan_rows parameter, got "
@@ -2147,10 +2148,10 @@
            else:
                max_nan_cols = int(self.max_nan_cols)

            X = X.drop(columns=X.columns[X.isna().sum() > max_nan_cols])

        # Load the imputer class from sklearn or cuml (note the different modules)
        SimpleImputer = self._get_est_class(
            name="SimpleImputer",
            module="preprocessing" if self.engine.get("estimator") == "cuml" else "impute",
        )
@@ -2185,11 +2186,11 @@
                fill_value=self.strat_cat,
            )

        ColumnTransformer = self._get_est_class("ColumnTransformer", "compose")

        self._estimator = ColumnTransformer(
            transformers=[
                ("num_imputer", num_imputer, list(X.select_dtypes(include="number"))),
                ("cat_imputer", cat_imputer, list(X.select_dtypes(include=CAT_TYPES))),
            ],
            remainder="passthrough",
@@ -2202,7 +2203,7 @@
    @composed(crash, method_to_log)
    def transform(
        self,
        X: DataFrame,
        y: Pandas | None = None,
    ) -> Pandas | tuple[DataFrame, Pandas]:
        """Impute the missing values.
@@ -2240,17 +2241,17 @@
        num_imputer = self._estimator.named_transformers_["num_imputer"]
        cat_imputer = self._estimator.named_transformers_["cat_imputer"]

        get_stat = lambda est, n: est.statistics_[est.feature_names_in_.tolist().index(n)]

        self._log("Imputing missing values...", 1)

        # Unify all values to impute
        X = replace_missing(X, self.missing_)

        # Drop rows with too many missing values
        if self.max_nan_rows is not None:
            length = len(X)
            X = X.dropna(axis=0, thresh=X.shape[1] - self._max_nan_rows)
            if diff := length - len(X):
                self._log(
                    f" --> Dropping {diff} samples for containing more "
@@ -2260,7 +2261,7 @@

        if self.strat_num == "drop":
            length = len(X)
            X = X.dropna(subset=self._estimator.transformers_[0][2])
            if diff := length - len(X):
                self._log(
                    f" --> Dropping {diff} samples for containing "
@@ -2270,7 +2271,7 @@

        if self.strat_cat == "drop":
            length = len(X)
            X = X.dropna(subset=self._estimator.transformers_[1][2])
            if diff := length - len(X):
                self._log(
                    f" --> Dropping {diff} samples for containing "
@@ -2288,7 +2289,7 @@
                        f"({nans * 100 // len(X)}%) missing values.",
                        2,
                    )
                    X = X.drop(columns=name)
                    continue

                if self.strat_num != "drop" and name in num_imputer.feature_names_in_:
@@ -2325,14 +2326,14 @@
                            2,
                        )

        X = self._estimator.transform(X)

        # Make y consistent with X
        if y is not None:
            y = y[y.index.isin(X.index)]

        # Reorder columns to original order
        X = X[[col for col in self.feature_names_in_ if col in X.columns]]

        return variable_return(X, y)

@@ -2487,7 +2488,7 @@
        self.kwargs = kwargs

    @composed(crash, method_to_log)
    def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
        """Fit to data.

        Parameters
@@ -2519,7 +2520,7 @@
        elif self.strategy == "quantile":
            kwargs = self.kwargs.copy()
            estimator = self._get_est_class(strategies[self.strategy], "preprocessing")
            self._estimator = estimator(
                output_distribution=kwargs.pop("output_distribution", "normal"),
                random_state=kwargs.pop("random_state", self.random_state),
                **kwargs,
@@ -2547,7 +2548,7 @@
        return self

    @composed(crash, method_to_log)
    def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
        """Apply the transformations to the data.

        Parameters
@@ -2565,14 +2566,14 @@

        """
        self._log("Normalizing features...", 1)
        Xt = self._estimator.transform(X[self._estimator.feature_names_in_])

        X.update(Xt)

        return X[self.feature_names_in_]

    @composed(crash, method_to_log)
    def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
        """Apply the inverse transformation to the data.

        Parameters
@@ -2590,8 +2591,8 @@

        """
        self._log("Inversely normalizing features...", 1)
        Xt = self._estimator.inverse_transform(X[self._estimator.feature_names_in_])
        Xt = to_df(Xt, index=X.index, columns=self._estimator.feature_names_in_)

        X.update(Xt)

@@ -2769,7 +2770,7 @@
    @composed(crash, method_to_log)
    def transform(
        self,
        X: DataFrame,
        y: Pandas | None = None,
    ) -> Pandas | tuple[DataFrame, Pandas]:
        """Apply the outlier strategy on the data.
@@ -2908,7 +2909,7 @@
            self._log(f" --> Dropping {len(mask) - sum(mask)} outliers.", 2)

            # Keep only the non-outliers from the data
            X = X[mask]
            if y is not None:
                y = y[mask]

@@ -3054,7 +3055,7 @@
        self.kwargs = kwargs

    @composed(crash, method_to_log)
    def fit(self, X: DataFrame, y: Pandas | None = None) -> Self:
        """Fit to data.

        Parameters
@@ -3091,7 +3092,7 @@
            )

        estimator = self._get_est_class(strategies[self.strategy], "preprocessing")
        self._estimator = estimator(**self.kwargs)

        self._log("Fitting Scaler...", 1)
        self._estimator.fit(num_cols)
@@ -3102,7 +3103,7 @@
        return self

    @composed(crash, method_to_log)
    def transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
        """Perform standardization by centering and scaling.

        Parameters
@@ -3120,14 +3121,14 @@

        """
        self._log("Scaling features...", 1)
        Xt = self._estimator.transform(X[self._estimator.feature_names_in_])

        X.update(Xt)

        return X

    @composed(crash, method_to_log)
    def inverse_transform(self, X: DataFrame, y: Pandas | None = None) -> DataFrame:
        """Apply the inverse transformation to the data.

        Parameters
@@ -3145,8 +3146,8 @@

        """
        self._log("Inversely scaling features...", 1)
        Xt = self._estimator.inverse_transform(X[self._estimator.feature_names_in_])
        Xt = to_df(Xt, index=X.index, columns=self._estimator.feature_names_in_)

        X.update(Xt)


diff --git a/atom/plots/basefigure.py b/atom/plots/basefigure.py
@@ -190,7 +190,7 @@ def get_elem(
         else:
             return self.style[element].setdefault(name, next(getattr(self, element)))
 
-    def showlegend(self, name: str, legend: Legend | dict | None) -> bool:
+    def showlegend(self, name: str, legend: Legend | dict[str, Any] | None) -> bool:
         """Get whether the trace should be showed in the legend.
 
         If there's already a trace with the same name, it's not

diff --git a/atom/plots/baseplot.py b/atom/plots/baseplot.py
@@ -223,7 +223,7 @@
        elif isinstance(rows, dict):
            rows_c = rows

        yield from rows_c.items()

    def _get_metric(self, metric: MetricSelector, *, max_one: Bool = False) -> list[str]:
        """Check and return the provided metric index.
@@ -392,8 +392,8 @@
         child: str | None = None,
         legend: Legend | dict[str, Any] | None = None,
         **kwargs,
-    ) -> go.Scatter:
-        """Draw a line.
+    ):
+        """Draw a line on the current figure.
 
         Unify the style to draw a line, where parent and child
         (e.g., model - data set or column - distribution) keep the
@@ -408,19 +408,16 @@
         child: str or None, default=None
             Name of the secondary attribute.
 
-        legend: str, dict or None
+        legend: str, dict or None, default=None
             Legend argument provided by the user.
 
         **kwargs
             Additional keyword arguments for the trace.
 
-        Returns
-        -------
-        go.Scatter
-            New trace to add to figure.
-
         """
-        return go.Scatter(
+        BasePlot._fig.figure.add_scatter(
+            name=kwargs.pop("name", child or parent),
+            mode=kwargs.pop("mode", "lines"),
             line=kwargs.pop(
                 "line", {
                     "width": self.line_width,
@@ -440,15 +437,14 @@
                 "hovertemplate",
                 f"(%{{x}}, %{{y}})<extra>{parent}{f' - {child}' if child else ''}</extra>",
             ),
-            name=kwargs.pop("name", child or parent),
             legendgroup=kwargs.pop("legendgroup", parent),
             legendgrouptitle=kwargs.pop(
                 "legendgrouptitle",
                 {"text": parent, "font_size": self.label_fontsize} if child else None,
             ),
             showlegend=kwargs.pop(
                 "showlegend",
-                BasePlot._fig.showlegend(f"{parent}-{child}", legend)
+                BasePlot._fig.showlegend(f"{parent}-{child}" if child else parent, legend)
             ),
             **kwargs,
         )