Merge pull request #181 from MaxHalford/support-unseen-values-in-mca

Handle unseen values in MCA
MaxHalford · Nov 17, 2024 · 731204c
2 parents 0644d88 + cba2c9f
commit 731204c
Show file tree

Hide file tree

Showing 6 changed files with 1,198 additions and 888 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/prince/famd.py b/prince/famd.py
@@ -112,7 +112,7 @@ def row_coordinates(self, X):
         prop = X_cat.sum() / X_cat.sum().sum() * 2
         X_cat = X_cat.sub(X_cat.mean(axis="rows")).div(prop**0.5, axis="columns")
 
-        Z = pd.concat([X_num, X_cat], axis=1)
+        Z = pd.concat([X_num, X_cat.sparse.to_dense()], axis=1).fillna(0.0)
 
         return super().row_coordinates(Z)
 

diff --git a/prince/mca.py b/prince/mca.py
@@ -22,7 +22,6 @@ def __init__(
         random_state=None,
         engine="sklearn",
         one_hot=True,
-        handle_unknown="error",
     ):
         super().__init__(
             n_components=n_components,
@@ -33,12 +32,12 @@ def __init__(
             engine=engine,
         )
         self.one_hot = one_hot
-        self.handle_unknown = handle_unknown
 
     def _prepare(self, X):
         if self.one_hot:
-            # Create the one-hot encoder if it doesn't exist (usually because we're in the fit method)
-            X = pd.get_dummies(X, columns=X.columns)
+            X = pd.get_dummies(X, columns=X.columns, prefix_sep="__")
+            if (one_hot_columns_ := getattr(self, "one_hot_columns_", None)) is not None:
+                X = X.reindex(columns=one_hot_columns_.union(X.columns), fill_value=False)
         return X
 
     def get_feature_names_out(self, input_features=None):
@@ -62,6 +61,7 @@ def fit(self, X, y=None):
 
         # One-hot encode the data
         one_hot = self._prepare(X)
+        self.one_hot_columns_ = one_hot.columns
 
         # We need the number of columns to apply the Greenacre correction
         self.J_ = one_hot.shape[1]

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,15 +1,15 @@
 [tool.poetry]
 name = "prince"
-version = "0.13.1"
+version = "0.14.0"
 description = "Factor analysis in Python: PCA, CA, MCA, MFA, FAMD, GPA"
 authors = ["Max Halford <[email protected]>"]
 license = "MIT"
 
 [tool.poetry.dependencies]
 python = "^3.9"
-scikit-learn = "^1.0.2"
-pandas = "^1.4.1 || ^2.0.0"
-altair = "^4.2.2 || ^5.0.0"
+scikit-learn = "^1.5.1"
+pandas = "^2.2.0"
+altair = "^5.0.0"
 
 [tool.poetry.group.dev.dependencies]
 nbconvert = "^7.2.9"

diff --git a/tests/test_famd.py b/tests/test_famd.py
@@ -96,3 +96,33 @@ def test_col_contrib(self):
         F = load_df_from_R("famd$var$contrib")
         P = self.famd.column_contributions_
         np.testing.assert_allclose(F, P * 100)
+
+
+def test_issue_169():
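+    """FAMD.transform must handle rows that lack categories seen during fit.
+
+    https://github.com/MaxHalford/prince/issues/169
+    NOTE(review): the dict below repeats the key 'var2'; only the numeric values are kept.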
+    >>> import pandas as pd
+    >>> from prince import FAMD
+    >>> df = pd.DataFrame({'var1':['c', 'a', 'b','c'], 'var2':['x','y','y','z'],'var2': [0.,10.,30.4,0.]})
+
+    >>> famd = FAMD(n_components=2, random_state=42)
+    >>> famd = famd.fit(df[:3])
+
+    >>> famd.transform(df[0:3])
+    component         0         1
+    0         -1.303760 -0.658334
+    1         -0.335621  0.981047
+    2          1.639381 -0.322713
+
+    >>> famd.transform(df[0:2])
+    component         0         1
+    0         -1.000920 -0.669274
+    1         -0.092001  0.669274
+
+    >>> famd.transform(df[3:])
+    component         0             1
+    3         -0.869173 -1.215925e-16
+
+    """
diff --git a/tests/test_mca.py b/tests/test_mca.py
@@ -64,7 +64,10 @@ def test_col_coords(self):
             if self.sup_cols:
                 F = pd.concat((F, load_df_from_R("ca$quali.sup$coord")))
             P = self.ca.column_coordinates(self.dataset)
-            np.testing.assert_allclose(F.abs(), P.abs())
+            # Prince adds a prefix to each column. We need to remove it in order to align the rows
+            # of the two dataframes
+            P.index = [idx.split("__", 1)[1] for idx in P.index]
+            np.testing.assert_allclose(F.abs(), P.abs().loc[F.index])
         else:
             super().test_col_coords()
 
@@ -74,7 +77,10 @@ def test_col_cos2(self):
             if self.sup_cols:
                 F = pd.concat((F, load_df_from_R("ca$quali.sup$cos2")))
             P = self.ca.column_cosine_similarities(self.dataset)
-            np.testing.assert_allclose(F, P)
+            # Prince adds a prefix to each column. We need to remove it in order to align the rows
+            # of the two dataframes
+            P.index = [idx.split("__", 1)[1] for idx in P.index]
+            np.testing.assert_allclose(F, P.loc[F.index])
         else:
             super().test_col_cos2()
 
@@ -89,23 +95,23 @@ def test_with_and_without_one_hot():
     >>> mca = prince.MCA(n_components=2, one_hot=True, engine="scipy")
     >>> mca = mca.fit(df)
     >>> mca.transform(df).round(2).abs().sort_index(axis='columns')
-         0     1
-    0  2.0  0.00
-    1  0.5  0.65
-    2  0.5  0.65
-    3  0.5  0.65
-    4  0.5  1.94
+          0    1
+    0  0.00  2.0
+    1  0.65  0.5
+    2  0.65  0.5
+    3  0.65  0.5
+    4  1.94  0.5
 
     >>> mca = prince.MCA(n_components=2, one_hot=False, engine="scipy")
     >>> one_hot = pd.get_dummies(df, columns=['foo', 'bar'])
     >>> mca = mca.fit(one_hot)
     >>> mca.transform(one_hot).round(2).abs().sort_index(axis='columns')
-         0     1
-    0  2.0  0.00
-    1  0.5  0.65
-    2  0.5  0.65
-    3  0.5  0.65
-    4  0.5  1.94
+          0    1
+    0  0.00  2.0
+    1  0.65  0.5
+    2  0.65  0.5
+    3  0.65  0.5
+    4  1.94  0.5
 
     """
 
@@ -122,12 +128,12 @@ def test_issue_131():
     >>> mca = prince.MCA(engine="scipy")
     >>> mca = mca.fit(df)
     >>> mca.transform(df).round(2).abs().sort_index(axis='columns')
-         0     1
-    0  2.0  0.00
-    1  0.5  0.65
-    2  0.5  0.65
-    3  0.5  0.65
-    4  0.5  1.94
+          0    1
+    0  0.00  2.0
+    1  0.65  0.5
+    2  0.65  0.5
+    3  0.65  0.5
+    4  1.94  0.5
 
     >>> mca.K_, mca.J_
     (2, 8)
@@ -185,3 +191,49 @@ def test_type_doesnt_matter():
 
     for i in range(len(outputs) - 1):
         np.testing.assert_allclose(outputs[i], outputs[i + 1])
+
+
+issue_161_data = """
+,category,userid,location,applicationname,browser\n
+0,Portal Login,[email protected],"San Jose, CA, United States",A,Chrome\n
+1,Application Access,[email protected],"San Jose, CA, United States",B,Other\n
+2,Application Access,[email protected],"San Jose, CA, United States",C,Other\n
+3,Portal Login,[email protected],"San Diego, CA, United States",A,Chrome\n
+"""
+
+
+def test_issue_161():
+    """MCA.transform must accept category values that were not seen during fit.
+
+    https://github.com/MaxHalford/prince/issues/161
+
+    >>> import io
+    >>> data = pd.read_csv(io.StringIO(issue_161_data), index_col=0)
+
+    >>> mca = prince.MCA(
+    ...     n_components=10,
+    ...     n_iter=3,
+    ...     copy=True,
+    ...     check_input=True,
+    ...     engine='sklearn',
+    ...     random_state=42
+    ... )
+    >>> mca = mca.fit(data[:3])
+
+    >>> mca.eigenvalues_summary
+              eigenvalue % of variance % of variance (cumulative)
+    component
+    0              0.673        67.32%                     67.32%
+    1              0.327        32.68%                    100.00%
+
+    >>> mca.row_coordinates(data[:3])
+              0         1
+    0  1.120811 -0.209242
+    1 -0.820491 -0.571660
+    2 -0.300320  0.780902
+
+    >>> mca.transform(data[3:])
+              0         1
+    3  1.664888 -0.640285
+
+    """