Skip to content

Commit

Permalink
Merge pull request #181 from MaxHalford/support-unseen-values-in-mca
Browse files Browse the repository at this point in the history
Handle unseen values in MCA
  • Loading branch information
MaxHalford authored Nov 17, 2024
2 parents 0644d88 + cba2c9f commit 731204c
Show file tree
Hide file tree
Showing 6 changed files with 1,198 additions and 888 deletions.
1,946 changes: 1,087 additions & 859 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion prince/famd.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def row_coordinates(self, X):
prop = X_cat.sum() / X_cat.sum().sum() * 2
X_cat = X_cat.sub(X_cat.mean(axis="rows")).div(prop**0.5, axis="columns")

Z = pd.concat([X_num, X_cat], axis=1)
Z = pd.concat([X_num, X_cat.sparse.to_dense()], axis=1).fillna(0.0)

return super().row_coordinates(Z)

Expand Down
8 changes: 4 additions & 4 deletions prince/mca.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def __init__(
random_state=None,
engine="sklearn",
one_hot=True,
handle_unknown="error",
):
super().__init__(
n_components=n_components,
Expand All @@ -33,12 +32,12 @@ def __init__(
engine=engine,
)
self.one_hot = one_hot
self.handle_unknown = handle_unknown

def _prepare(self, X):
if self.one_hot:
# Create the one-hot encoder if it doesn't exist (usually because we're in the fit method)
X = pd.get_dummies(X, columns=X.columns)
X = pd.get_dummies(X, columns=X.columns, prefix_sep="__")
if (one_hot_columns_ := getattr(self, "one_hot_columns_", None)) is not None:
X = X.reindex(columns=one_hot_columns_.union(X.columns), fill_value=False)
return X

def get_feature_names_out(self, input_features=None):
Expand All @@ -62,6 +61,7 @@ def fit(self, X, y=None):

# One-hot encode the data
one_hot = self._prepare(X)
self.one_hot_columns_ = one_hot.columns

# We need the number of columns to apply the Greenacre correction
self.J_ = one_hot.shape[1]
Expand Down
8 changes: 4 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
[tool.poetry]
name = "prince"
version = "0.13.1"
version = "0.14.0"
description = "Factor analysis in Python: PCA, CA, MCA, MFA, FAMD, GPA"
authors = ["Max Halford <[email protected]>"]
license = "MIT"

[tool.poetry.dependencies]
python = "^3.9"
scikit-learn = "^1.0.2"
pandas = "^1.4.1 || ^2.0.0"
altair = "^4.2.2 || ^5.0.0"
scikit-learn = "^1.5.1"
pandas = "^2.2.0"
altair = "^5.0.0"

[tool.poetry.group.dev.dependencies]
nbconvert = "^7.2.9"
Expand Down
30 changes: 30 additions & 0 deletions tests/test_famd.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,33 @@ def test_col_contrib(self):
F = load_df_from_R("famd$var$contrib")
P = self.famd.column_contributions_
np.testing.assert_allclose(F, P * 100)


def test_issue_169():
    """Doctest: fit FAMD on the first three rows, then transform several row slices.

    The slices transformed are rows 0-2, rows 0-1, and row 3 (which was not part of the fit).

    NOTE(review): the dict literal below repeats the key 'var2'. Python keeps only the last
    value for a repeated key, so the categorical list ['x', 'y', 'y', 'z'] is discarded and
    the frame ends up with just 'var1' (strings) and 'var2' (floats). The expected outputs
    were presumably produced with that two-column frame; renaming the second key (e.g. to
    'var3') would require regenerating them. TODO confirm intent.

    https://github.com/MaxHalford/prince/issues/169
    >>> import pandas as pd
    >>> from prince import FAMD
    >>> df = pd.DataFrame({'var1':['c', 'a', 'b','c'], 'var2':['x','y','y','z'],'var2': [0.,10.,30.4,0.]})
    >>> famd = FAMD(n_components=2, random_state=42)
    >>> famd = famd.fit(df[:3])
    >>> famd.transform(df[0:3])
    component 0 1
    0 -1.303760 -0.658334
    1 -0.335621 0.981047
    2 1.639381 -0.322713
    >>> famd.transform(df[0:2])
    component 0 1
    0 -1.000920 -0.669274
    1 -0.092001 0.669274
    >>> famd.transform(df[3:])
    component 0 1
    3 -0.869173 -1.215925e-16
    """
92 changes: 72 additions & 20 deletions tests/test_mca.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ def test_col_coords(self):
if self.sup_cols:
F = pd.concat((F, load_df_from_R("ca$quali.sup$coord")))
P = self.ca.column_coordinates(self.dataset)
np.testing.assert_allclose(F.abs(), P.abs())
# Prince adds a prefix to each column. We need to remove it in order to align the rows
# of the two dataframes
P.index = [idx.split("__", 1)[1] for idx in P.index]
np.testing.assert_allclose(F.abs(), P.abs().loc[F.index])
else:
super().test_col_coords()

Expand All @@ -74,7 +77,10 @@ def test_col_cos2(self):
if self.sup_cols:
F = pd.concat((F, load_df_from_R("ca$quali.sup$cos2")))
P = self.ca.column_cosine_similarities(self.dataset)
np.testing.assert_allclose(F, P)
# Prince adds a prefix to each column. We need to remove it in order to align the rows
# of the two dataframes
P.index = [idx.split("__", 1)[1] for idx in P.index]
np.testing.assert_allclose(F, P.loc[F.index])
else:
super().test_col_cos2()

Expand All @@ -89,23 +95,23 @@ def test_with_and_without_one_hot():
>>> mca = prince.MCA(n_components=2, one_hot=True, engine="scipy")
>>> mca = mca.fit(df)
>>> mca.transform(df).round(2).abs().sort_index(axis='columns')
0 1
0 2.0 0.00
1 0.5 0.65
2 0.5 0.65
3 0.5 0.65
4 0.5 1.94
0 1
0 0.00 2.0
1 0.65 0.5
2 0.65 0.5
3 0.65 0.5
4 1.94 0.5
>>> mca = prince.MCA(n_components=2, one_hot=False, engine="scipy")
>>> one_hot = pd.get_dummies(df, columns=['foo', 'bar'])
>>> mca = mca.fit(one_hot)
>>> mca.transform(one_hot).round(2).abs().sort_index(axis='columns')
0 1
0 2.0 0.00
1 0.5 0.65
2 0.5 0.65
3 0.5 0.65
4 0.5 1.94
0 1
0 0.00 2.0
1 0.65 0.5
2 0.65 0.5
3 0.65 0.5
4 1.94 0.5
"""

Expand All @@ -122,12 +128,12 @@ def test_issue_131():
>>> mca = prince.MCA(engine="scipy")
>>> mca = mca.fit(df)
>>> mca.transform(df).round(2).abs().sort_index(axis='columns')
0 1
0 2.0 0.00
1 0.5 0.65
2 0.5 0.65
3 0.5 0.65
4 0.5 1.94
0 1
0 0.00 2.0
1 0.65 0.5
2 0.65 0.5
3 0.65 0.5
4 1.94 0.5
>>> mca.K_, mca.J_
(2, 8)
Expand Down Expand Up @@ -185,3 +191,49 @@ def test_type_doesnt_matter():

for i in range(len(outputs) - 1):
np.testing.assert_allclose(outputs[i], outputs[i + 1])


# Inline CSV fixture parsed in test_issue_161 via pd.read_csv(io.StringIO(...), index_col=0).
# Every record ends with an escaped "\n" in addition to the literal line break, so the string
# contains an empty line after each record (and begins with one). Parsing therefore depends on
# read_csv ignoring blank lines, which is its default behaviour (skip_blank_lines=True).
issue_161_data = """
,category,userid,location,applicationname,browser\n
0,Portal Login,[email protected],"San Jose, CA, United States",A,Chrome\n
1,Application Access,[email protected],"San Jose, CA, United States",B,Other\n
2,Application Access,[email protected],"San Jose, CA, United States",C,Other\n
3,Portal Login,[email protected],"San Diego, CA, United States",A,Chrome\n
"""


def test_issue_161():
    """Doctest: fit MCA on the first three rows of issue_161_data, then transform the fourth.

    Row 3 carries the location 'San Diego, CA, United States', which does not occur in
    rows 0-2, so the final transform call is given a category value absent from the data
    used for fitting.

    https://github.com/MaxHalford/prince/issues/161
    >>> import io
    >>> data = pd.read_csv(io.StringIO(issue_161_data), index_col=0)
    >>> mca = prince.MCA(
    ... n_components=10,
    ... n_iter=3,
    ... copy=True,
    ... check_input=True,
    ... engine='sklearn',
    ... random_state=42
    ... )
    >>> mca = mca.fit(data[:3])
    >>> mca.eigenvalues_summary
    eigenvalue % of variance % of variance (cumulative)
    component
    0 0.673 67.32% 67.32%
    1 0.327 32.68% 100.00%
    >>> mca.row_coordinates(data[:3])
    0 1
    0 1.120811 -0.209242
    1 -0.820491 -0.571660
    2 -0.300320 0.780902
    >>> mca.transform(data[3:])
    0 1
    3 1.664888 -0.640285
    """

0 comments on commit 731204c

Please sign in to comment.