-
-
Notifications
You must be signed in to change notification settings - Fork 184
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #181 from MaxHalford/support-unseen-values-in-mca
Handle unseen values in MCA
- Loading branch information
Showing
6 changed files
with
1,198 additions
and
888 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,15 @@ | ||
[tool.poetry] | ||
name = "prince" | ||
version = "0.13.1" | ||
version = "0.14.0" | ||
description = "Factor analysis in Python: PCA, CA, MCA, MFA, FAMD, GPA" | ||
authors = ["Max Halford <[email protected]>"] | ||
license = "MIT" | ||
|
||
[tool.poetry.dependencies] | ||
python = "^3.9" | ||
scikit-learn = "^1.0.2" | ||
pandas = "^1.4.1 || ^2.0.0" | ||
altair = "^4.2.2 || ^5.0.0" | ||
scikit-learn = "^1.5.1" | ||
pandas = "^2.2.0" | ||
altair = "^5.0.0" | ||
|
||
[tool.poetry.group.dev.dependencies] | ||
nbconvert = "^7.2.9" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,7 +64,10 @@ def test_col_coords(self): | |
if self.sup_cols: | ||
F = pd.concat((F, load_df_from_R("ca$quali.sup$coord"))) | ||
P = self.ca.column_coordinates(self.dataset) | ||
np.testing.assert_allclose(F.abs(), P.abs()) | ||
# Prince adds a prefix to each column. We need to remove it in order to align the rows | ||
# of the two dataframes | ||
P.index = [idx.split("__", 1)[1] for idx in P.index] | ||
np.testing.assert_allclose(F.abs(), P.abs().loc[F.index]) | ||
else: | ||
super().test_col_coords() | ||
|
||
|
@@ -74,7 +77,10 @@ def test_col_cos2(self): | |
if self.sup_cols: | ||
F = pd.concat((F, load_df_from_R("ca$quali.sup$cos2"))) | ||
P = self.ca.column_cosine_similarities(self.dataset) | ||
np.testing.assert_allclose(F, P) | ||
# Prince adds a prefix to each column. We need to remove it in order to align the rows | ||
# of the two dataframes | ||
P.index = [idx.split("__", 1)[1] for idx in P.index] | ||
np.testing.assert_allclose(F, P.loc[F.index]) | ||
else: | ||
super().test_col_cos2() | ||
|
||
|
@@ -89,23 +95,23 @@ def test_with_and_without_one_hot(): | |
>>> mca = prince.MCA(n_components=2, one_hot=True, engine="scipy") | ||
>>> mca = mca.fit(df) | ||
>>> mca.transform(df).round(2).abs().sort_index(axis='columns') | ||
0 1 | ||
0 2.0 0.00 | ||
1 0.5 0.65 | ||
2 0.5 0.65 | ||
3 0.5 0.65 | ||
4 0.5 1.94 | ||
0 1 | ||
0 0.00 2.0 | ||
1 0.65 0.5 | ||
2 0.65 0.5 | ||
3 0.65 0.5 | ||
4 1.94 0.5 | ||
>>> mca = prince.MCA(n_components=2, one_hot=False, engine="scipy") | ||
>>> one_hot = pd.get_dummies(df, columns=['foo', 'bar']) | ||
>>> mca = mca.fit(one_hot) | ||
>>> mca.transform(one_hot).round(2).abs().sort_index(axis='columns') | ||
0 1 | ||
0 2.0 0.00 | ||
1 0.5 0.65 | ||
2 0.5 0.65 | ||
3 0.5 0.65 | ||
4 0.5 1.94 | ||
0 1 | ||
0 0.00 2.0 | ||
1 0.65 0.5 | ||
2 0.65 0.5 | ||
3 0.65 0.5 | ||
4 1.94 0.5 | ||
""" | ||
|
||
|
@@ -122,12 +128,12 @@ def test_issue_131(): | |
>>> mca = prince.MCA(engine="scipy") | ||
>>> mca = mca.fit(df) | ||
>>> mca.transform(df).round(2).abs().sort_index(axis='columns') | ||
0 1 | ||
0 2.0 0.00 | ||
1 0.5 0.65 | ||
2 0.5 0.65 | ||
3 0.5 0.65 | ||
4 0.5 1.94 | ||
0 1 | ||
0 0.00 2.0 | ||
1 0.65 0.5 | ||
2 0.65 0.5 | ||
3 0.65 0.5 | ||
4 1.94 0.5 | ||
>>> mca.K_, mca.J_ | ||
(2, 8) | ||
|
@@ -185,3 +191,49 @@ def test_type_doesnt_matter(): | |
|
||
for i in range(len(outputs) - 1): | ||
np.testing.assert_allclose(outputs[i], outputs[i + 1]) | ||
|
||
|
||
issue_161_data = """ | ||
,category,userid,location,applicationname,browser\n | ||
0,Portal Login,[email protected],"San Jose, CA, United States",A,Chrome\n | ||
1,Application Access,[email protected],"San Jose, CA, United States",B,Other\n | ||
2,Application Access,[email protected],"San Jose, CA, United States",C,Other\n | ||
3,Portal Login,[email protected],"San Diego, CA, United States",A,Chrome\n | ||
""" | ||
|
||
|
||
def test_issue_161(): | ||
""" | ||
https://github.com/MaxHalford/prince/issues/161 | ||
>>> import io | ||
>>> data = pd.read_csv(io.StringIO(issue_161_data), index_col=0) | ||
>>> mca = prince.MCA( | ||
... n_components=10, | ||
... n_iter=3, | ||
... copy=True, | ||
... check_input=True, | ||
... engine='sklearn', | ||
... random_state=42 | ||
... ) | ||
>>> mca = mca.fit(data[:3]) | ||
>>> mca.eigenvalues_summary | ||
eigenvalue % of variance % of variance (cumulative) | ||
component | ||
0 0.673 67.32% 67.32% | ||
1 0.327 32.68% 100.00% | ||
>>> mca.row_coordinates(data[:3]) | ||
0 1 | ||
0 1.120811 -0.209242 | ||
1 -0.820491 -0.571660 | ||
2 -0.300320 0.780902 | ||
>>> mca.transform(data[3:]) | ||
0 1 | ||
3 1.664888 -0.640285 | ||
""" |