-
Notifications
You must be signed in to change notification settings - Fork 0
/
pca.py
138 lines (98 loc) · 4.91 KB
/
pca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from matplotlib import pyplot as plt
# Select the default plotly renderer for this module (assigned to
# pio.renderers.default below).
# NOTE(review): per the plotly renderers documentation, joining names with "+"
# enables several renderers at once (here "jupyterlab" and "png") - confirm
# this is still the intended combination for the target environment.
rndr_type = "jupyterlab+png"
pio.renderers.default = rndr_type
class PCA:
    """Principal component analysis backed by a singular value decomposition.

    ``fit`` centers a dataset and stores its thin SVD on the instance. The
    ``transform`` / ``transform_rv`` methods center the data they are given
    and project it onto the leading right singular vectors. ``visualize``
    plots the 2-D and 3-D projections with plotly.
    """

    def __init__(self):
        # Thin-SVD factors of the centered training data; all None until fit().
        self.U = None  # (N, min(N, D)) left singular vectors
        self.S = None  # (min(N, D),) singular values, largest first
        self.V = None  # (min(N, D), D) right singular vectors, one per row

    def fit(self, X: np.ndarray) -> None:
        """Center ``X`` per feature and store its thin SVD on the instance.

        Args:
            X: (N, D) numpy array, one sample per row.

        Return:
            None

        Set:
            self.U: (N, min(N, D)) numpy array
            self.S: (min(N, D),) numpy array
            self.V: (min(N, D), D) numpy array. ``np.linalg.svd`` already
                returns the transpose of V, so each row is one principal
                direction.
        """
        centered = X - np.mean(X, axis=0)
        # full_matrices=False gives the thin SVD with the shapes listed above.
        self.U, self.S, self.V = np.linalg.svd(centered, full_matrices=False)

    def transform(self, data: np.ndarray, K: int = 2) -> np.ndarray:
        """Project ``data`` onto the first ``K`` principal directions.

        ``data`` is centered with its own per-feature mean before projection.
        ``fit`` must have been called first so that ``self.V`` is available.

        Args:
            data: (N, D) numpy array corresponding to a dataset.
            K: number of columns (principal components) to keep.

        Return:
            X_new: (N, K) numpy array, the centered data expressed in the
                first K principal directions.
        """
        centered = data - np.mean(data, axis=0)
        # Rows of self.V are directions, so transpose the first K to (D, K).
        return centered @ self.V[:K].T

    def transform_rv(
        self, data: np.ndarray, retained_variance: float = 0.99
    ) -> np.ndarray:
        """Project ``data`` onto as many components as needed to retain variance.

        K is the smallest number of leading components whose squared singular
        values sum to at least ``retained_variance`` of the total. The
        decomposition stored by ``fit`` is used as-is and is not modified.

        Args:
            data: (N, D) numpy array corresponding to a dataset.
            retained_variance: fraction of the total variance to retain.
                If the target cannot be reached (for example a value above 1),
                all components are kept.

        Return:
            X_new: (N, K) numpy array, the centered data expressed in the
                first K principal directions.
        """
        if self.S is None or self.V is None:
            # Backward compatibility: earlier versions always fitted here, so
            # calling this on an unfitted instance must keep working. An
            # already-fitted model is deliberately left untouched.
            self.fit(data)
        K = self._components_for_variance(retained_variance)
        return self.transform(data, K)

    def _components_for_variance(self, retained_variance: float) -> int:
        """Return the smallest K whose leading components reach the target."""
        cumulative = np.cumsum(self.S ** 2)
        total = cumulative[-1] if cumulative.size else 0.0
        if not total > 0:
            # No variance to explain (constant data or no components): keep a
            # single component instead of dividing by zero.
            return 1
        ratios = cumulative / total
        # ratios is non-decreasing, so a left-sided search returns the first
        # index whose ratio is >= the target, or len(ratios) if none is.
        k = int(np.searchsorted(ratios, retained_variance, side="left")) + 1
        # Clip so an unreachable target keeps every component rather than
        # silently collapsing to one.
        return min(k, ratios.size)

    def get_V(self) -> np.ndarray:
        """Getter function for value of V"""
        return self.V

    def visualize(self, X: np.ndarray, y: np.ndarray, fig_title: str) -> None:
        """Fit PCA on ``X`` and show 2-D and 3-D scatter plots of the projection.

        Points are colored by their label in ``y``. The 3-D plot is skipped
        when the decomposition has fewer than three components (for example
        when ``X`` has only two features), since no third axis exists.

        Args:
            X: (N, D) numpy array, where N is the number of instances and D is
                the dimensionality of each instance.
            y: (N,) numpy array, the true labels.
            fig_title: title used for both figures.

        Return:
            None
        """
        self.fit(X)
        # One projection serves both plots: the 2-D view is its first two columns.
        reduced = self.transform(X, K=3)

        frame = pd.DataFrame(reduced[:, :2], columns=["x_val", "y_val"])
        frame["label"] = y
        fig_2d = px.scatter(
            frame, x="x_val", y="y_val", color="label", title=f"{fig_title}"
        )
        fig_2d.show()

        if reduced.shape[1] < 3:
            # Not enough components for a third axis.
            return

        frame["z_val"] = reduced[:, 2]
        fig_3d = px.scatter_3d(
            frame,
            x="x_val",
            y="y_val",
            z="z_val",
            color="label",
            title=f"{fig_title}",
        )
        fig_3d.show()