\section*{Mixture Models}
\subsection*{Gaussian Mixtures}
$P(x) = \sum_i w_i\,\mathcal{N}(x;\ \mu_i, \Sigma_i)$ with latent $z$: $P(z{=}i)=w_i$, $P(x\mid z{=}i)=\mathcal{N}(x;\ \mu_i,\Sigma_i)$\\
MLE is a nonconvex problem $\rightarrow$ EM.
\subsection*{Hard-EM}
\textbf{E-step: } Compute \\$z_i^{(t)} = \argmax_z P(z\mid x_i, \theta^{(t-1)})\\
\hspace*{1.4mm}= \argmax_z P(z\mid \theta^{(t-1)})\, P(x_i\mid z,\theta^{(t-1)})\\
\stackrel{\text{GMM}}{=}\argmax_z\ w_z^{(t-1)}\mathcal{N}(x_i;\ \mu_z^{(t-1)},\Sigma_z^{(t-1)})$\\
\textbf{M-step: }
$\theta^{(t)} = \argmax_\theta P(x_{1:n},z^{(t)}_{1:n}\mid \theta)$\\
Hard-EM converges to a local maximum of the complete-data likelihood
$P(x_{1:n},z_{1:n}\mid \theta)$, maximized jointly over $z_{1:n}$ and $\theta$. It tends to do poorly if clusters overlap.
Hard-EM for a GMM with uniform weights $w_z=\frac{1}{k}$ and $\Sigma_z=\sigma^2 I$
is equivalent to k-means (Lloyd's algorithm).
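A minimal NumPy sketch of the Hard-EM E-step for a GMM (illustrative only; \texttt{ws}, \texttt{mus}, \texttt{Sigmas} are assumed to hold the current $w$, $\mu$, $\Sigma$):
\begin{verbatim}
# Illustrative sketch of the Hard-EM E-step.
import numpy as np
from scipy.stats import multivariate_normal

def hard_e_step(X, ws, mus, Sigmas):
    # log w_z + log N(x_i; mu_z, Sigma_z) for every point and component
    logp = np.stack([np.log(ws[j])
                     + multivariate_normal.logpdf(X, mus[j], Sigmas[j])
                     for j in range(len(ws))], axis=1)
    return np.argmax(logp, axis=1)   # hard assignments z_i
\end{verbatim}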
\subsection*{Soft-EM}
\textbf{E-step:}\newline Compute the distribution
of $Z\mid x,\theta^{(t-1)}$, i.e.\ for each $x$ the responsibilities
\begin{equation*}
\begin{aligned}
\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!\!
P_{\theta^{(t-1)}}(Z=j\mid x) &= \frac{P(Z=j)P(x\mid Z=j)}{P(x)} \\
&\stackrel{\text{GMM}}{=}
\frac{w_j \mathcal{N}(x;\ \mu_j,\Sigma_j)}{\sum_{l=1}^k w_l \mathcal{N}(x;\ \mu_l,\Sigma_l)} =: \gamma_j(x).
\end{aligned}
\end{equation*}
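The same computation as a NumPy sketch (a numerically stable softmax over the log joint probabilities; names are illustrative):
\begin{verbatim}
# Illustrative sketch of the Soft-EM E-step (responsibilities).
import numpy as np
from scipy.stats import multivariate_normal

def soft_e_step(X, ws, mus, Sigmas):
    logp = np.stack([np.log(ws[j])
                     + multivariate_normal.logpdf(X, mus[j], Sigmas[j])
                     for j in range(len(ws))], axis=1)
    logp -= logp.max(axis=1, keepdims=True)          # numerical stability
    gamma = np.exp(logp)
    return gamma / gamma.sum(axis=1, keepdims=True)  # gamma[i, j] = P(Z_i=j | x_i)
\end{verbatim}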
\textbf{M-step:}
\begin{equation*}
\begin{aligned}
\theta^{(t)}&=\argmax_\theta
\mathbb{E}_{Z_{1:n}\mid x_{1:n},\theta^{(t-1)}}
\big[\log P_\theta(x_{1:n},Z_{1:n})\big]\\
&\stackrel{\text{iid \& cond.\ ind.}}{=}
\sum_{i=1}^n \mathbb{E}_{Z_i\mid x_i,\theta^{(t-1)}}
\big[\log P_\theta(x_i,Z_i)\big] \\
&= \sum_{i=1}^n \sum_{j=1}^k P_{\theta^{(t-1)}}(Z_i=j\mid x_i)\log P_\theta(x_i,Z_i=j)
\end{aligned}
\end{equation*}
GMM M-step, with responsibilities $\gamma_j^{(t)}(x_i) := P_{\theta^{(t-1)}}(Z_i=j\mid x_i)$ from the E-step:\\
$w_j^{(t)} \leftarrow \frac{1}{n} \sum_{i} \gamma_j^{(t)}(x_i)$;\\
$\mu_j^{(t)} \leftarrow \frac{\sum_{i} \gamma_j^{(t)}(x_i)\, x_i}{\sum_{i} \gamma_j^{(t)}(x_i)}$;\\
$\Sigma_j^{(t)} \leftarrow \frac{\sum_{i} \gamma_j^{(t)}(x_i)\, (x_i - \mu_j^{(t)}) (x_i - \mu_j^{(t)})^T}{\sum_{i} \gamma_j^{(t)}(x_i)}$ (optionally $+\,\nu^2 I$ as regularization)\\
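A matching NumPy sketch of these updates (illustrative; \texttt{gamma} is the $n\times k$ responsibility matrix from the E-step, \texttt{nu2} the optional $\nu^2$):
\begin{verbatim}
# Illustrative sketch of the GMM M-step updates.
import numpy as np

def m_step(X, gamma, nu2=0.0):
    n, d = X.shape
    Nk = gamma.sum(axis=0)                  # effective cluster sizes
    ws = Nk / n                             # weight update
    mus = (gamma.T @ X) / Nk[:, None]       # mean update
    Sigmas = []
    for j in range(gamma.shape[1]):
        Xc = X - mus[j]
        Sigmas.append((gamma[:, j, None] * Xc).T @ Xc / Nk[j]
                      + nu2 * np.eye(d))    # covariance (+ nu^2 I regularizer)
    return ws, mus, np.stack(Sigmas)
\end{verbatim}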
The number of clusters $k$ can be selected via cross-validation, e.g.\ on held-out log-likelihood.
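For instance, $k$ could be chosen by held-out log-likelihood (a sketch using scikit-learn's \texttt{GaussianMixture}; variable names are assumptions):
\begin{verbatim}
# Illustrative sketch: pick k by held-out average log-likelihood.
from sklearn.mixture import GaussianMixture

def select_k(X_train, X_val, k_max=10):
    scores = {k: GaussianMixture(n_components=k).fit(X_train).score(X_val)
              for k in range(1, k_max + 1)}
    return max(scores, key=scores.get)
\end{verbatim}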
For GMMs, EM converges to a local maximum of the marginal likelihood; the result depends on the initialization.
\subsection*{Semi-Supervised Learning w/ GMMs}
For labeled points $(x,y)$, fix the responsibilities in the E-step to
$P_{\theta^{(t-1)}}(Z=j\mid x) = 1_{\{j = y\}}$.
\\
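In code this amounts to overwriting the responsibility rows of labeled points with one-hot indicators (a minimal sketch; names are illustrative):
\begin{verbatim}
# Illustrative sketch: clamp responsibilities of labeled points.
import numpy as np

def clamp_labels(gamma, labeled_idx, labels):
    gamma[labeled_idx] = np.eye(gamma.shape[1])[labels]  # one-hot rows
    return gamma
\end{verbatim}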