Merge pull request #206 from AmandaKwok28/main

EC1
JEFworks-Lab · Mar 15, 2024 · 2f5fc4b · 2f5fc4b
2 parents 7b9db92 + 99cc4d1
commit 2f5fc4b
Show file tree

Hide file tree

Showing 4 changed files with 124 additions and 10 deletions.
diff --git a/_posts/2024-02-14-akwok1.md b/_posts/2024-02-14-akwok1.md
@@ -28,12 +28,15 @@ Note: some additional plots were included to show how the gene fits into the k m
 
   While creating plots, I used multiple test correction to correct for the p values found. I also used the wilcox test to determine which genes were most likely a part of cluster 9 in order to find upregulated differentially expressed genes within that cluster. 
 
+  Note: The change I made to this homework was a replacement of the 4th graph titled Cluster Cell 9 where I color cluster 9 to show it's location among the other cells to make it more apparent that it indeed is a cluster. From this you can see it's relatively spread out but in a way that indicates that intuitively, it makes sense that this gene cluster CD93 is a cluster of endothelial cells (since only endothelial cells would be basically everywhere).
+
+
   NIH article:  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8900912/
   Source for cell population type: https://www.proteinatlas.org/ENSG00000125810-CD93
 
 
-### Code: 
-# Homework 4
+### Code: (Updated)
+# hw4 redo
 
 # Load data
 data <- read.csv(paste('C:/Users/Amand/OneDrive/Documents/JHU-undergrad',
@@ -55,14 +58,13 @@ gexpnorm <- log10(gexp/rowSums(gexp) * mean(rowSums(gexp))+1)
 
 # pca
 pcs <- prcomp(gexpnorm)
-plot(pcs$sdev[1:100])
 
 # tsne
 set.seed(123)
 emb <- Rtsne(pcs$x[,1:20])$Y
 
-# kmeans clustering on tSNE embedding: first 20 pcs for processing speed -------
-set.seed(123)
+# kmeans + totw
+set.seed(0)
 k <- kmeans(pcs$x[,1:20], centers = 13)
 com <- as.factor(k$cluster)
 p1 <- ggplot(data.frame(emb,com)) + 
@@ -71,7 +73,7 @@ p1 <- ggplot(data.frame(emb,com)) +
   theme(plot.title = element_text(hjust = 0.5))
 p1
 
-c = 9  # setting cluster of interest
+c = 10  # setting cluster of interest
 
 # figure out which gene is most responsible for each cluster (lowest pvs) ------
 results2 <- sapply(1:13, function(i) {
@@ -109,11 +111,27 @@ p4 <- ggplot(df3) +
   theme(plot.title = element_text(hjust = 0.5)) + theme_bw()
 p4
 
+# revised p4 -------------------------------------------------------------------
+tmp <- ggplot(data.frame(gexpnorm, com, pos)) + 
+  geom_point(aes(x = aligned_x, y = aligned_y, col = com))
+
+tmp # looks like cluster 5 is interesting...
+
+gexp_cluster9 <- gexpnorm[com == c, ]
+df3 <- data.frame(pos, gexpnorm, com)
+p4 <- ggplot(df3) +
+  geom_point(aes(x = aligned_x, y = aligned_y, col = ifelse(com == 9, "CD93", "other")), size = 0.2) +
+  labs(title = "Cell Cluster 9",
+       color = 'Cells With') +
+  scale_color_manual(values = c('red', 'grey')) +
+  theme(plot.title = element_text(hjust = 0.5)) + theme_bw()
+p4
+
 # pca on cluster 8 -------------------------------------------------------------
 pc9 <- prcomp(gexp_cluster9)
-plot(pc8$sdev[1:100])
+plot(pc9$sdev[1:100])
 
-df4 <- data.frame(pc8$x)
+df4 <- data.frame(pc9$x)
 p5 <- ggplot(df4) +
   geom_point(aes(x = PC1, y = PC2)) +
   theme_bw() + labs(title = "PCA Cluster 9") +
@@ -124,7 +142,7 @@ p5
 emb2 <- Rtsne(pc9$x)$Y
 df5 <- data.frame(pc9$x, emb2) 
 p6 <- ggplot(df5) +
-  geom_point(aes(x = X1, y = X2)) + theme_bw() + labs(title = "tSNE Cluster 9") +
+  geom_point(aes(x = X1, y = X2), size = 1) + theme_bw() + labs(title = "tSNE Cluster 9") +
   theme(plot.title = element_text(hjust = 0.5))
 p6
 
@@ -151,7 +169,7 @@ top_genes <- names(top_adjusted[top_adjusted < 0.05])
 
 df6 <- data.frame(gexp_cluster9, pos[com == c, ], emb2, gene = gexp_cluster9$CD93)
 p7 <- ggplot(df6) + 
-  geom_point(aes(x = X1, y = X2, col = gene)) +
+  geom_point(aes(x = X1, y = X2, col = gene), size = 1) +
   theme_bw() + labs(title = "CD93 Expression Cluster 9") +
   theme(plot.title = element_text(hjust = 0.5))
 
@@ -181,3 +199,10 @@ p8
 # final figure -----------------------------------------------------------------
 p1 + p2 + p3 + p4 + p6 + p7 + p8 + plot_layout(ncol = 4)
 
+
+
+
+
+
+
+
diff --git a/_posts/2024-03-06-akwok1.md b/_posts/2024-03-06-akwok1.md
@@ -0,0 +1,89 @@
+---
+layout: post
+title:  "Dimensionality Reduction using GGanimate"
+author: Amanda Kwok
+jhed: akwok1
+categories: [ HW EC1 ]
+image: homework/hwEC1/akwok1.gif
+featured: false
+---
+
+### Description of figure
+In the gif, I visualized the pca (linear) dimensionality reduction in 2D space which transitions into the tSNE (nonlinear) dimensionality reduciton in 2D space. The axes for the pcs are PC1 and PC2 since those principle components capture the most variance in the data. The two kinds of dimensionality reduction seem to show that they can tell you drastically different things about a dataset. The tSNE seems to suggest there are several clusters/cell types or bundles of genes that are related whereas the PCA tells you that there are distinctly 2 groups in the data. Whether or not there are further groupings in those 2 groups isn't told by the PCA. The benefit of pca is that you can be confident that your results are correct in the sense that the dimension reduction is linear which means it preserves spatial relationships. On the other hand, the tSNE does not necessarily preserve spatial relationships and aims more to minimize the KL divergence between distributions which could lead to incorrectly interpretted results. 
+
+
+### Code:
+# gganimate EC
+
+# clear everything
+rm(list = ls())
+
+# load data
+data <- read.csv(paste('C:/Users/Amand/OneDrive/Documents/JHU-undergrad',
+               '/Junior Year/Sem2/Genomic Data/Amanda-K/data/pikachu.csv.gz', sep = ''), 
+         row.names = 1)
+
+# load relevant Libraries
+library(ggplot2)
+library(gganimate)
+library(Rtsne)
+library(patchwork)
+
+# normalize the data
+pos <- data[,4:5]
+gexp <- data[,6:ncol(data)]
+
+gexp[1:5,1:5]
+gexpnorm <- log10(gexp/rowSums(gexp) * mean(rowSums(gexp))+1)
+hist(rowSums(gexpnorm))
+
+# linear dimensionality reduction (pca)
+pcs <- prcomp(gexpnorm)
+df <- data.frame(pcs$x)
+p1 <- ggplot(df) + geom_point(aes(x=PC1, y=PC2))
+
+p1
+
+# nonlinear dimensionality reduction (emb)
+set.seed(123)
+emb <- Rtsne(gexpnorm)$Y
+df2 <- data.frame(emb)
+p2 <- ggplot(df2) + geom_point(aes(x=X1, y=X2))
+
+p2
+
+
+# use gganimate
+pc12 <- df[,1:2]
+colnames(pc12) <- c('Dim1', 'Dim2')
+tsne <- df2
+colnames(tsne) <- c('Dim1', 'Dim2')
+
+df3 <- rbind(cbind(pc12, order="PCA"), 
+                   cbind(tsne, order="tSNE"))
+df3$order <- factor(df3$order)
+
+# Create animated plot
+animated_plot <- ggplot(df3) +
+  geom_point(aes(Dim1,Dim2)) +
+  labs(title = "Linear vs. NonLinear \n Dimensionality Reduction") +
+  transition_states(
+    order,
+    transition_length = 2,
+    state_length = 1
+  ) +
+  view_follow() +
+  theme(plot.title = element_text(hjust=0.5)) # center the title
+
+
+animated_plot
+anim_save("EC1.gif", animated_plot, path = 'C:/Users/Amand/OneDrive/Documents/JHU-undergrad/Junior Year/Sem2/Genomic Data/EC')
+
+
+
+
+
+
+
+
+
diff --git a/homework/hw4/hw4_akwok1.png b/homework/hw4/hw4_akwok1.png
diff --git a/homework/hwEC1/akwok1.gif b/homework/hwEC1/akwok1.gif