Skip to content

Commit

Permalink
Merge pull request #206 from AmandaKwok28/main
Browse files Browse the repository at this point in the history
EC1
  • Loading branch information
JEFworks authored Mar 15, 2024
2 parents 7b9db92 + 99cc4d1 commit 2f5fc4b
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 10 deletions.
45 changes: 35 additions & 10 deletions _posts/2024-02-14-akwok1.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,15 @@ Note: some additional plots were included to show how the gene fits into the k m

While creating plots, I used multiple test correction to correct for the p values found. I also used the wilcox test to determine which genes were most likely a part of cluster 9 in order to find upregulated differentially expressed genes within that cluster.

Note: The change I made to this homework was a replacement of the 4th graph titled Cluster Cell 9 where I color cluster 9 to show it's location among the other cells to make it more apparent that it indeed is a cluster. From this you can see it's relatively spread out but in a way that indicates that intuitively, it makes sense that this gene cluster CD93 is a cluster of endothelial cells (since only endothelial cells would be basically everywhere).


NIH article: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8900912/
Source for cell population type: https://www.proteinatlas.org/ENSG00000125810-CD93


### Code:
# Homework 4
### Code: (Updated)
# hw4 redo

# Load data
data <- read.csv(paste('C:/Users/Amand/OneDrive/Documents/JHU-undergrad',
Expand All @@ -55,14 +58,13 @@ gexpnorm <- log10(gexp/rowSums(gexp) * mean(rowSums(gexp))+1)

# pca
pcs <- prcomp(gexpnorm)
plot(pcs$sdev[1:100])

# tsne
set.seed(123)
emb <- Rtsne(pcs$x[,1:20])$Y

# kmeans clustering on tSNE embedding: first 20 pcs for processing speed -------
set.seed(123)
# kmeans + totw
set.seed(0)
k <- kmeans(pcs$x[,1:20], centers = 13)
com <- as.factor(k$cluster)
p1 <- ggplot(data.frame(emb,com)) +
Expand All @@ -71,7 +73,7 @@ p1 <- ggplot(data.frame(emb,com)) +
theme(plot.title = element_text(hjust = 0.5))
p1

c = 9 # setting cluster of interest
c = 10 # setting cluster of interest

# figure out which gene is most responsible for each cluster (lowest pvs) ------
results2 <- sapply(1:13, function(i) {
Expand Down Expand Up @@ -109,11 +111,27 @@ p4 <- ggplot(df3) +
theme(plot.title = element_text(hjust = 0.5)) + theme_bw()
p4

# revised p4 -------------------------------------------------------------------
tmp <- ggplot(data.frame(gexpnorm, com, pos)) +
geom_point(aes(x = aligned_x, y = aligned_y, col = com))

tmp # looks like cluster 5 is interesting...

gexp_cluster9 <- gexpnorm[com == c, ]
df3 <- data.frame(pos, gexpnorm, com)
p4 <- ggplot(df3) +
geom_point(aes(x = aligned_x, y = aligned_y, col = ifelse(com == 9, "CD93", "other")), size = 0.2) +
labs(title = "Cell Cluster 9",
color = 'Cells With') +
scale_color_manual(values = c('red', 'grey')) +
theme(plot.title = element_text(hjust = 0.5)) + theme_bw()
p4

# pca on cluster 8 -------------------------------------------------------------
pc9 <- prcomp(gexp_cluster9)
plot(pc8$sdev[1:100])
plot(pc9$sdev[1:100])

df4 <- data.frame(pc8$x)
df4 <- data.frame(pc9$x)
p5 <- ggplot(df4) +
geom_point(aes(x = PC1, y = PC2)) +
theme_bw() + labs(title = "PCA Cluster 9") +
Expand All @@ -124,7 +142,7 @@ p5
emb2 <- Rtsne(pc9$x)$Y
df5 <- data.frame(pc9$x, emb2)
p6 <- ggplot(df5) +
geom_point(aes(x = X1, y = X2)) + theme_bw() + labs(title = "tSNE Cluster 9") +
geom_point(aes(x = X1, y = X2), size = 1) + theme_bw() + labs(title = "tSNE Cluster 9") +
theme(plot.title = element_text(hjust = 0.5))
p6

Expand All @@ -151,7 +169,7 @@ top_genes <- names(top_adjusted[top_adjusted < 0.05])

df6 <- data.frame(gexp_cluster9, pos[com == c, ], emb2, gene = gexp_cluster9$CD93)
p7 <- ggplot(df6) +
geom_point(aes(x = X1, y = X2, col = gene)) +
geom_point(aes(x = X1, y = X2, col = gene), size = 1) +
theme_bw() + labs(title = "CD93 Expression Cluster 9") +
theme(plot.title = element_text(hjust = 0.5))

Expand Down Expand Up @@ -181,3 +199,10 @@ p8
# final figure -----------------------------------------------------------------
p1 + p2 + p3 + p4 + p6 + p7 + p8 + plot_layout(ncol = 4)








89 changes: 89 additions & 0 deletions _posts/2024-03-06-akwok1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
---
layout: post
title: "Dimensionality Reduction using GGanimate"
author: Amanda Kwok
jhed: akwok1
categories: [ HW EC1 ]
image: homework/hwEC1/akwok1.gif
featured: false
---

### Description of figure
In the gif, I visualized the pca (linear) dimensionality reduction in 2D space which transitions into the tSNE (nonlinear) dimensionality reduciton in 2D space. The axes for the pcs are PC1 and PC2 since those principle components capture the most variance in the data. The two kinds of dimensionality reduction seem to show that they can tell you drastically different things about a dataset. The tSNE seems to suggest there are several clusters/cell types or bundles of genes that are related whereas the PCA tells you that there are distinctly 2 groups in the data. Whether or not there are further groupings in those 2 groups isn't told by the PCA. The benefit of pca is that you can be confident that your results are correct in the sense that the dimension reduction is linear which means it preserves spatial relationships. On the other hand, the tSNE does not necessarily preserve spatial relationships and aims more to minimize the KL divergence between distributions which could lead to incorrectly interpretted results.


### Code:
# gganimate EC

# clear everything
rm(list = ls())

# load data
data <- read.csv(paste('C:/Users/Amand/OneDrive/Documents/JHU-undergrad',
'/Junior Year/Sem2/Genomic Data/Amanda-K/data/pikachu.csv.gz', sep = ''),
row.names = 1)

# load relevant Libraries
library(ggplot2)
library(gganimate)
library(Rtsne)
library(patchwork)

# normalize the data
pos <- data[,4:5]
gexp <- data[,6:ncol(data)]

gexp[1:5,1:5]
gexpnorm <- log10(gexp/rowSums(gexp) * mean(rowSums(gexp))+1)
hist(rowSums(gexpnorm))

# linear dimensionality reduction (pca)
pcs <- prcomp(gexpnorm)
df <- data.frame(pcs$x)
p1 <- ggplot(df) + geom_point(aes(x=PC1, y=PC2))

p1

# nonlinear dimensionality reduction (emb)
set.seed(123)
emb <- Rtsne(gexpnorm)$Y
df2 <- data.frame(emb)
p2 <- ggplot(df2) + geom_point(aes(x=X1, y=X2))

p2


# use gganimate
pc12 <- df[,1:2]
colnames(pc12) <- c('Dim1', 'Dim2')
tsne <- df2
colnames(tsne) <- c('Dim1', 'Dim2')

df3 <- rbind(cbind(pc12, order="PCA"),
cbind(tsne, order="tSNE"))
df3$order <- factor(df3$order)

# Create animated plot
animated_plot <- ggplot(df3) +
geom_point(aes(Dim1,Dim2)) +
labs(title = "Linear vs. NonLinear \n Dimensionality Reduction") +
transition_states(
order,
transition_length = 2,
state_length = 1
) +
view_follow() +
theme(plot.title = element_text(hjust=0.5)) # center the title


animated_plot
anim_save("EC1.gif", animated_plot, path = 'C:/Users/Amand/OneDrive/Documents/JHU-undergrad/Junior Year/Sem2/Genomic Data/EC')









Binary file modified homework/hw4/hw4_akwok1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added homework/hwEC1/akwok1.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 2f5fc4b

Please sign in to comment.