Skip to content

Commit

Permalink
Merge pull request #201 from ShailiTripathi/main
Browse files Browse the repository at this point in the history
hw4 for ec
  • Loading branch information
JEFworks authored Mar 15, 2024
2 parents f5ce37a + de81ee5 commit c8b75c9
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 64 deletions.
130 changes: 66 additions & 64 deletions _posts/2024-02-14-stripat9.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ External Source: The Human Protein Altas (https://www.proteinatlas.org/ENSG00000
library(ggplot2)
library(Rtsne)
library(patchwork)
library(gganimate)
library(lattice)
library(ggrepel)
library(gridExtra)
data <- read.csv('/Users/shailitripathi/Downloads/GENOMIC/Homework/pikachu.csv.gz', row.names=1)
dim(data)
Expand All @@ -35,71 +39,68 @@ gexp <- data[,6:ncol(data)]
gexpnorm <- log10(gexp/rowSums(gexp) * mean(rowSums(gexp)) + 1)
# PCA
# PCA
pcs <- prcomp(gexpnorm)
plot(pcs$sdev[1:100])
# TSNE
plot(pcs$sdev[1:30], type = 'l') # plot variance of first 30 pcs
pc_plot <- ggplot(data.frame(pcs$x)) + geom_point(aes(x = PC1, y = PC2)) +
geom_hline(yintercept = 0, linetype = "solid", color = "black") +
geom_vline(xintercept = 0, linetype = "solid", color = "black") +
theme_minimal()
pc_plot
emb <- Rtsne(gexpnorm)$Y
# K-MEANS
# TOTAL WITHINESS
set.seed(123)
results <- sapply(seq(2, 30, by = 1), function(i) {
print(i)
com <- kmeans(emb, centers = i, iter.max = 20)
totw <- sapply(1:20, function(i) {
com <- kmeans(pcs$x[, 1:5], centers = i)
return(com$tot.withinss)
})
results
plot(results, type = "l")
plot(totw, type = 'l')
# K-MEANS
set.seed(123)
com <- as.factor(kmeans(pcs$x[,1:20], centers=10)$cluster)
com <- kmeans(pcs$x[, 1:5], centers = 10)
cluster_labels <- as.factor(com$cluster)
k_means <- ggplot(data.frame(emb, com)) +
geom_point(aes(x=X1, y=X2, col = com), size = 0.03) +
ggtitle("10 Clusters") +
pca_cluster <- ggplot(data.frame(pcs$x[, 1:2], com = cluster_labels)) +
geom_point(aes(x = PC1, y = PC2, col = com), size = 0.5) +
labs(title = "Clusters in PCA Space") +
geom_hline(yintercept = 0, linetype = "solid", color = "black") +
geom_vline(xintercept = 0, linetype = "solid", color = "black") +
theme_minimal()
k_means
# CLUSTER OF INTEREST - physical space
actual_cluster <- ggplot(data.frame(pos, com = cluster_labels)) +
geom_point(aes(x = aligned_x, y = aligned_y, col = com), size = 0.25) +
labs(title = "Clusters in Actual Space")
cluster_data <- data[com==10, ]
cluster <- ggplot(cluster_data, aes(x = aligned_x, y = aligned_y)) +
geom_point(color = 'cornflowerblue', size = 0.5) +
ggtitle("Cluster 10") +
theme_minimal()
cluster
pca_cluster + actual_cluster
# CLUSTERS OF INTEREST
# CLUSTER OF INTEREST - reduced dimensional space: pca
cluster_data <- data[cluster_labels==4, ]
cluster4 <- ggplot(cluster_data, aes(x = aligned_x, y = aligned_y)) +
geom_point(color = 'chartreuse3', size = 0.5) +
ggtitle("Cluster 1") +
theme_minimal()
cluster_gexp <- gexpnorm[row.names(gexpnorm) %in% row.names(cluster_data), ]
cluster_pcs <- prcomp(cluster_gexp)
cluster_pca <- ggplot(data.frame(cluster_pcs$x[, 1:2]), aes(x = PC1, y = PC2)) +
geom_point(size = 0.5, color = "cornflowerblue") +
labs(x = "PC1", y = "PC2", title = "PCA of Cluster 10") +
cluster_colors <- ifelse(cluster_labels == 4, "Cluster 4", "Other Clusters")
cluster4_pca <- ggplot(data.frame(pcs$x[, 1:2], com = cluster_colors)) +
geom_point(aes(x = PC1, y = PC2, color = com), size = 0.5) +
labs(title = "Cluster 1 in PCA Space") +
geom_hline(yintercept = 0, linetype = "solid", color = "black") +
geom_vline(xintercept = 0, linetype = "solid", color = "black") +
theme_minimal()
cluster_pca
scale_color_manual(values = c("Cluster 4" = "chartreuse3", "Other Clusters" = "gray")) +
theme_minimal()
# DIFFERENTIALLY EXPRESSED GENES - Wilcox test
results <- sapply(colnames(gexpnorm), function(g) {
wilcox.test(gexpnorm[com == 10, g],
gexpnorm[com != 10, g],
wilcox.test(gexpnorm[cluster_labels == 4, g],
gexpnorm[cluster_labels != 4, g],
alternative = "greater")$p.val
})
results
names(sort(results[results < 0.05], decreasing = FALSE))[1:10]
top_10_genes <- names(sort(results[results < 0.05], decreasing = FALSE))[1:10]
percent_expression <- sapply(top_10_genes, function(gene) {
mean(cluster_data[gene] > 0) * 100
Expand All @@ -110,41 +111,42 @@ plot_data <- data.frame(
)
dif_exp <- ggplot(plot_data, aes(x = Gene, y = Percent_Expression)) +
geom_bar(stat = "identity", fill = "cyan4") +
labs(x = "Top 10 Genes", y = "% of Cells Expressing Gene", title = "Differentially Expressed Genes in Cluster 10") +
geom_bar(stat = "identity", fill = "chartreuse4") +
labs(x = "Top 10 Genes", y = "% of Cells Expressing Gene", title = "Differentially Expressed Genes in Cluster 1") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
dif_exp
# CDH1 - physical space
cdh1_data <- data.frame(data, com, gene=gexpnorm[, 'CDH1'])
# CDH1
cdh1 <- ggplot(cdh1_data, aes(x = aligned_x, y = aligned_y)) +
geom_point(color = 'cyan4', size = 0.1) +
CDH1_data <- data.frame(data, cluster_labels, gene=gexpnorm[, 'CDH1'])
CDH1 <- ggplot(CDH1_data, aes(x = aligned_x, y = aligned_y, color = CDH1)) +
geom_point(size = 0.5) +
ggtitle("CDH1") +
scale_color_gradient(low = "white", high = "chartreuse4" ) +
theme_minimal()
cdh1
# CDH1 - reduced dimensional space: pca
pca_result <- prcomp(cdh1_data[, c("aligned_x", "aligned_y", "gene")], scale. = TRUE)
pca_scores <- as.data.frame(pca_result$x)
pca_scores$CDH1 <- cdh1_data$CDH1 # Source: ChatGPT
cdh1_pca <- ggplot(pca_scores, aes(x = PC1, y = PC2, color = CDH1)) +
geom_point(size = 0.5, color = "cyan4") +
ggtitle("PCA of CDH1 ") +
CDH1_colors <- ifelse(gexpnorm[, 'CDH1'] > 0, "CDH1", "No CDH1")
CDH1_pca <- ggplot(data.frame(pcs$x[, 1:2], com = CDH1_colors)) +
geom_point(aes(x = PC1, y = PC2, color = com), size = 0.5) +
labs(title = "CDH1 in PCA Space") +
geom_hline(yintercept = 0, linetype = "solid", color = "black") +
geom_vline(xintercept = 0, linetype = "solid", color = "black") +
theme_minimal()
cdh1_pca
scale_color_manual(values = c("CDH1" = "chartreuse4", "No CDH1" = "gray")) +
theme_minimal()
# DATA VISUALIZATION
k_means + dif_exp +
cluster + cluster_pca +
cdh1 + cdh1_pca +
plot_layout(ncol = 2, widths = c(1, 1))
lay <- rbind(c(1,2),
c(3,4),
c(5, 5),
c(6, 7))
grid.arrange(actual_cluster, pca_cluster,
cluster4, cluster4_pca,
dif_exp,
CDH1, CDH1_pca,
layout_matrix = lay,
top = "Gene Expression in Pickachu Data")
```
68 changes: 68 additions & 0 deletions _posts/2024-03-05-stripat9.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
---
layout: post
title: "Normalizing and Transforming Gene Expression"
author: Shaili Tripathi
jhed: stripat9
categories: [ hw EC1 ]
image: homework/hwEC1/shailitripathi.gif
featured: false
---

### Description of my visualization:

To explore the question of what happens if you do or do not normalize and transform gene expression data prior to dimensionallity reduction, I created a gif that shows the dimensionality reduction with and without normalization and transformation. I created a plot of principal component analysis done on the original gene expression data, one on normalized gene expression data (gexp / rowSums(gexp) * mean(rowSums(gexp)) + 1), and one on the log transform of the normalized data, and then combined them in an animation to see where each point goes as a result. From the animation, we can see that un-normalized data has biases that can lead to skewed results. This would make it difficult to compare samples and the expression of various genes accurately. The transformation also help with accurate analysis as it ensures that highly expressed genes do not dominate the analysis, allowing us to capture the underlying structure with PCA more easily.

### The entire code used to generate the figure so that it can be reproduced.
```{r}
library(ggplot2)
library(Rtsne)
library(gganimate)
data <- read.csv('/Users/shailitripathi/Downloads/GENOMIC/Homework/pikachu.csv.gz', row.names=1)
data[1:5, 1:5]
pos <- data[,4:5]
geneofinterest <- data$ERBB2
# Data
gexp <- data[6:ncol(data)]
gexpnorm <- gexp / rowSums(gexp) * mean(rowSums(gexp)) + 1
gexpnorm_log <- log10(gexpnorm)
# Plots
pc1 <- prcomp(gexp)
df1 <- data.frame(pc1$x[,1:2], gene = geneofinterest)
colnames(df1) <- c('PC_1', 'PC_2', 'ERBB2')
ggplot(df1) +
geom_point(aes(PC_1, y=PC_2, col=ERBB2)) +
labs(title = "PCs of Original Data") +
scale_color_gradient(low = "darkorange3", high = "cornflowerblue")
pc2 <- prcomp(gexpnorm)
df2 <- data.frame(pc2$x[,1:2], gene = geneofinterest)
colnames(df2) <- c('PC_1', 'PC_2', 'ERBB2')
ggplot(df2) +
geom_point(aes(PC_1, y=PC_2, col=ERBB2)) +
labs(title = "PCs of Normalized Data") +
scale_color_gradient(low = "darkorange3", high = "cornflowerblue")
pc3 <- prcomp(gexpnorm_log)
df3 <- data.frame(pc3$x[,1:2], gene = geneofinterest)
colnames(df3) <- c('PC_1', 'PC_2', 'ERBB2')
ggplot(df3) +
geom_point(aes(PC_1, y=PC_2, col=ERBB2)) +
labs(title = "PCs of Log-transformed Normalized Data") +
scale_color_gradient(low = "darkorange3", high = "cornflowerblue")
# Combine Plots Statically
df <- rbind(cbind(df1, order=1),
cbind(df2, order=2),
cbind(df3, order=3))
p <- ggplot(df) +
geom_point(aes(x=PC_1, y=PC_2, col=ERBB2)) +
scale_color_gradient(low = "darkorange3", high = "cornflowerblue")
# Combine Plots as Animation
animated_plot <- p + transition_states(order) + view_follow()
anim_save("shailitripathi.gif", animate(animated_plot))
```
Binary file modified homework/hw4/hw4_stripat9.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added homework/hwEC1/shailitripathi.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit c8b75c9

Please sign in to comment.