-
Notifications
You must be signed in to change notification settings - Fork 0
/
hmp_wgs_pathway_networks.R
166 lines (145 loc) · 6.6 KB
/
hmp_wgs_pathway_networks.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# Shared pathway network
# Here we'll create a network of taxa. Taxa will be connected if the
# two taxa are both correlated with at least 1 pathway.
# A taxon connected to many other taxa in this network may be correlated with
# lots of pathways, or it may be correlated with only pathways that are very
# popular among other taxa.
# Groups of taxa have some overlapping pathways.
# In particular, the true representation is a simplicial complex, becuase if
# n taxa all correlate with a pathway, then all n taxa will
# be connected to each other.
# We can consider picking out pathways we care about and then looking at
# the filtered network that use these pathways.
# It would also make a nice heatmap
setwd("~/Documents")
library(ggplot2)
library(reshape2)
library(viridis)
library(RColorBrewer)
library(stringr)
library(igraph)
library(MicrobiomeDB, quietly=TRUE)
library(rlist)
# Grab collections
HMP_MGX_species <- getCollection(microbiomeData::HMP_MGX, "Shotgun metagenomics Species (Relative taxonomic abundance analysis)")
HMP_MGX_pathways <- getCollection(microbiomeData::HMP_MGX, "Shotgun metagenomics Metagenome enzyme pathway abundance data" )
# Correlation
pathway_vs_species <- correlation(HMP_MGX_species, HMP_MGX_pathways, method = 'spearman')
# Filter and extract data.table. IMPORTANT, this filtering here is our definition
# of "correlated". To create the network, we must turn correlation into a binary
# thing - either a taxon is correlated with a pathway or it isn't. To make this
# more clear, when i mean correlated and passing filters, i'll write "correlated".
corr_stats <- data.table::setDT(pathway_vs_species@statistics@statistics)
corr_stats_filtered <- corr_stats[corr_stats$correlationCoef >= 0.80 & corr_stats$pValue <= 0.01, ]
# Now what we need to do is turn this list of "correlations" into a network
# I'll use for loops for clarity.
# Initialize empty data.frame
network_df <- data.frame(
taxonA = character(),
taxonB = character(),
sharedPathway = character(),
stringsAsFactors = FALSE
)
pathways <- as.character(unique(corr_stats_filtered$data2))
for (pathway in pathways) {
taxa_correlated_with_pathway <- corr_stats_filtered$data1[which(corr_stats_filtered$data2 == pathway)]
# We need at least two taxa to be correlated with the pathway to make any edges!
if (length(taxa_correlated_with_pathway) > 1){
# Add all pairs of taxa to the network data frame
for (i in 1:(length(taxa_correlated_with_pathway)-1)) {
taxon <- taxa_correlated_with_pathway[i]
for (j in (i+1):length(taxa_correlated_with_pathway)) {
# Add edge from taxon to taxa_correlated_with_pathway[j]
network_df <- rbind(network_df, data.frame(taxonA = taxon, taxonB = taxa_correlated_with_pathway[j], sharedPathway=pathway, stringsAsFactors = FALSE))
}
}
}
}
# Let's do a check to make sure we have the correct number of edges
# The incidence of each pathway gives us the size of the cliques, then we can
# calculate the number of edges from the cliques. There may be duplicates, but
# they would be the same duplicates as above
pathway_frequencies <- as.data.frame(table(corr_stats_filtered$data2))$Freq
num_edges <- 0
for (pathway_frequency in pathway_frequencies) {
# Calculate the number of edges in the clique
num_edges <- num_edges + (pathway_frequency * (pathway_frequency - 1) / 2)
}
if (num_edges == nrow(network_df)) {
print("Check complete")
} else {
print("Hey there's probably a bug....")
}
# Now make an igraph network
network <- igraph::graph_from_data_frame(network_df, directed=FALSE)
# We may have many duplicate edges. If two nodes have k edges between them, it
# means they are "correlated" with the same k pathways.
# We are not using that information at this time, so we'll simplify the graph
# by removing duplicate edges.
# simplified_network <- simplify(
# network,
# remove.multiple = TRUE
# )
E(network)$color <- as.factor(network_df$sharedPathway)
igraph::plot.igraph(
network,
arrow.mode=0,
vertex.color="white",
vertex.label.dist=1,
vertex.label.color="black",
vertex.label.degree=0,
vertex.size=2,
main="Pathway network"
)
## Let's draw some nicer graphs
component_membership <- components(network)$membership
components_node_list <- lapply(seq(1:max(component_membership)), function(i) {
return(V(network)$name[which(component_membership==i)])
})
connected_components_graphs <- lapply(seq(1:max(component_membership)), function(i) {
g <- induced_subgraph(network, components_node_list[[i]], "create_from_scratch")
})
# Plot graph and legend
for(i in seq(1:max(component_membership))) {
component_i <- connected_components_graphs[[i]]
# Find the unique pathways
component_i_pathways <- unique(E(component_i)$color)
# Create a legend data.frame that maps the pathway to a color
if (length(as.character(component_i_pathways)) > 7) {
legend_data <- data.frame(
pathway = component_i_pathways,
color = viridis(length(component_i_pathways))
)
} else {
legend_data <- data.frame(
pathway = component_i_pathways,
color = brewer.pal(length(component_i_pathways), "Set3")
)
}
# Remap the edge colors to the legend colors
E(component_i)$color <- legend_data$color[match(E(component_i)$color, legend_data$pathway)]
plot(component_i, vertex.color="white", vertex.label.dist=1, vertex.label.color="black", vertex.label.degree=0, vertex.size=4, main=paste("Component", i), edge.width=2)
legend("bottom", legend=legend_data$pathway, fill=legend_data$color)
}
## Want to know which pathways are connecting nodes?
corr_incidence <- reshape(corr_stats_filtered[, c("data1", "data2", "correlationCoef")], direction="wide", idvar="data1", timevar="data2")
# binarize corr_incidence
binarized_incidence <- corr_incidence[,-1]
binarized_incidence[is.na(binarized_incidence)] <- 0
binarized_incidence[binarized_incidence > 0] <- 1
binarized_matrix <- data.matrix(binarized_incidence, rownames.force = NA)
rownames(binarized_matrix) <- corr_incidence$data1
colnames(binarized_matrix) <- str_replace(colnames(binarized_matrix), "correlationCoef.", "")
# Reorder rows based on components
taxa_order <- unlist(components_node_list)
taxa_order <- rlist::list.reverse(taxa_order)
binarized_matrix <- binarized_matrix[taxa_order,,drop=FALSE]
# Reorder columns based on components
pathway_order <- unlist(lapply(connected_components_graphs, function(g) {
g_df <- igraph::as_data_frame(g)
return(unique(g_df$sharedPathway))
}))
pathway_order <- rlist::list.reverse(pathway_order)
binarized_matrix <- binarized_matrix[,pathway_order]
# A nicer looking heatmap
heatmap(binarized_matrix, scale = "none", col = c("white", "#6f7075"), Rowv = NA, Colv = NA, cexRow = 0.6)