hmp_wgs_pathway_networks.R

# Shared pathway network
# Here we'll create a network of taxa. Taxa will be connected if the
# two taxa are both correlated with at least 1 pathway.
# A taxon connected to many other taxa in this network may be correlated with 
# lots of pathways, or it may be correlated with only pathways that are very
# popular among other taxa. 
# Groups of taxa have some overlapping pathways. 
# In particular, the true representation is a simplicial complex, becuase if 
# n taxa all correlate with a pathway, then all n taxa will
# be connected to each other.
# We can consider picking out pathways we care about and then looking at
# the filtered network that use these pathways.
# It would also make a nice heatmap

setwd("~/Documents")
library(ggplot2)
library(reshape2)
library(viridis)
library(RColorBrewer)
library(stringr)
library(igraph)
library(MicrobiomeDB, quietly=TRUE)
library(rlist)

# Grab collections
HMP_MGX_species <- getCollection(microbiomeData::HMP_MGX, "Shotgun metagenomics Species (Relative taxonomic abundance analysis)")
HMP_MGX_pathways <- getCollection(microbiomeData::HMP_MGX, "Shotgun metagenomics Metagenome enzyme pathway abundance data" )

# Correlation
pathway_vs_species <- correlation(HMP_MGX_species, HMP_MGX_pathways, method = 'spearman')

# Filter and extract data.table. IMPORTANT, this filtering here is our definition
# of "correlated". To create the network, we must turn correlation into a binary
# thing - either a taxon is correlated with a pathway or it isn't. To make this
# more clear, when i mean correlated and passing filters, i'll write "correlated".
corr_stats <- data.table::setDT(pathway_vs_species@statistics@statistics)
corr_stats_filtered <- corr_stats[corr_stats$correlationCoef >= 0.80 & corr_stats$pValue <= 0.01, ]


# Now what we need to do is turn this list of "correlations" into a network
# I'll use for loops for clarity.
# Initialize empty data.frame
network_df <- data.frame(
  taxonA = character(),
  taxonB = character(),
  sharedPathway = character(),
  stringsAsFactors = FALSE
)

pathways <- as.character(unique(corr_stats_filtered$data2))
for (pathway in pathways) {
  taxa_correlated_with_pathway <- corr_stats_filtered$data1[which(corr_stats_filtered$data2 == pathway)]
  # We need at least two taxa to be correlated with the pathway to make any edges!
  if (length(taxa_correlated_with_pathway) > 1){
    # Add all pairs of taxa to the network data frame
    for (i in 1:(length(taxa_correlated_with_pathway)-1)) {
      taxon <- taxa_correlated_with_pathway[i]
      for (j in (i+1):length(taxa_correlated_with_pathway)) {
        # Add edge from taxon to taxa_correlated_with_pathway[j]
        network_df <- rbind(network_df, data.frame(taxonA = taxon, taxonB = taxa_correlated_with_pathway[j], sharedPathway=pathway, stringsAsFactors = FALSE))
      }
    }
  }
}

# Let's do a check to make sure we have the correct number of edges
# The incidence of each pathway gives us the size of the cliques, then we can
# calculate the number of edges from the cliques. There may be duplicates, but 
# they would be the same duplicates as above
pathway_frequencies <- as.data.frame(table(corr_stats_filtered$data2))$Freq
num_edges <- 0
for (pathway_frequency in pathway_frequencies) {
  # Calculate the number of edges in the clique
  num_edges <- num_edges + (pathway_frequency * (pathway_frequency - 1) / 2)
}
if (num_edges == nrow(network_df)) {
  print("Check complete")
} else {
  print("Hey there's probably a bug....")
}

# Now make an igraph network
network <- igraph::graph_from_data_frame(network_df, directed=FALSE)
# We may have many duplicate edges. If two nodes have k edges between them, it
# means they are "correlated" with the same k pathways. 
# We are not using that information at this time, so we'll simplify the graph
# by removing duplicate edges.
# simplified_network <- simplify(
#   network,
#   remove.multiple = TRUE
# )
E(network)$color <- as.factor(network_df$sharedPathway)

igraph::plot.igraph(
  network,
  arrow.mode=0,
  vertex.color="white",
  vertex.label.dist=1,
  vertex.label.color="black",
  vertex.label.degree=0,
  vertex.size=2,
  main="Pathway network"
)


## Let's draw some nicer graphs
component_membership <- components(network)$membership
components_node_list <- lapply(seq(1:max(component_membership)), function(i) {
  return(V(network)$name[which(component_membership==i)])
})
connected_components_graphs <- lapply(seq(1:max(component_membership)), function(i) {
  g <- induced_subgraph(network, components_node_list[[i]], "create_from_scratch")
})


# Plot graph and legend
for(i in seq(1:max(component_membership))) {
  component_i <- connected_components_graphs[[i]]
  # Find the unique pathways
  component_i_pathways <- unique(E(component_i)$color)
  
  # Create a legend data.frame that maps the pathway to a color
  if (length(as.character(component_i_pathways)) > 7) {
    legend_data <- data.frame(
      pathway = component_i_pathways,
      color = viridis(length(component_i_pathways))
    )
  } else {
    legend_data <- data.frame(
      pathway = component_i_pathways,
      color = brewer.pal(length(component_i_pathways), "Set3")
    )
  }
  # Remap the edge colors to the legend colors
  E(component_i)$color <- legend_data$color[match(E(component_i)$color, legend_data$pathway)]
  plot(component_i, vertex.color="white", vertex.label.dist=1, vertex.label.color="black", vertex.label.degree=0, vertex.size=4, main=paste("Component", i), edge.width=2)
  legend("bottom", legend=legend_data$pathway, fill=legend_data$color)
}


## Want to know which pathways are connecting nodes?
corr_incidence <- reshape(corr_stats_filtered[, c("data1", "data2", "correlationCoef")], direction="wide", idvar="data1", timevar="data2")
# binarize corr_incidence
binarized_incidence <- corr_incidence[,-1]
binarized_incidence[is.na(binarized_incidence)] <- 0
binarized_incidence[binarized_incidence > 0] <- 1
binarized_matrix <- data.matrix(binarized_incidence, rownames.force = NA)
rownames(binarized_matrix) <- corr_incidence$data1
colnames(binarized_matrix) <- str_replace(colnames(binarized_matrix), "correlationCoef.", "")
# Reorder rows based on components
taxa_order <- unlist(components_node_list)
taxa_order <- rlist::list.reverse(taxa_order)
binarized_matrix <- binarized_matrix[taxa_order,,drop=FALSE]
# Reorder columns based on components
pathway_order <- unlist(lapply(connected_components_graphs, function(g) {
  g_df <- igraph::as_data_frame(g)
  return(unique(g_df$sharedPathway))
}))
pathway_order <- rlist::list.reverse(pathway_order)
binarized_matrix <- binarized_matrix[,pathway_order]

# A nicer looking heatmap
heatmap(binarized_matrix, scale = "none", col = c("white", "#6f7075"), Rowv = NA, Colv = NA, cexRow = 0.6)