-
Notifications
You must be signed in to change notification settings - Fork 3
database_creation
First we retrieve the STRING protein-protein interaction data (http://string-db.org/), perform an initial thresholding to only keep the high confidence data and load it into Neo4j. We first need to make sure that the neo4j-shell
command can reach the appropriate Neo4j database and that the latest version of hwhelper
is installed.
suppressPackageStartupMessages(library(hwhelper))
library(igraph)
library(Matrix)
if (file.exists("9606.protein.links.v9.1.txt.gz") == F){
system("wget http://string-db.org/newstring_download/protein.links.v9.1/9606.protein.links.v9.1.txt.gz")
}
string.ppi <- read.delim("9606.protein.links.v9.1.txt.gz", sep="", stringsAsFactors=F)
names(string.ppi) <- c("stringID", "stringID", "score")
hc.string.ppi <- string.ppi[string.ppi$score > 700,]
load.neo4j(.data=hc.string.ppi, edge.name="ASSOC", commit.size=10000L, unique.rels=F)
## Preprocessing...
## Missing CONSTRAINT(s) for: StringID,StringID will build them first and continue with loading...
## Loading into Neo4j...
In addition we will keep a matrix copy for use in the prioritization functionality.
hc.string.ppi$score <- hc.string.ppi$score/1000
use.graph <- graph.data.frame(hc.string.ppi, directed=FALSE)
temp.sparse <- get.adjacency(graph=use.graph, sparse=TRUE, attr="score", type="both")/2
writeMM(temp.sparse, file="9606.protein.links.v9.1.mm.mtx")
## NULL
stopifnot(all(colnames(temp.sparse) == rownames(temp.sparse)))
write.table(rownames(temp.sparse), file="9606.protein.links.v9.1.mm.names", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)
We can then load the STRING ID to Entrez ID mapping file into Neo4j.
if (file.exists("entrez_gene_id.vs.string.v9.05.28122012.txt")==F){
system("wget ftp://string-db.org/STRING/9.1/mapping_files/Entrez_mappings/entrez_gene_id.vs.string.v9.05.28122012.txt")
}
ppi.map <- read.delim("entrez_gene_id.vs.string.v9.05.28122012.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
names(ppi.map) <- c("entrezID", "stringID")
load.neo4j(.data=ppi.map, edge.name="MAPPED_TO", commit.size=10000L)
## Preprocessing...
## Missing CONSTRAINT(s) for: EntrezID will build them first and continue with loading...
## Loading into Neo4j...
Also load in gene symbol data for the entrez IDs. This will create a EntrezID -> Symbol mapping with a relationship called REFERRED_TO. This relationship will have a property called synonyms containing a list of all the synonyms for the given Entrez IDs.
if(file.exists("Homo_sapiens.gene_info.gz")==F){
system("wget ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz")
}
#header is like: '#Format: tax_id GeneID Symbol LocusTag Synonyms' so need to remove the Format: prior to using...
ent.info <- read.delim("Homo_sapiens.gene_info.gz", sep="\t", header=F, skip=1, stringsAsFactors=F)
ent.info.header <- readLines("Homo_sapiens.gene_info.gz", n=1)
ent.info.match <- regmatches(ent.info.header, regexec(":[[:space:]]+([[:alnum:][:space:]_]+)[[:space:]]+\\(", ent.info.header))[[1]][2]
use.header <- strsplit(ent.info.match, "\\s+")[[1]]
names(ent.info) <- use.header
ent.dta <- ent.info[,c("GeneID", "Symbol", "Synonyms")]
names(ent.dta) <- c("entrezID", "symbol", "synonyms")
load.neo4j(.data=ent.dta, edge.name="REFERRED_TO", commit.size=10000L, array.delim="|")
## Preprocessing...
## Missing CONSTRAINT(s) for: Symbol will build them first and continue with loading...
## Loading into Neo4j...
As it is also desired to relate pathway information to our base set of Entrez IDs we can then add in Pathway Commons pathway information, converted to Entrez IDs. Here we will keep database as a property of the Pathway node and will only keep non-NA relationships with entrez IDs
path.dta <- hwhelper:::read.pc.gmt(filename="Pathway Commons.4.All.GSEA.gmt", organism.code="9606")
## Loading required package: org.Hs.eg.db
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: GenomeInfoDb
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
##
## The following object is masked from 'package:igraph':
##
## compare
##
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
##
## The following object is masked from 'package:Matrix':
##
## expand
##
## The following object is masked from 'package:igraph':
##
## simplify
##
##
## Attaching package: 'AnnotationDbi'
##
## The following object is masked from 'package:GenomeInfoDb':
##
## species
##
## Loading required package: DBI
load.path.dta <- path.dta[,c("pathway", "entrezID", "database")]
names(load.path.dta)[3] <- "pathway.database"
load.path.dta <- load.path.dta[complete.cases(load.path.dta),]
#there are duplicated entries and uniqueness constraints so will also need to deal with it ahead of time...
#maybe deal with this as follows:
load.path.dta <- load.path.dta[!duplicated(load.path.dta),]
load.path.dta$pathway <- paste(load.path.dta$pathway, paste0("(", load.path.dta$pathway.database, ")"))
load.neo4j(.data=load.path.dta, edge.name="PATHWAY_CONTAINS", commit.size=10000L)
## Preprocessing...
## Missing CONSTRAINT(s) for: Pathway will build them first and continue with loading...
## Loading into Neo4j...
sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel methods stats graphics grDevices utils
## [8] datasets base
##
## other attached packages:
## [1] org.Hs.eg.db_3.0.0 RSQLite_1.0.0 DBI_0.3.1
## [4] AnnotationDbi_1.28.1 GenomeInfoDb_1.2.4 IRanges_2.0.1
## [7] S4Vectors_0.4.0 Matrix_1.1-5 hwhelper_0.99.12
## [10] rjson_0.2.15 igraph_0.7.1 reshape2_1.4.1
## [13] Biobase_2.26.0 BiocGenerics_0.12.1 rmarkdown_0.7
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.8 evaluate_0.5.5 formatR_1.0 grid_3.1.2
## [5] htmltools_0.2.6 knitr_1.9 lattice_0.20-29 plyr_1.8.1
## [9] Rcpp_0.11.4 stringr_0.6.2 tools_3.1.2 yaml_2.1.13