Skip to content

database_creation

Dan Bottomly edited this page Jul 31, 2015 · 6 revisions

First we retrieve the STRING protein-protein interaction data (http://string-db.org/), perform an initial thresholding to only keep the high confidence data and load it into Neo4j. We first need to make sure that the neo4j-shell command can reach the appropriate Neo4j database and that the latest version of hwhelper is installed.

    suppressPackageStartupMessages(library(hwhelper))
    library(igraph)
    library(Matrix)

    if (file.exists("9606.protein.links.v9.1.txt.gz") == F){
      system("wget http://string-db.org/newstring_download/protein.links.v9.1/9606.protein.links.v9.1.txt.gz")
    }
    
    string.ppi <- read.delim("9606.protein.links.v9.1.txt.gz", sep="", stringsAsFactors=F)
    
    names(string.ppi) <- c("stringID", "stringID", "score")
    
    hc.string.ppi <- string.ppi[string.ppi$score > 700,]
    
    load.neo4j(.data=hc.string.ppi, edge.name="ASSOC", commit.size=10000L, unique.rels=F)
## Preprocessing...
## Missing CONSTRAINT(s) for: StringID,StringID will build them first and continue with loading...
## Loading into Neo4j...

In addition we will keep a matrix copy for use in the prioritization functionality.

    hc.string.ppi$score <- hc.string.ppi$score/1000
    
    use.graph <- graph.data.frame(hc.string.ppi, directed=FALSE)
    
    temp.sparse <- get.adjacency(graph=use.graph, sparse=TRUE, attr="score", type="both")/2
    
    writeMM(temp.sparse, file="9606.protein.links.v9.1.mm.mtx")
## NULL
    stopifnot(all(colnames(temp.sparse) == rownames(temp.sparse)))
    
    write.table(rownames(temp.sparse), file="9606.protein.links.v9.1.mm.names", sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)

We can then load the STRING ID to Entrez ID mapping file into Neo4j.

    if (file.exists("entrez_gene_id.vs.string.v9.05.28122012.txt")==F){
      system("wget ftp://string-db.org/STRING/9.1/mapping_files/Entrez_mappings/entrez_gene_id.vs.string.v9.05.28122012.txt")
    }
    
    ppi.map <- read.delim("entrez_gene_id.vs.string.v9.05.28122012.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    names(ppi.map) <- c("entrezID", "stringID")
    
    load.neo4j(.data=ppi.map, edge.name="MAPPED_TO", commit.size=10000L)
## Preprocessing...
## Missing CONSTRAINT(s) for: EntrezID will build them first and continue with loading...
## Loading into Neo4j...

Also load in gene symbol data for the entrez IDs. This will create a EntrezID -> Symbol mapping with a relationship called REFERRED_TO. This relationship will have a property called synonyms containing a list of all the synonyms for the given Entrez IDs.

    if(file.exists("Homo_sapiens.gene_info.gz")==F){
      system("wget ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz")
    }
    
    #header is like: '#Format: tax_id GeneID Symbol LocusTag Synonyms' so need to remove the Format: prior to using...
    
    ent.info <- read.delim("Homo_sapiens.gene_info.gz", sep="\t", header=F, skip=1, stringsAsFactors=F)
    ent.info.header <- readLines("Homo_sapiens.gene_info.gz", n=1)
    
    ent.info.match <- regmatches(ent.info.header, regexec(":[[:space:]]+([[:alnum:][:space:]_]+)[[:space:]]+\\(", ent.info.header))[[1]][2]
    use.header <- strsplit(ent.info.match, "\\s+")[[1]]
    
    names(ent.info) <- use.header
    
    ent.dta <- ent.info[,c("GeneID", "Symbol", "Synonyms")]
    names(ent.dta) <- c("entrezID", "symbol", "synonyms")
    
    load.neo4j(.data=ent.dta, edge.name="REFERRED_TO", commit.size=10000L, array.delim="|")
## Preprocessing...
## Missing CONSTRAINT(s) for: Symbol will build them first and continue with loading...
## Loading into Neo4j...

As it is also desired to relate pathway information to our base set of Entrez IDs we can then add in Pathway Commons pathway information, converted to Entrez IDs. Here we will keep database as a property of the Pathway node and will only keep non-NA relationships with entrez IDs

    path.dta <- hwhelper:::read.pc.gmt(filename="Pathway Commons.4.All.GSEA.gmt", organism.code="9606")
## Loading required package: org.Hs.eg.db
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: GenomeInfoDb
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## 
## The following object is masked from 'package:igraph':
## 
##     compare
## 
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:Matrix':
## 
##     expand
## 
## The following object is masked from 'package:igraph':
## 
##     simplify
## 
## 
## Attaching package: 'AnnotationDbi'
## 
## The following object is masked from 'package:GenomeInfoDb':
## 
##     species
## 
## Loading required package: DBI
    load.path.dta <- path.dta[,c("pathway", "entrezID", "database")]
    names(load.path.dta)[3] <- "pathway.database"
    load.path.dta <- load.path.dta[complete.cases(load.path.dta),]
    
    #there are duplicated entries and uniqueness constraints so will also need to deal with it ahead of time...
    
    #maybe deal with this as follows:
    
    load.path.dta <- load.path.dta[!duplicated(load.path.dta),]
    
    load.path.dta$pathway <- paste(load.path.dta$pathway, paste0("(", load.path.dta$pathway.database, ")"))
    
    load.neo4j(.data=load.path.dta, edge.name="PATHWAY_CONTAINS", commit.size=10000L)
## Preprocessing...
## Missing CONSTRAINT(s) for: Pathway will build them first and continue with loading...
## Loading into Neo4j...
    sessionInfo()
## R version 3.1.2 (2014-10-31)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  methods   stats     graphics  grDevices utils    
## [8] datasets  base     
## 
## other attached packages:
##  [1] org.Hs.eg.db_3.0.0   RSQLite_1.0.0        DBI_0.3.1           
##  [4] AnnotationDbi_1.28.1 GenomeInfoDb_1.2.4   IRanges_2.0.1       
##  [7] S4Vectors_0.4.0      Matrix_1.1-5         hwhelper_0.99.12    
## [10] rjson_0.2.15         igraph_0.7.1         reshape2_1.4.1      
## [13] Biobase_2.26.0       BiocGenerics_0.12.1  rmarkdown_0.7       
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.8    evaluate_0.5.5  formatR_1.0     grid_3.1.2     
##  [5] htmltools_0.2.6 knitr_1.9       lattice_0.20-29 plyr_1.8.1     
##  [9] Rcpp_0.11.4     stringr_0.6.2   tools_3.1.2     yaml_2.1.13
Clone this wiki locally