diff --git a/bin/phenoSim.R b/bin/phenoSim.R index d284229..aa0a030 100755 --- a/bin/phenoSim.R +++ b/bin/phenoSim.R @@ -2,17 +2,17 @@ rm(list = ls()) library(ontologyIndex) library(ontologySimilarity) +library(data.table) args <- commandArgs(trailingOnly = TRUE) -PATIENT_HPO <- args[1] -OMIM_HGMD <- args[2] -OMIM_OBO <- args[3] -OMIM_GENEMAP2 <- args[4] -OMIM_PHENO <- args[5] -OUTFILE_CZ_NAME <- args[6] -OUTFILE_DX_NAME <- args[7] - -dat <- read.csv(OMIM_HGMD, sep = "\t") +VEP <- args[1] +PATIENT_HPO <- args[2] +CACHED_OMIM_OBO <- args[3] +OMIM_HGMD <- args[4] +OMIM_GENEMAP2 <- args[5] +OMIM_PHENO <- args[6] +OUTFILE_CZ_NAME <- args[7] +OUTFILE_DX_NAME <- args[8] library(dplyr) get_HPO_list <- function(df1) { @@ -26,9 +26,22 @@ get_HPO_list <- function(df1) { return(df2) } +# Read the file +lines <- readLines(VEP) + +for (line in lines) { + if (startsWith(line, "#Uploaded_variation")) { + break + } +} +vep = read.table(VEP, sep="\t", fill=T) +colnames(vep) = strsplit(line, "\t")[[1]] + +dat <- fread(OMIM_HGMD, sep = "\t") +dat <- dat %>% filter(acc_num %in% unique(vep$hgmd) | gene_sym %in% unique(vep$SYMBOL)) # Load HPO_obo -HPO_obo <- get_OBO(OMIM_OBO, propagate_relationships = c("is_a", "part_of"), extract_tags = "minimal") +HPO_obo <- readRDS(CACHED_OMIM_OBO) # set simi_thresh simi_thresh <- 0 @@ -41,8 +54,7 @@ if (dim(dat)[1] == 0) { colnames(dat2) <- col_names } else { dat <- dat[!is.na(dat$hpo_id), ] - dat2_ori <- get_HPO_list(dat) - dat2 <- dat2_ori + dat2 <- get_HPO_list(dat) # Load patient HPO HPO.orig <- read.table(PATIENT_HPO, sep = "\t", fill = T, header = F, stringsAsFactors = F) diff --git a/main.nf b/main.nf index a90a6f5..de217f3 100644 --- a/main.nf +++ b/main.nf @@ -447,14 +447,54 @@ process GENESYM_TO_PHRANK { """ } +process CACHE_OMIM_OBO { + container 'zhandongliulab/aim-lite-r' + storeDir "${params.outdir}/general/hp_obo/" + input: + path obo + + output: + path "hp_obo.rds", emit: obo + + script: + """ + #!/usr/bin/env Rscript + library(dplyr) + library(ontologyIndex) + + hp_obo <- get_OBO($obo, propagate_relationships = c("is_a", "part_of"), extract_tags = "minimal") + saveRDS(hp_obo, file="hp_obo.rds") + """ +} -process HPO_SIM { +process NORMALIZE_HPO { + tag "${vep.simpleName}" + + input: + path vep + path hpo + + output: + path "normalized_hpos.txt", emit: "hpo" + + script: + """ + if [[ -z \$(egrep 'HP:[0-9]{7}' $hpo) ]] ; then + echo "HP:0000001" > normalized_hpos.txt + else + cp $hpo normalized_hpos.txt + fi + """ +} +process HPO_SIM_BY_HGMD_OMIM { container 'zhandongliulab/aim-lite-r' + tag "${vep.simpleName}" input: + path vep path hpo + path cached_omim_obo path omim_hgmd_phen - path omim_obo path omim_genemap2 path omim_pheno @@ -464,11 +504,7 @@ process HPO_SIM { script: """ - if [[ -z \$(egrep 'HP:[0-9]{7}' $hpo) ]] ; then - echo "HP:0000001" > $hpo - fi - - phenoSim.R $hpo $omim_hgmd_phen $omim_obo $omim_genemap2 $omim_pheno \\ + phenoSim.R $vep $hpo $cached_omim_obo $omim_hgmd_phen $omim_genemap2 $omim_pheno \\ ${params.run_id}-cz ${params.run_id}-dx """ @@ -749,6 +785,28 @@ workflow VCF_PRE_PROCESS { vcf = FILTER_PROBAND.out.vcf } +workflow HPO_SIM { + take: + vep + hpo + + main: + CACHE_OMIM_OBO(params.omim_obo) + NORMALIZE_HPO(vep, hpo) + HPO_SIM_BY_HGMD_OMIM( + vep, + NORMALIZE_HPO.out.hpo, + CACHE_OMIM_OBO.out.obo, + params.omim_hgmd_phen, + params.omim_genemap2, + params.omim_pheno, + ) + + emit: + hgmd_sim = HPO_SIM_BY_HGMD_OMIM.out.hgmd_sim + omim_sim = HPO_SIM_BY_HGMD_OMIM.out.omim_sim +} + workflow PHRANK_SCORING { take: vcf @@ -809,13 +867,12 @@ workflow { file(params.vep_idx) ) - HPO_SIM(params.input_hpo, - params.omim_hgmd_phen, - params.omim_obo, - params.omim_genemap2, - params.omim_pheno) + HPO_SIM( + ANNOTATE_BY_VEP.out.vep_output, + file(params.input_hpo), + ) - ANNOTATE_BY_MODULES ( + ANNOTATE_BY_MODULES( ANNOTATE_BY_VEP.out.vep_output, HPO_SIM.out.hgmd_sim, HPO_SIM.out.omim_sim,