Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cache OBO file and Refactor HPO_SIM process #116

Merged
merged 1 commit into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 24 additions & 12 deletions bin/phenoSim.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@
rm(list = ls())
library(ontologyIndex)
library(ontologySimilarity)
library(data.table)
args <- commandArgs(trailingOnly = TRUE)

PATIENT_HPO <- args[1]
OMIM_HGMD <- args[2]
OMIM_OBO <- args[3]
OMIM_GENEMAP2 <- args[4]
OMIM_PHENO <- args[5]
OUTFILE_CZ_NAME <- args[6]
OUTFILE_DX_NAME <- args[7]

dat <- read.csv(OMIM_HGMD, sep = "\t")
VEP <- args[1]
PATIENT_HPO <- args[2]
CACHED_OMIM_OBO <- args[3]
OMIM_HGMD <- args[4]
OMIM_GENEMAP2 <- args[5]
OMIM_PHENO <- args[6]
OUTFILE_CZ_NAME <- args[7]
OUTFILE_DX_NAME <- args[8]

library(dplyr)
get_HPO_list <- function(df1) {
Expand All @@ -26,9 +26,22 @@ get_HPO_list <- function(df1) {
return(df2)
}

# Read the file
lines <- readLines(VEP)

for (line in lines) {
if (startsWith(line, "#Uploaded_variation")) {
break
}
}
vep = read.table(VEP, sep="\t", fill=T)
colnames(vep) = strsplit(line, "\t")[[1]]

dat <- fread(OMIM_HGMD, sep = "\t")
dat <- dat %>% filter(acc_num %in% unique(vep$hgmd) | gene_sym %in% unique(vep$SYMBOL))

# Load HPO_obo
HPO_obo <- get_OBO(OMIM_OBO, propagate_relationships = c("is_a", "part_of"), extract_tags = "minimal")
HPO_obo <- readRDS(CACHED_OMIM_OBO)

# set simi_thresh
simi_thresh <- 0
Expand All @@ -41,8 +54,7 @@ if (dim(dat)[1] == 0) {
colnames(dat2) <- col_names
} else {
dat <- dat[!is.na(dat$hpo_id), ]
dat2_ori <- get_HPO_list(dat)
dat2 <- dat2_ori
dat2 <- get_HPO_list(dat)

# Load patient HPO
HPO.orig <- read.table(PATIENT_HPO, sep = "\t", fill = T, header = F, stringsAsFactors = F)
Expand Down
83 changes: 70 additions & 13 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -447,14 +447,54 @@ process GENESYM_TO_PHRANK {
"""
}

process CACHE_OMIM_OBO {
container 'zhandongliulab/aim-lite-r'
storeDir "${params.outdir}/general/hp_obo/"
input:
path obo

output:
path "hp_obo.rds", emit: obo

script:
"""
#!/usr/bin/env Rscript
library(dplyr)
library(ontologyIndex)

hp_obo <- get_OBO($obo, propagate_relationships = c("is_a", "part_of"), extract_tags = "minimal")
saveRDS(hp_obo, file="hp_obo.rds")
"""
}

process HPO_SIM {
process NORMALIZE_HPO {
tag "${vep.simpleName}"

input:
path vep
path hpo

output:
path "normalized_hpos.txt", emit: "hpo"

script:
"""
if [[ -z \$(egrep 'HP:[0-9]{7}' $hpo) ]] ; then
echo "HP:0000001" > normalized_hpos.txt
else
cp $hpo normalized_hpos.txt
fi
"""
}
process HPO_SIM_BY_HGMD_OMIM {
container 'zhandongliulab/aim-lite-r'
tag "${vep.simpleName}"

input:
path vep
path hpo
path cached_omim_obo
path omim_hgmd_phen
path omim_obo
path omim_genemap2
path omim_pheno

Expand All @@ -464,11 +504,7 @@ process HPO_SIM {

script:
"""
if [[ -z \$(egrep 'HP:[0-9]{7}' $hpo) ]] ; then
echo "HP:0000001" > $hpo
fi

phenoSim.R $hpo $omim_hgmd_phen $omim_obo $omim_genemap2 $omim_pheno \\
phenoSim.R $vep $hpo $cached_omim_obo $omim_hgmd_phen $omim_genemap2 $omim_pheno \\
${params.run_id}-cz ${params.run_id}-dx
"""

Expand Down Expand Up @@ -749,6 +785,28 @@ workflow VCF_PRE_PROCESS {
vcf = FILTER_PROBAND.out.vcf
}

workflow HPO_SIM {
take:
vep
hpo

main:
CACHE_OMIM_OBO(params.omim_obo)
NORMALIZE_HPO(vep, hpo)
HPO_SIM_BY_HGMD_OMIM(
vep,
NORMALIZE_HPO.out.hpo,
CACHE_OMIM_OBO.out.obo,
params.omim_hgmd_phen,
params.omim_genemap2,
params.omim_pheno,
)

emit:
hgmd_sim = HPO_SIM_BY_HGMD_OMIM.out.hgmd_sim
omim_sim = HPO_SIM_BY_HGMD_OMIM.out.omim_sim
}

workflow PHRANK_SCORING {
take:
vcf
Expand Down Expand Up @@ -809,13 +867,12 @@ workflow {
file(params.vep_idx)
)

HPO_SIM(params.input_hpo,
params.omim_hgmd_phen,
params.omim_obo,
params.omim_genemap2,
params.omim_pheno)
HPO_SIM(
ANNOTATE_BY_VEP.out.vep_output,
file(params.input_hpo),
)

ANNOTATE_BY_MODULES (
ANNOTATE_BY_MODULES(
ANNOTATE_BY_VEP.out.vep_output,
HPO_SIM.out.hgmd_sim,
HPO_SIM.out.omim_sim,
Expand Down
Loading