Add --input_variant parameter for handling single variant

LiuzLab · Dec 18, 2024 · 4fb7f87 · 4fb7f87
1 parent 5e37e29
commit 4fb7f87
Show file tree

Hide file tree

Showing 8 changed files with 139 additions and 71 deletions.
diff --git a/bin/add_c_nc.py b/bin/add_c_nc.py
@@ -10,13 +10,13 @@ def add_c_nc(score, ref):
     if ref == "hg38":
         clin_c = pd.read_csv("merge_expand/hg38/clin_c.tsv.gz", sep="\t")
         clin_nc = pd.read_csv("merge_expand/hg38/clin_nc.tsv.gz", sep="\t")
-        hgmd_nc = pd.read_csv("merge_expand/hg38/hgmd_nc.tsv.gz", sep="\t")
         hgmd_c = pd.read_csv("merge_expand/hg38/hgmd_c.tsv.gz", sep="\t")
+        hgmd_nc = pd.read_csv("merge_expand/hg38/hgmd_nc.tsv.gz", sep="\t")
     else:
         clin_c = pd.read_csv("merge_expand/hg19/clin_c.tsv.gz", sep="\t")
         clin_nc = pd.read_csv("merge_expand/hg19/clin_nc.tsv.gz", sep="\t")
-        hgmd_nc = pd.read_csv("merge_expand/hg19/hgmd_nc.tsv.gz", sep="\t")
         hgmd_c = pd.read_csv("merge_expand/hg19/hgmd_c.tsv.gz", sep="\t")
+        hgmd_nc = pd.read_csv("merge_expand/hg19/hgmd_nc.tsv.gz", sep="\t")
 
     a = score.pos.values
     ac = score.chrom.values
@@ -28,7 +28,7 @@ def add_c_nc(score, ref):
 
     i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh) & (ac[:, None] == bc))
 
-    cln = pd.concat(
+    clin = pd.concat(
         [
             temp.loc[i, :].reset_index(drop=True),
             clin_nc.loc[j, :].reset_index(drop=True),
@@ -38,7 +38,7 @@ def add_c_nc(score, ref):
 
     # Take into account When HGMD data base is empty
     if hgmd_nc.shape[0] == 0:
-        pass
+        hgmd = hgmd_nc
 
     else:
         # merge by region
@@ -58,29 +58,28 @@ def add_c_nc(score, ref):
             axis=1,
         )
 
-    merged = score.merge(cln, how="left", on="varId")
-    if hgmd_nc.shape[0] == 0:
-        merged["nc_HGMD_Exp"] = np.NaN
-        merged["nc_RANKSCORE"] = np.NaN
-    else:
-        merged = merged.merge(hgmd, how="left", on="varId")
+    merged = score.merge(
+        clin_c.rename(columns={'new_chr': 'chrom', 'new_pos': 'pos'}),
+        how="left",
+        on=["chrom", "pos", "ref", "alt"],
+    )
+    merged = merged.merge(clin, how="left", on="varId")
 
     if hgmd_c.shape[0] == 0:
         merged["c_HGMD_Exp"] = np.NaN
         merged["c_RANKSCORE"] = np.NaN
         merged["CLASS"] = np.NaN
     else:
         merged = merged.merge(
-            hgmd_c,
+            hgmd_c.rename(columns={'new_chr': 'chrom', 'new_pos': 'pos'}),
             how="left",
-            left_on=["chrom", "pos", "ref", "alt"],
-            right_on=["new_chr", "new_pos", "ref", "alt"],
+            on=["chrom", "pos", "ref", "alt"],
         )
-    merged = merged.merge(
-        clin_c,
-        how="left",
-        left_on=["chrom", "pos", "ref", "alt"],
-        right_on=["new_chr", "new_pos", "ref", "alt"],
-    )
+
+    if hgmd_nc.shape[0] == 0:
+        merged["nc_HGMD_Exp"] = np.NaN
+        merged["nc_RANKSCORE"] = np.NaN
+    else:
+        merged = merged.merge(hgmd, how="left", on="varId")
 
     return merged
diff --git a/bin/extraModel/generate_bivar_data.py b/bin/extraModel/generate_bivar_data.py
@@ -68,7 +68,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
         ]
 
     # Remove all the duplicated variant pairs.
-    gene_var_pairs_df = pd.DataFrame(gene_var_pairs)
+    gene_var_pairs_df = pd.DataFrame(gene_var_pairs, columns=['geneEnsId', 'varId1', 'varId2'])
     # gene_var_pairs_df = gene_var_pairs_df.drop_duplicates(['varId1', 'varId2'])
 
     # Use only subset columns of features
@@ -130,4 +130,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
     # Sort before saving
     recessive_feature_df = recessive_feature_df.sort_index()
 
+    if recessive_feature_df.shape[0] == 0:
+        return
+
     recessive_feature_df.to_csv(f"{recessive_folder}/{sample_id}.csv")
diff --git a/bin/feature.py b/bin/feature.py
@@ -258,7 +258,7 @@ def main():
         print("input annoatated varFile:", args.varFile)
         t1 = time.time()
         varDf = pd.read_csv(
-            args.varFile, sep="\t", skiprows=numHeaderSkip, error_bad_lines=False
+            args.varFile, sep="\t", skiprows=numHeaderSkip,
         )
         # varDf=varDf[0:10]
         # #do this if need to have a small test

diff --git a/bin/generate_input_vcf.py b/bin/generate_input_vcf.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3.8
+
+import string
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument("variant", type=str)
+
+args = parser.parse_args()
+
+vcf_template = string.Template("""
+##fileformat=VCFv4.2
+##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
+##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
+##FILTER=<ID=PASS,Description="All filters passed">
+##reference=file:///staging/human/reference/b37/b37.fa.default/reference.bin
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE
+$chrom	$pos	.	$ref	$alt	.	.	.	GT:AD:DP:GQ:PL	0/1:7,5:12:99:142,0,214
+""".strip())
+
+def main(variant):
+    chrom, pos, ref, alt = variant.split("-")
+    with open("input.vcf", "w", encoding="ascii") as text_file:
+        text_file.write(vcf_template.substitute(
+            chrom=chrom,
+            pos=pos,
+            ref=ref,
+            alt=alt,
+        ))
+
+if __name__ == "__main__":
+    main(**vars(args))
diff --git a/docs/source/nf_usage.txt b/docs/source/nf_usage.txt
@@ -18,6 +18,7 @@ Args:
   --outdir                  Output directory.
   --run_id                  Unique identifier for this run. (default: 1)
   --ref_ver                 Reference genome version [hg38 or hg19] (default: hg19)
+  --input_variant           Variant ID Formatted in 'chr-pos-alt-ref' (optional)
   --bed_filter              Path to a BED file to perform an analysis only for regions of interest (optional)
   --exome_filter            Enable exonic filter. Addition will filter out variants outside of exonic region  (default: false)
 

diff --git a/main.nf b/main.nf
@@ -27,53 +27,67 @@ def showVersion() {
 
 def validateInputParams() {
     def checkPathParamMap = [
-        "input_vcf": params.input_vcf,
         "input_hpo": params.input_hpo,
         "ref_dir"  : params.ref_dir,
         "ref_ver"  : params.ref_ver,
     ]
 
     checkPathParamMap.each { paramName, paramValue ->
-        if (paramValue) {
-            // Check if the file exists
-            if(!(paramName == "ref_ver")) {
-                def fileObj = file(paramValue, checkIfExists: true)
-                //  println("Check file: '--${paramName}' value '${paramValue}' ")
-
-                // Check the file type based on the parameter name
-                if (paramName == "input_vcf" && !(paramValue.endsWith(".vcf") || paramValue.endsWith(".vcf.gz"))) {
-                    println("Error: '--${paramName}' value '${paramValue}' should be a VCF file (.vcf) or (.vcf.gz)")
-                    println("To see usage and available parameters run `nextflow run main.nf --help`")
-                    exit 1
-                } else if (paramName == "input_hpo" && !(paramValue.endsWith(".hpo") || paramValue.endsWith(".txt"))) {
-                    println("Error: '--${paramName}' value '${paramValue}' should be an HPO file (.hpo) or (.txt)")
-                    println("To see usage and available parameters run `nextflow run main.nf --help`")
-                    exit 1
-                } else if (paramName == "ref_dir" && !fileObj.isDirectory()) {
-                    println("Error: '--${paramName}' value '${paramValue}' should be an directory.")
-                    println("To see usage and available parameters run `nextflow run main.nf --help`")
-                    exit 1
-                }
-            }
-
-            if (paramName == "ref_ver" && !(paramValue.equals("hg19") || paramValue.equals("hg38")) ) { 
-                println("Error: '--${paramName}' value ${paramValue} should be either set to 'hg19' or 'hg38'.")
-                println("To see usage and available parameters run `nextflow run main.nf --help`")
-                exit 1
-            }
+        if (paramValue)
+            return
 
-        } else {
-            println("Input parameter '${paramName}' not specified or is null!")
-            println("To see usage and available parameters run `nextflow run main.nf --help`")
-            exit 1
-        }
+        println("Input parameter '${paramName}' not specified or is null!")
+        println("To see usage and available parameters run `nextflow run main.nf --help`")
+        exit 1
+    }
+
+    if (params.input_variant && params.input_vcf) {
+        println("Error: Cannot use '--input_variant' with --input_vcf'")
+        println("To see usage and available parameters run `nextflow run main.nf --help`")
+        exit 1
+    }
+
+    if (params.input_vcf && !params.input_vcf.endsWith(".vcf") && !params.input_vcf.endsWith(".vcf.gz")) {
+        println("Error: '--input_vcf' value '${params.input_vcf}' should be a VCF file (.vcf) or (.vcf.gz)")
+        println("To see usage and available parameters run `nextflow run main.nf --help`")
+        exit 1
+    }
+
+    if (params.input_variant && params.input_variant.count('-') != 3) {
+        println("Error: '--input_variant' should be formated like 'X-47038564-T-C'")
+        println("To see usage and available parameters run `nextflow run main.nf --help`")
+        exit 1
+    }
+
+    if (!file(params.ref_dir).isDirectory()) {
+        println("Error: '--ref_dir' should be an directory.")
+        println("To see usage and available parameters run `nextflow run main.nf --help`")
+        exit 1
+    }
+
+    if (!params.ref_ver.equals("hg19") && !params.ref_ver.equals("hg38") ) {
+        println("Error: '--ref_ver' value ${params.ref_ver} should be either set to 'hg19' or 'hg38'.")
+        println("To see usage and available parameters run `nextflow run main.nf --help`")
+        exit 1
     }
 }
 
 showUsage()
 showVersion()
 validateInputParams()
 
+process GENERATE_INPUT_VCF {
+    input:
+    val id
+
+    output:
+    path "input.vcf", emit: vcf
+
+    """
+    generate_input_vcf.py $id
+    """
+}
+
 process VALIDATE_VCF {
     container 'quay.io/biocontainers/vcftools:0.1.16--pl5321hdcf5f25_11'
 
@@ -401,7 +415,7 @@ process FILTER_PROBAND {
     path ref_gnomad_exome_idx
 
     output:
-    path "${params.run_id}.filt.rmBL.vcf", emit: vcf
+    path "${params.run_id}.filt.rmBL.vcf.gz", emit: vcf
 
     script:
     """
@@ -420,6 +434,7 @@ process FILTER_PROBAND {
     isec_tmp1/0000.vcf.gz isec_tmp2/0000.vcf.gz
 
     mv isec_tmp3/0002.vcf ${params.run_id}.filt.rmBL.vcf
+    bgzip ${params.run_id}.filt.rmBL.vcf
     """
 
 }
@@ -435,13 +450,12 @@ process SPLIT_VCF_BY_CHROMOSOME {
     """
     # Get the list of chromosomes from the VCF file
 
-    bgzip ${vcf}
-    bcftools index ${vcf}.gz
+    bcftools index ${vcf}
 
-    bcftools query -f '%CHROM\n' ${vcf}.gz | sort | uniq > chrom_list.txt
+    bcftools query -f '%CHROM\n' ${vcf} | sort | uniq > chrom_list.txt
     # Split the VCF file by chromosome
     while read chrom; do
-        bcftools view -r \${chrom} ${vcf}.gz -Oz -o chr\${chrom}.vcf.gz
+        bcftools view -r \${chrom} ${vcf} -Oz -o chr\${chrom}.vcf.gz
     done < chrom_list.txt
     """
 }
@@ -673,7 +687,7 @@ workflow PHRANK_SCORING {
     VARIANTS_TO_ENSEMBL(VCF_TO_VARIANTS.out, params.ref_loc)
     ENSEMBL_TO_GENESYM(VARIANTS_TO_ENSEMBL.out, params.ref_to_sym, params.ref_sorted_sym)
     GENESYM_TO_PHRANK(ENSEMBL_TO_GENESYM.out,
-                    params.input_hpo,
+                    file(params.input_hpo),
                     params.phrank_dagfile,
                     params.phrank_disease_annotation,
                     params.phrank_gene_annotation,
@@ -684,19 +698,31 @@ workflow PHRANK_SCORING {
 }
 
 workflow {
-    VALIDATE_VCF(params.input_vcf)
+    if (params.input_vcf) {
+        input_vcf = file(params.input_vcf)
+    } else if (params.input_variant) {
+        GENERATE_INPUT_VCF(params.input_variant)
+        input_vcf = GENERATE_INPUT_VCF.out.vcf
+    }
+
+    VALIDATE_VCF(input_vcf)
     NORMALIZE_VCF(VALIDATE_VCF.out.vcf)
-    BUILD_REFERENCE_INDEX()
 
-    VCF_PRE_PROCESS(
-        NORMALIZE_VCF.out.vcf,
-        NORMALIZE_VCF.out.tbi,
-        BUILD_REFERENCE_INDEX.out.fasta,
-        BUILD_REFERENCE_INDEX.out.fasta_index,
-        BUILD_REFERENCE_INDEX.out.fasta_dict,
-    )
+    if (params.input_vcf) {
+        BUILD_REFERENCE_INDEX()
+        VCF_PRE_PROCESS(
+            NORMALIZE_VCF.out.vcf,
+            NORMALIZE_VCF.out.tbi,
+            BUILD_REFERENCE_INDEX.out.fasta,
+            BUILD_REFERENCE_INDEX.out.fasta_index,
+            BUILD_REFERENCE_INDEX.out.fasta_dict,
+        )
+        vcf = VCF_PRE_PROCESS.out.vcf
+    } else {
+        vcf = NORMALIZE_VCF.out.vcf
+    }
 
-    SPLIT_VCF_BY_CHROMOSOME(VCF_PRE_PROCESS.out.vcf)
+    SPLIT_VCF_BY_CHROMOSOME(vcf)
     ANNOTATE_BY_VEP(
         SPLIT_VCF_BY_CHROMOSOME.out.chr_vcfs.flatten(),
         params.vep_dir_cache,

diff --git a/nextflow.config b/nextflow.config
@@ -1,4 +1,7 @@
 params {
+    input_vcf = null
+    input_variant = null
+
     run_id = 1
     ref_ver = "hg19"
     ref_dir = " "
@@ -48,7 +51,7 @@ params {
 
     // VEP 
     vep_dbnsfp_name = params.ref_ver == "hg19" ? "dbNSFP4.3a_grch37.gz" : "dbNSFP4.1a_grch38.gz"
-    vep_gnomad_name = params.ref_ver == "hg19" ? "gnomad.genomes.r2.1.sites.grch37_noVEP.vcf.gz" : "gnomad.genomes.GRCh38.v3.1.2.sites.vcf.gz"
+    vep_gnomad_name = params.ref_ver == "hg19" ? "gnomad.genomes.r2.1.sites.grch37_noVEP.vcf.gz" : "gnomad.genomes.GRCh38.v3.1.2.sites.vcf.bgz"
     vep_cadd_name = params.ref_ver == "hg19" ? "hg19_whole_genome_SNVs.tsv.gz" : "hg38_whole_genome_SNV.tsv.gz"
 
     vep_dir_cache = "${ref_dir}/vep/${ref_ver}/"

diff --git a/test.txt b/test.txt