Skip to content

Commit

Permalink
Add --input_variant parameter for handling single variant
Browse files Browse the repository at this point in the history
  • Loading branch information
jylee-bcm committed Dec 18, 2024
1 parent 5e37e29 commit 4fb7f87
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 71 deletions.
37 changes: 18 additions & 19 deletions bin/add_c_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ def add_c_nc(score, ref):
if ref == "hg38":
clin_c = pd.read_csv("merge_expand/hg38/clin_c.tsv.gz", sep="\t")
clin_nc = pd.read_csv("merge_expand/hg38/clin_nc.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg38/hgmd_nc.tsv.gz", sep="\t")
hgmd_c = pd.read_csv("merge_expand/hg38/hgmd_c.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg38/hgmd_nc.tsv.gz", sep="\t")
else:
clin_c = pd.read_csv("merge_expand/hg19/clin_c.tsv.gz", sep="\t")
clin_nc = pd.read_csv("merge_expand/hg19/clin_nc.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg19/hgmd_nc.tsv.gz", sep="\t")
hgmd_c = pd.read_csv("merge_expand/hg19/hgmd_c.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg19/hgmd_nc.tsv.gz", sep="\t")

a = score.pos.values
ac = score.chrom.values
Expand All @@ -28,7 +28,7 @@ def add_c_nc(score, ref):

i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh) & (ac[:, None] == bc))

cln = pd.concat(
clin = pd.concat(
[
temp.loc[i, :].reset_index(drop=True),
clin_nc.loc[j, :].reset_index(drop=True),
Expand All @@ -38,7 +38,7 @@ def add_c_nc(score, ref):

# Take into account When HGMD data base is empty
if hgmd_nc.shape[0] == 0:
pass
hgmd = hgmd_nc

else:
# merge by region
Expand All @@ -58,29 +58,28 @@ def add_c_nc(score, ref):
axis=1,
)

merged = score.merge(cln, how="left", on="varId")
if hgmd_nc.shape[0] == 0:
merged["nc_HGMD_Exp"] = np.NaN
merged["nc_RANKSCORE"] = np.NaN
else:
merged = merged.merge(hgmd, how="left", on="varId")
merged = score.merge(
clin_c.rename(columns={'new_chr': 'chrom', 'new_pos': 'pos'}),
how="left",
on=["chrom", "pos", "ref", "alt"],
)
merged = merged.merge(clin, how="left", on="varId")

if hgmd_c.shape[0] == 0:
merged["c_HGMD_Exp"] = np.NaN
merged["c_RANKSCORE"] = np.NaN
merged["CLASS"] = np.NaN
else:
merged = merged.merge(
hgmd_c,
hgmd_c.rename(columns={'new_chr': 'chrom', 'new_pos': 'pos'}),
how="left",
left_on=["chrom", "pos", "ref", "alt"],
right_on=["new_chr", "new_pos", "ref", "alt"],
on=["chrom", "pos", "ref", "alt"],
)
merged = merged.merge(
clin_c,
how="left",
left_on=["chrom", "pos", "ref", "alt"],
right_on=["new_chr", "new_pos", "ref", "alt"],
)

if hgmd_nc.shape[0] == 0:
merged["nc_HGMD_Exp"] = np.NaN
merged["nc_RANKSCORE"] = np.NaN
else:
merged = merged.merge(hgmd, how="left", on="varId")

return merged
5 changes: 4 additions & 1 deletion bin/extraModel/generate_bivar_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
]

# Remove all the duplicated variant pairs.
gene_var_pairs_df = pd.DataFrame(gene_var_pairs)
gene_var_pairs_df = pd.DataFrame(gene_var_pairs, columns=['geneEnsId', 'varId1', 'varId2'])
# gene_var_pairs_df = gene_var_pairs_df.drop_duplicates(['varId1', 'varId2'])

# Use only subset columns of features
Expand Down Expand Up @@ -130,4 +130,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
# Sort before saving
recessive_feature_df = recessive_feature_df.sort_index()

if recessive_feature_df.shape[0] == 0:
return

recessive_feature_df.to_csv(f"{recessive_folder}/{sample_id}.csv")
2 changes: 1 addition & 1 deletion bin/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def main():
print("input annoatated varFile:", args.varFile)
t1 = time.time()
varDf = pd.read_csv(
args.varFile, sep="\t", skiprows=numHeaderSkip, error_bad_lines=False
args.varFile, sep="\t", skiprows=numHeaderSkip,
)
# varDf=varDf[0:10]
# #do this if need to have a small test
Expand Down
36 changes: 36 additions & 0 deletions bin/generate_input_vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env python3.8

import string
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("variant", type=str)

args = parser.parse_args()

vcf_template = string.Template("""
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##FILTER=<ID=PASS,Description="All filters passed">
##reference=file:///staging/human/reference/b37/b37.fa.default/reference.bin
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
$chrom $pos . $ref $alt . . . GT:AD:DP:GQ:PL 0/1:7,5:12:99:142,0,214
""".strip())

def main(variant):
chrom, pos, ref, alt = variant.split("-")
with open("input.vcf", "w", encoding="ascii") as text_file:
text_file.write(vcf_template.substitute(
chrom=chrom,
pos=pos,
ref=ref,
alt=alt,
))

if __name__ == "__main__":
main(**vars(args))
1 change: 1 addition & 0 deletions docs/source/nf_usage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Args:
--outdir Output directory.
--run_id Unique identifier for this run. (default: 1)
--ref_ver Reference genome version [hg38 or hg19] (default: hg19)
--input_variant Variant ID Formatted in 'chr-pos-alt-ref' (optional)
--bed_filter Path to a BED file to perform an analysis only for regions of interest (optional)
--exome_filter Enable exonic filter. Addition will filter out variants outside of exonic region (default: false)

Expand Down
124 changes: 75 additions & 49 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -27,53 +27,67 @@ def showVersion() {

def validateInputParams() {
def checkPathParamMap = [
"input_vcf": params.input_vcf,
"input_hpo": params.input_hpo,
"ref_dir" : params.ref_dir,
"ref_ver" : params.ref_ver,
]

checkPathParamMap.each { paramName, paramValue ->
if (paramValue) {
// Check if the file exists
if(!(paramName == "ref_ver")) {
def fileObj = file(paramValue, checkIfExists: true)
// println("Check file: '--${paramName}' value '${paramValue}' ")

// Check the file type based on the parameter name
if (paramName == "input_vcf" && !(paramValue.endsWith(".vcf") || paramValue.endsWith(".vcf.gz"))) {
println("Error: '--${paramName}' value '${paramValue}' should be a VCF file (.vcf) or (.vcf.gz)")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
} else if (paramName == "input_hpo" && !(paramValue.endsWith(".hpo") || paramValue.endsWith(".txt"))) {
println("Error: '--${paramName}' value '${paramValue}' should be an HPO file (.hpo) or (.txt)")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
} else if (paramName == "ref_dir" && !fileObj.isDirectory()) {
println("Error: '--${paramName}' value '${paramValue}' should be an directory.")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
}
}

if (paramName == "ref_ver" && !(paramValue.equals("hg19") || paramValue.equals("hg38")) ) {
println("Error: '--${paramName}' value ${paramValue} should be either set to 'hg19' or 'hg38'.")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
}
if (paramValue)
return

} else {
println("Input parameter '${paramName}' not specified or is null!")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
}
println("Input parameter '${paramName}' not specified or is null!")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
}

if (params.input_variant && params.input_vcf) {
println("Error: Cannot use '--input_variant' with --input_vcf'")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
}

if (params.input_vcf && !params.input_vcf.endsWith(".vcf") && !params.input_vcf.endsWith(".vcf.gz")) {
println("Error: '--input_vcf' value '${params.input_vcf}' should be a VCF file (.vcf) or (.vcf.gz)")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
}

if (params.input_variant && params.input_variant.count('-') != 3) {
println("Error: '--input_variant' should be formated like 'X-47038564-T-C'")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
}

if (!file(params.ref_dir).isDirectory()) {
println("Error: '--ref_dir' should be an directory.")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
}

if (!params.ref_ver.equals("hg19") && !params.ref_ver.equals("hg38") ) {
println("Error: '--ref_ver' value ${params.ref_ver} should be either set to 'hg19' or 'hg38'.")
println("To see usage and available parameters run `nextflow run main.nf --help`")
exit 1
}
}

showUsage()
showVersion()
validateInputParams()

process GENERATE_INPUT_VCF {
input:
val id

output:
path "input.vcf", emit: vcf

"""
generate_input_vcf.py $id
"""
}

process VALIDATE_VCF {
container 'quay.io/biocontainers/vcftools:0.1.16--pl5321hdcf5f25_11'

Expand Down Expand Up @@ -401,7 +415,7 @@ process FILTER_PROBAND {
path ref_gnomad_exome_idx

output:
path "${params.run_id}.filt.rmBL.vcf", emit: vcf
path "${params.run_id}.filt.rmBL.vcf.gz", emit: vcf

script:
"""
Expand All @@ -420,6 +434,7 @@ process FILTER_PROBAND {
isec_tmp1/0000.vcf.gz isec_tmp2/0000.vcf.gz
mv isec_tmp3/0002.vcf ${params.run_id}.filt.rmBL.vcf
bgzip ${params.run_id}.filt.rmBL.vcf
"""

}
Expand All @@ -435,13 +450,12 @@ process SPLIT_VCF_BY_CHROMOSOME {
"""
# Get the list of chromosomes from the VCF file
bgzip ${vcf}
bcftools index ${vcf}.gz
bcftools index ${vcf}
bcftools query -f '%CHROM\n' ${vcf}.gz | sort | uniq > chrom_list.txt
bcftools query -f '%CHROM\n' ${vcf} | sort | uniq > chrom_list.txt
# Split the VCF file by chromosome
while read chrom; do
bcftools view -r \${chrom} ${vcf}.gz -Oz -o chr\${chrom}.vcf.gz
bcftools view -r \${chrom} ${vcf} -Oz -o chr\${chrom}.vcf.gz
done < chrom_list.txt
"""
}
Expand Down Expand Up @@ -673,7 +687,7 @@ workflow PHRANK_SCORING {
VARIANTS_TO_ENSEMBL(VCF_TO_VARIANTS.out, params.ref_loc)
ENSEMBL_TO_GENESYM(VARIANTS_TO_ENSEMBL.out, params.ref_to_sym, params.ref_sorted_sym)
GENESYM_TO_PHRANK(ENSEMBL_TO_GENESYM.out,
params.input_hpo,
file(params.input_hpo),
params.phrank_dagfile,
params.phrank_disease_annotation,
params.phrank_gene_annotation,
Expand All @@ -684,19 +698,31 @@ workflow PHRANK_SCORING {
}

workflow {
VALIDATE_VCF(params.input_vcf)
if (params.input_vcf) {
input_vcf = file(params.input_vcf)
} else if (params.input_variant) {
GENERATE_INPUT_VCF(params.input_variant)
input_vcf = GENERATE_INPUT_VCF.out.vcf
}

VALIDATE_VCF(input_vcf)
NORMALIZE_VCF(VALIDATE_VCF.out.vcf)
BUILD_REFERENCE_INDEX()

VCF_PRE_PROCESS(
NORMALIZE_VCF.out.vcf,
NORMALIZE_VCF.out.tbi,
BUILD_REFERENCE_INDEX.out.fasta,
BUILD_REFERENCE_INDEX.out.fasta_index,
BUILD_REFERENCE_INDEX.out.fasta_dict,
)
if (params.input_vcf) {
BUILD_REFERENCE_INDEX()
VCF_PRE_PROCESS(
NORMALIZE_VCF.out.vcf,
NORMALIZE_VCF.out.tbi,
BUILD_REFERENCE_INDEX.out.fasta,
BUILD_REFERENCE_INDEX.out.fasta_index,
BUILD_REFERENCE_INDEX.out.fasta_dict,
)
vcf = VCF_PRE_PROCESS.out.vcf
} else {
vcf = NORMALIZE_VCF.out.vcf
}

SPLIT_VCF_BY_CHROMOSOME(VCF_PRE_PROCESS.out.vcf)
SPLIT_VCF_BY_CHROMOSOME(vcf)
ANNOTATE_BY_VEP(
SPLIT_VCF_BY_CHROMOSOME.out.chr_vcfs.flatten(),
params.vep_dir_cache,
Expand Down
5 changes: 4 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
params {
input_vcf = null
input_variant = null

run_id = 1
ref_ver = "hg19"
ref_dir = " "
Expand Down Expand Up @@ -48,7 +51,7 @@ params {

// VEP
vep_dbnsfp_name = params.ref_ver == "hg19" ? "dbNSFP4.3a_grch37.gz" : "dbNSFP4.1a_grch38.gz"
vep_gnomad_name = params.ref_ver == "hg19" ? "gnomad.genomes.r2.1.sites.grch37_noVEP.vcf.gz" : "gnomad.genomes.GRCh38.v3.1.2.sites.vcf.gz"
vep_gnomad_name = params.ref_ver == "hg19" ? "gnomad.genomes.r2.1.sites.grch37_noVEP.vcf.gz" : "gnomad.genomes.GRCh38.v3.1.2.sites.vcf.bgz"
vep_cadd_name = params.ref_ver == "hg19" ? "hg19_whole_genome_SNVs.tsv.gz" : "hg38_whole_genome_SNV.tsv.gz"

vep_dir_cache = "${ref_dir}/vep/${ref_ver}/"
Expand Down
Empty file removed test.txt
Empty file.

0 comments on commit 4fb7f87

Please sign in to comment.