Skip to content

Commit

Permalink
Merge pull request #112 from LiuzLab/onevar
Browse files Browse the repository at this point in the history
Add --input_variant parameter for handling single variant
  • Loading branch information
jylee-bcm authored Dec 18, 2024
2 parents 5e37e29 + 0f4a4f2 commit 75f7ca1
Show file tree
Hide file tree
Showing 9 changed files with 218 additions and 146 deletions.
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#FROM ubuntu:18.04
FROM ensemblorg/ensembl-vep:release_104.3
USER root
ENV DEBIAN_FRONTEND noninteractive
ENV DEBIAN_FRONTEND=noninteractive

# Install dependencies
RUN apt-get update && apt-get install -y \
Expand Down Expand Up @@ -38,7 +38,7 @@ RUN pip3 install -r /opt/requirements.txt
RUN pip3 install bgzip

# Install bcftools
RUN wget https://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2
RUN curl -OL https://github.com/samtools/bcftools/releases/download/1.20/bcftools-1.20.tar.bz2
RUN mv bcftools-1.20.tar.bz2 /opt/bcftools-1.20.tar.bz2
RUN tar -xf /opt/bcftools-1.20.tar.bz2 -C /opt/ && \
rm /opt/bcftools-1.20.tar.bz2 && \
Expand All @@ -49,7 +49,7 @@ RUN tar -xf /opt/bcftools-1.20.tar.bz2 -C /opt/ && \
rm -rf /opt/bcftools-1.20

# Install bedtools
RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools.static.binary
RUN curl -OL https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools.static.binary
RUN mv bedtools.static.binary /run/bedtools
RUN chmod a+x /run/bedtools

Expand Down
37 changes: 18 additions & 19 deletions bin/add_c_nc.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ def add_c_nc(score, ref):
if ref == "hg38":
clin_c = pd.read_csv("merge_expand/hg38/clin_c.tsv.gz", sep="\t")
clin_nc = pd.read_csv("merge_expand/hg38/clin_nc.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg38/hgmd_nc.tsv.gz", sep="\t")
hgmd_c = pd.read_csv("merge_expand/hg38/hgmd_c.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg38/hgmd_nc.tsv.gz", sep="\t")
else:
clin_c = pd.read_csv("merge_expand/hg19/clin_c.tsv.gz", sep="\t")
clin_nc = pd.read_csv("merge_expand/hg19/clin_nc.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg19/hgmd_nc.tsv.gz", sep="\t")
hgmd_c = pd.read_csv("merge_expand/hg19/hgmd_c.tsv.gz", sep="\t")
hgmd_nc = pd.read_csv("merge_expand/hg19/hgmd_nc.tsv.gz", sep="\t")

a = score.pos.values
ac = score.chrom.values
Expand All @@ -28,7 +28,7 @@ def add_c_nc(score, ref):

i, j = np.where((a[:, None] >= bl) & (a[:, None] <= bh) & (ac[:, None] == bc))

cln = pd.concat(
clin = pd.concat(
[
temp.loc[i, :].reset_index(drop=True),
clin_nc.loc[j, :].reset_index(drop=True),
Expand All @@ -38,7 +38,7 @@ def add_c_nc(score, ref):

# Take into account When HGMD data base is empty
if hgmd_nc.shape[0] == 0:
pass
hgmd = hgmd_nc

else:
# merge by region
Expand All @@ -58,29 +58,28 @@ def add_c_nc(score, ref):
axis=1,
)

merged = score.merge(cln, how="left", on="varId")
if hgmd_nc.shape[0] == 0:
merged["nc_HGMD_Exp"] = np.NaN
merged["nc_RANKSCORE"] = np.NaN
else:
merged = merged.merge(hgmd, how="left", on="varId")
merged = score.merge(
clin_c.rename(columns={'new_chr': 'chrom', 'new_pos': 'pos'}),
how="left",
on=["chrom", "pos", "ref", "alt"],
)
merged = merged.merge(clin, how="left", on="varId")

if hgmd_c.shape[0] == 0:
merged["c_HGMD_Exp"] = np.NaN
merged["c_RANKSCORE"] = np.NaN
merged["CLASS"] = np.NaN
else:
merged = merged.merge(
hgmd_c,
hgmd_c.rename(columns={'new_chr': 'chrom', 'new_pos': 'pos'}),
how="left",
left_on=["chrom", "pos", "ref", "alt"],
right_on=["new_chr", "new_pos", "ref", "alt"],
on=["chrom", "pos", "ref", "alt"],
)
merged = merged.merge(
clin_c,
how="left",
left_on=["chrom", "pos", "ref", "alt"],
right_on=["new_chr", "new_pos", "ref", "alt"],
)

if hgmd_nc.shape[0] == 0:
merged["nc_HGMD_Exp"] = np.NaN
merged["nc_RANKSCORE"] = np.NaN
else:
merged = merged.merge(hgmd, how="left", on="varId")

return merged
5 changes: 4 additions & 1 deletion bin/extraModel/generate_bivar_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
]

# Remove all the duplicated variant pairs.
gene_var_pairs_df = pd.DataFrame(gene_var_pairs)
gene_var_pairs_df = pd.DataFrame(gene_var_pairs, columns=['geneEnsId', 'varId1', 'varId2'])
# gene_var_pairs_df = gene_var_pairs_df.drop_duplicates(['varId1', 'varId2'])

# Use only subset columns of features
Expand Down Expand Up @@ -130,4 +130,7 @@ def process_sample(data_folder, sample_id, default_pred, labeling=False):
# Sort before saving
recessive_feature_df = recessive_feature_df.sort_index()

if recessive_feature_df.shape[0] == 0:
return

recessive_feature_df.to_csv(f"{recessive_folder}/{sample_id}.csv")
2 changes: 1 addition & 1 deletion bin/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def main():
print("input annoatated varFile:", args.varFile)
t1 = time.time()
varDf = pd.read_csv(
args.varFile, sep="\t", skiprows=numHeaderSkip, error_bad_lines=False
args.varFile, sep="\t", skiprows=numHeaderSkip,
)
# varDf=varDf[0:10]
# #do this if need to have a small test
Expand Down
35 changes: 35 additions & 0 deletions bin/generate_input_vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env python3.8

import string
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("variant", type=str)

args = parser.parse_args()

vcf_template = string.Template("""
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##FILTER=<ID=PASS,Description="All filters passed">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
$chrom $pos . $ref $alt . . . GT:AD:DP:GQ:PL 0/1:7,5:12:99:142,0,214
""".strip())

def main(variant):
chrom, pos, ref, alt = variant.split("-")
with open("input.vcf", "w", encoding="ascii") as text_file:
text_file.write(vcf_template.substitute(
chrom=chrom,
pos=pos,
ref=ref,
alt=alt,
))

if __name__ == "__main__":
main(**vars(args))
1 change: 1 addition & 0 deletions docs/source/nf_usage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Args:
--outdir Output directory.
--run_id Unique identifier for this run. (default: 1)
--ref_ver Reference genome version [hg38 or hg19] (default: hg19)
--input_variant Variant ID Formatted in 'chr-pos-alt-ref' (optional)
--bed_filter Path to a BED file to perform an analysis only for regions of interest (optional)
--exome_filter Enable exonic filter. Addition will filter out variants outside of exonic region (default: false)

Expand Down
Loading

0 comments on commit 75f7ca1

Please sign in to comment.