diff --git a/test/benchmark/config/caller_comparison_config.yaml b/test/benchmark/config/caller_comparison_config.yaml index 5cfec362..6ef5380a 100644 --- a/test/benchmark/config/caller_comparison_config.yaml +++ b/test/benchmark/config/caller_comparison_config.yaml @@ -48,6 +48,19 @@ long_bam_name: l2: "AshkenazimTrio - HG002 NA24385 Son - PacBio CCS 10kb" l3: "AshkenazimTrio - HG002 NA24385 Son - 10X Genomics" +# short read data - simulation from hs37d5.fa +### GRCh38 +simulated_short_read_bam: + sim1: data/simulation_GRCh38/hg38_simulation_10000000_Illumina_reads_default_sorted.bam + sim2: data/simulation_GRCh38/hg38_simulation_10000000_Illumina_reads_InDel_sorted.bam + sim3: data/simulation_GRCh38/hg38_simulation_10000000_Illumina_reads_noSNP_sorted.bam + sim4: data/simulation_GRCh38/hg38_simulation_10000000_Illumina_reads_SNPandSV_sorted.bam +simulated_short_bam_name: + sim1: "AshkenazimTrio - HG002 NA24385 Son - hs37d5 Mason Simulation of 10000000 Illumina Reads - Default" + sim2: "AshkenazimTrio - HG002 NA24385 Son - hs37d5 Mason Simulation of 10000000 Illumina Reads - InDel" + sim3: "AshkenazimTrio - HG002 NA24385 Son - hs37d5 Mason Simulation of 10000000 Illumina Reads - no SNP" + sim4: "AshkenazimTrio - HG002 NA24385 Son - hs37d5 Mason Simulation of 10000000 Illumina Reads - SNP and SV" + ### GRCh37 reference_fa: Illumina_Paired_End: data/reference/GRCh37/hs37d5.fa @@ -56,12 +69,12 @@ reference_fa: PacBio_CCS: data/reference/GRCh37/hs37d5.fa 10X_Genomics: data/reference/GRCh37/hg19.reordered.fa ### GRCh38 -# reference_fa: # Illumina_Paired_End: data/reference/GRCh38/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fa # Illumina_Mate_Pair: data/reference/GRCh38/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fa # MtSinai_PacBio: data/reference/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa # PacBio_CCS: data/reference/GRCh38/??? # 10X_Genomics: data/reference/GRCh38/??? + simulation_ref: data/simulation_GRCh38/hg38_chr21.fa truth_set: gz: data/truth_set/HG002/HG002_SVs_Tier1_v0.6.vcf.gz @@ -77,6 +90,14 @@ truth_set_DEL: gz: data/truth_set/HG002/DEL/HG002_SVs_Tier1_v0.6.DEL.vcf.gz truth_set_DEL_renamed_chr: gz: data/truth_set/HG002/DEL/HG002_SVs_Tier1_v0.6.DEL.renamed_chr.vcf.gz +truth_set_simulation_default: + gz: data/simulation_GRCh38/hg38_SV_simulation_default_sorted.vcf.gz +truth_set_simulation_InDel: + gz: data/simulation_GRCh38/hg38_SV_simulation_InDel_sorted.vcf.gz +truth_set_simulation_noSNP: + gz: data/simulation_GRCh38/hg38_SV_simulation_noSNP_sorted.vcf.gz +truth_set_simulation_SNPandSV: + gz: data/simulation_GRCh38/hg38_SV_simulation_SNPandSV_sorted.vcf.gz # truth_set: # gz: data/truth_set/HG002_SVs_Tier1_v0.6.Hg38.vcf.gz # bed: data/truth_set/HG002_SVs_Tier1_v0.6.Hg38.bed diff --git a/test/benchmark/iGenVar_init.sh b/test/benchmark/iGenVar_init.sh index 408e5f0b..316b8120 100755 --- a/test/benchmark/iGenVar_init.sh +++ b/test/benchmark/iGenVar_init.sh @@ -69,6 +69,8 @@ mkdir -p data && cd data # short & long reads, reference and truth sets ./../Repos/iGenVar/test/benchmark/dataset_downloads.sh +# create simulated datasets +./../Repos/iGenVar/test/benchmark/simulation/mason_simulation.sh echo "$(tput setaf 1)$(tput setab 7)------- Datasets downloaded (7/9) --------$(tput sgr 0)" 1>&3 diff --git a/test/benchmark/simulation/mason_simulation.sh b/test/benchmark/simulation/mason_simulation.sh new file mode 100644 index 00000000..6eb78572 --- /dev/null +++ b/test/benchmark/simulation/mason_simulation.sh @@ -0,0 +1,166 @@ +#!/usr/bin/env sh + +cd data +mkdir -p simulation_GRCh38 && cd simulation_GRCh38 + +conda env create -f Repos/iGenVar/test/benchmark/envs/simulation.yaml +conda activate simulation + +# With a 47644184 bp of chr21 and 10000000 simulated reads, which are 200 bp long, we have a coverage of: +# (10000000*200)/47644184 = 41.9778414087 + +# Cut chromosome 21 from reference +sed -n '/^>chr21/,$p' ../reference/GRCh38/hg38.fa > hg38_chr21_.fa +sed '/>chr22/,$d' hg38_chr21_.fa > hg38_chr21.fa + +bwa index hg38_chr21.fa + +# Mason +# Tools for Biological Sequence Simulation +# ======================================== +# https://github.com/seqan/seqan/blob/master/apps/mason2/README + +git clone --single-branch --depth 1 --branch develop https://github.com/seqan/seqan +cd ../.. && mkdir seqan && cd seqan +cmake -DCMAKE_BUILD_TYPE=Release ../../Repos/seqan +make -j16 -k mason_simulator mason_variator + +# mason_variator - Variation Simulation +# ===================================== +# mason_variator [OPTIONS] -ir IN.fa -ov OUT.vcf [-of OUT.fa] +# Variation Simulation: +# --snp-rate DOUBLE +# Per-base SNP rate. In range [0.0..1.0]. Default: 0.0001. +# --small-indel-rate DOUBLE +# Small indel rate. In range [0.0..1.0]. Default: 0.000001. +# --min-small-indel-size INTEGER +# Minimal small indel size to simulate. In range [0..inf]. Default: 1. +# --max-small-indel-size INTEGER +# Maximal small indel size to simulate. In range [0..inf]. Default: 6. +# --sv-indel-rate DOUBLE +# Per-base SNP rate. In range [0.0..1.0]. Default: 0.0000001. +# --sv-inversion-rate DOUBLE +# Per-base SNP rate. In range [0.0..1.0]. Default: 0.0000001. +# --sv-translocation-rate DOUBLE +# Per-base SNP rate. In range [0.0..1.0]. Default: 0.0000001. +# --sv-duplication-rate DOUBLE +# Per-base SNP rate. In range [0.0..1.0]. Default: 0.0000001. +# --min-sv-size INTEGER +# Minimal SV size to simulate. In range [0..inf]. Default: 50. +# --max-sv-size INTEGER +# Maximal SV size to simulate. In range [0..inf]. Default: 1000. +SMALL_INDEL_SIZE=19 +MIN_SV_SIZE=20 +MAX_SV_SIZE=20000 +SV_RATE=0.00001 + +./../../build/seqan/bin/mason_variator --in-reference hg38_chr21.fa --out-vcf hg38_SV_simulation_default.vcf --seed 0 \ + > hg38_SV_simulation_default.log +./../../build/seqan/bin/mason_variator --in-reference hg38_chr21.fa --out-vcf hg38_SV_simulation_InDel.vcf --seed 1 \ + --snp-rate 0 --small-indel-rate ${SV_RATE} --max-small-indel-size ${SMALL_INDEL_SIZE} \ + --sv-indel-rate ${SV_RATE} --min-sv-size ${MIN_SV_SIZE} --max-sv-size ${MAX_SV_SIZE} \ + --sv-inversion-rate 0 --sv-translocation-rate 0 --sv-duplication-rate 0 \ + > hg38_SV_simulation_InDel.log +./../../build/seqan/bin/mason_variator --in-reference hg38_chr21.fa --out-vcf hg38_SV_simulation_noSNP.vcf --seed 2 \ + --snp-rate 0 --small-indel-rate ${SV_RATE} --max-small-indel-size ${SMALL_INDEL_SIZE} \ + --sv-indel-rate ${SV_RATE} --min-sv-size ${MIN_SV_SIZE} --max-sv-size ${MAX_SV_SIZE} \ + --sv-inversion-rate ${SV_RATE} --sv-translocation-rate ${SV_RATE} --sv-duplication-rate ${SV_RATE} \ + > hg38_SV_simulation_noSNP.log +./../../build/seqan/bin/mason_variator --in-reference hg38_chr21.fa --out-vcf hg38_SV_simulation_SNPandSV.vcf --seed 3 \ + --snp-rate ${SV_RATE} --small-indel-rate ${SV_RATE} --max-small-indel-size ${SMALL_INDEL_SIZE} \ + --sv-indel-rate ${SV_RATE} --min-sv-size ${MIN_SV_SIZE} --max-sv-size ${MAX_SV_SIZE} \ + --sv-inversion-rate ${SV_RATE} --sv-translocation-rate ${SV_RATE} --sv-duplication-rate ${SV_RATE} \ + > hg38_SV_simulation_SNPandSV.log + +# sort the vcf truth +picard SortVcf -I hg38_SV_simulation_default.vcf \ + -O hg38_SV_simulation_default_sorted.vcf -Xms1g -Xmx100g --TMP_DIR tmp/picard/ \ + >> hg38_SV_simulation_default.log +picard SortVcf -I hg38_SV_simulation_InDel.vcf \ + -O hg38_SV_simulation_InDel_sorted.vcf -Xms1g -Xmx100g --TMP_DIR tmp/picard/ \ + >> hg38_SV_simulation_InDel.log +picard SortVcf -I hg38_SV_simulation_noSNP.vcf \ + -O hg38_SV_simulation_noSNP_sorted.vcf -Xms1g -Xmx100g --TMP_DIR tmp/picard/ \ + >> hg38_SV_simulation_noSNP.log +picard SortVcf -I hg38_SV_simulation_SNPandSV.vcf \ + -O hg38_SV_simulation_SNPandSV_sorted.vcf -Xms1g -Xmx100g --TMP_DIR tmp/picard/ \ + >> hg38_SV_simulation_InDel.log + +bgzip -c hg38_SV_simulation_default_sorted.vcf > hg38_SV_simulation_default_sorted.vcf.gz +bgzip -c hg38_SV_simulation_InDel_sorted.vcf > hg38_SV_simulation_InDel_sorted.vcf.gz +bgzip -c hg38_SV_simulation_noSNP_sorted.vcf > hg38_SV_simulation_noSNP_sorted.vcf.gz +bgzip -c hg38_SV_simulation_SNPandSV_sorted.vcf > hg38_SV_simulation_SNPandSV_sorted.vcf.gz + +# create index +tabix -p vcf hg38_SV_simulation_default_sorted.vcf.gz +tabix -p vcf hg38_SV_simulation_InDel_sorted.vcf.gz +tabix -p vcf hg38_SV_simulation_noSNP_sorted.vcf.gz +tabix -p vcf hg38_SV_simulation_SNPandSV_sorted.vcf.gz + +# mason_simulator - Read Simulation +# ================================= +# mason_simulator [OPTIONS] -ir IN.fa --num-fragments NUM [-iv IN.vcf] --out LEFT.fq [--out-right RIGHT.fq] +# Simulate NUM reads/pairs from the reference sequence IN.fa, potentially with variants from IN.vcf. In case that +# both -o and -or are given, write out paired-end data, if only -io is given, only single-end reads are simulated. +# -oa, --out-alignment OUTPUT_FILE +# SAM/BAM file with alignments. Valid filetypes are: .sam[.*] and .bam, where * is any of the following +# extensions: gz and bgzf for transparent (de)compression. +# -ir, --input-reference INPUT_FILE +# Path to FASTA file to read the reference from. Valid filetypes are: .sam[.*], .raw[.*], .gbk[.*], .frn[.*], +# .fq[.*], .fna[.*], .ffn[.*], .fastq[.*], .fasta[.*], .faa[.*], .fa[.*], .embl[.*], and .bam, where * is any +# of the following extensions: gz and bgzf for transparent (de)compression. +# -iv, --input-vcf INPUT_FILE +# Path to the VCF file with variants to apply. Valid filetype is: .vcf[.*], where * is any of the following +# extensions: gz and bgzf for transparent (de)compression. + +# ! needs indexed fasta file ! +./../../build/seqan/bin/mason_simulator --input-reference hg38_chr21.fa --input-vcf hg38_SV_simulation_default.vcf \ + --out hg38_simulation_10000000_Illumina_reads_default.fastq \ + --num-fragments 10000000 --seq-technology illumina --illumina-read-length 200 \ + >> hg38_SV_simulation_default.log +./../../build/seqan/bin/mason_simulator --input-reference hg38_chr21.fa --input-vcf hg38_SV_simulation_InDel.vcf \ + --out hg38_simulation_10000000_Illumina_reads_InDel.fastq \ + --num-fragments 10000000 --seq-technology illumina --illumina-read-length 200 \ + >> hg38_SV_simulation_InDel.log +./../../build/seqan/bin/mason_simulator --input-reference hg38_chr21.fa --input-vcf hg38_SV_simulation_noSNP.vcf \ + --out hg38_simulation_10000000_Illumina_reads_noSNP.fastq \ + --num-fragments 10000000 --seq-technology illumina --illumina-read-length 200 \ + >> hg38_SV_simulation_noSNP.log +./../../build/seqan/bin/mason_simulator --input-reference hg38_chr21.fa --input-vcf hg38_SV_simulation_SNPandSV.vcf \ + --out hg38_simulation_10000000_Illumina_reads_SNPandSV.fastq \ + --num-fragments 10000000 --seq-technology illumina --illumina-read-length 200 \ + >> hg38_SV_simulation_InDel.log + +bwa mem hg38_chr21.fa hg38_simulation_10000000_Illumina_reads_default.fastq \ + > hg38_simulation_10000000_Illumina_reads_default.sam +bwa mem hg38_chr21.fa hg38_simulation_10000000_Illumina_reads_InDel.fastq \ + > hg38_simulation_10000000_Illumina_reads_InDel.sam +bwa mem hg38_chr21.fa hg38_simulation_10000000_Illumina_reads_noSNP.fastq \ + > hg38_simulation_10000000_Illumina_reads_noSNP.sam +bwa mem hg38_chr21.fa hg38_simulation_10000000_Illumina_reads_SNPandSV.fastq \ + > hg38_simulation_10000000_Illumina_reads_SNPandSV.sam + +# You can use the --mate-orientation to set the relative orientation when doing paired-end sequencing. The valid +# values are given in the following. + +# FR Reads are inward-facing, the same as Illumina paired-end reads: R1 --> <-- R2. +# RF Reads are outward-facing, the same as Illumina mate-pair reads: R1 <-- --> R2. +# FF Reads are on the same strand: R1 --> --> R2. +# FF2 Reads are on the same strand but the "right" reads are sequenced to the left of the "left" reads, same as +# 454 paired: R2 --> --> R1. + +samtools sort -O BAM hg38_simulation_10000000_Illumina_reads_default.sam \ + > hg38_simulation_10000000_Illumina_reads_default_sorted.bam +samtools sort -O BAM hg38_simulation_10000000_Illumina_reads_InDel.sam \ + > hg38_simulation_10000000_Illumina_reads_InDel_sorted.bam +samtools sort -O BAM hg38_simulation_10000000_Illumina_reads_noSNP.sam \ + > hg38_simulation_10000000_Illumina_reads_noSNP_sorted.bam +samtools sort -O BAM hg38_simulation_10000000_Illumina_reads_SNPandSV.sam \ + > hg38_simulation_10000000_Illumina_reads_SNPandSV_sorted.bam + +samtools index hg38_simulation_10000000_Illumina_reads_default_sorted.bam +samtools index hg38_simulation_10000000_Illumina_reads_InDel_sorted.bam +samtools index hg38_simulation_10000000_Illumina_reads_noSNP_sorted.bam +samtools index hg38_simulation_10000000_Illumina_reads_SNPandSV_sorted.bam + +cd ../..