-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread_generator_1.2.sh
170 lines (131 loc) · 6.97 KB
/
read_generator_1.2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/bin/bash
# define inputs
REPEAT_VAR=${1}
echo "repetitive element = $REPEAT_VAR"
REPEATS_PULLDOWN=${2}
echo "number of repetitive elements = $REPEATS_PULLDOWN"
REPEATS_PATHTEST=${3}
echo "$REPEATS_PATHTEST"
REPEATS_INTERVALTEST=${4}
echo "$REPEATS_INTERVALTEST"
FASTQ_REPS=${5}
echo "$FASTQ_REPS"
PATH_CHR=${6}
echo "$PATH_CHR"
PATH_START=${7}
echo "$PATH_START"
PATH_END=${8}
echo "$PATH_END"
EXT_BP=${9}
echo "$EXT_BP"
hg19_genome=${10}
echo "$hg19_genome"
ArtFastGen_location=${11}
echo "$ArtFastGen_location"
njobs=${12}
echo "$njobs"
######################
### PULLDOWN FASTA ###
######################
# Create FASTA file with repetitive sequence for use in BLAST
echo ">rg_${REPEATS_PULLDOWN}_${REPEAT_VAR}" > rg_${REPEATS_PULLDOWN}_${REPEAT_VAR}.fsa
printf -v v "%-*s" "${REPEATS_PULLDOWN}" ""; echo "${v// /${REPEAT_VAR}}" >> rg_${REPEATS_PULLDOWN}_${REPEAT_VAR}.fsa
# Make human genome database from hg19 genome
makeblastdb -in $hg19_genome -dbtype nucl -title hg19_genome -out hg19_genome.db
# Local hg19 query
ws=$(tail -1 rg_${REPEATS_PULLDOWN}_${REPEAT_VAR}.fsa | wc -c)
word_size=$(expr $ws - 1)
blastn -task megablast -word_size $word_size -query rg_${REPEATS_PULLDOWN}_${REPEAT_VAR}.fsa -evalue 1000 -dust no -db hg19_genome.db -out rg_${REPEATS_PULLDOWN}_${REPEAT_VAR}_hg19local30_out -outfmt "7 sacc sstart send evalue qacc qstart qend"
# remote hg38 query - this isnt returning the exact reuslt we want yet
# blastn -task megablast -word_size 16 -query 5C9Repeats.fsa -evalue 1000 -dust no -db GPIPE/9606/109/ref_top_level -out 5C9Repeats_hg38remote_out -remote -outfmt "7 qacc sacc evalue qstart qend sstart send"
# Set expected value threshold (default is 10)
awk '!/^#/ && ($4 < 10)' rg_${REPEATS_PULLDOWN}_${REPEAT_VAR}_hg19local30_out > temp1.bed
# Reformat bed file to ensure smallest (start) value on the left
awk -F'\t' '$2 > $3 { $0 = $1 FS $3 FS $2 } 1' temp1.bed > temp2.bed
# Sort and Merge overlapping features
sort -k1,1 -k2,2n temp2.bed | awk -v OFS='\t' '{print $1, $2, $3}' > sorted.bed
bedtools merge -i sorted.bed > sorted_merged.bed
# Extend output +/- bp (set to 150bp)
awk -v x=$EXT_BP {'printf ("%s\t%s\t%s\n", $1, $2 - x, $3 + x)'} sorted_merged.bed > temp3.bed
# Get fasta sequences - This represents the C9 pulldown library.
bedtools getfasta -fi $hg19_genome -bed temp3.bed -fo rg_${REPEATS_PULLDOWN}_${REPEAT_VAR}.fa
# Split to make reads from each enriched region
mkdir split_dir
csplit -s -z rg_${REPEATS_PULLDOWN}_${REPEAT_VAR}.fa '/>/' '{*}'
for i in xx* ; do \
n=$(sed 's/>// ; s/ .*// ; 1q' "$i") ; \
mv "$i" "split_dir/$n.fa" ; \
done
# cleanup temporary files
rm temp1.bed temp2.bed sorted.bed
########################
### ARTIFICIAL FASTA ###
########################
# Create bed file of expansion location
echo $PATH_CHR $PATH_START $PATH_END > temp4.bed
# Expand bed file to include +/- basepairs
awk -v x=$EXT_BP {'printf ("%s\t%s\t%s\n", $1, $2 - x, $3 + x)'} temp4.bed > temp5.bed
# Get fasta sequences
bedtools getfasta -fi $hg19_genome -bed temp5.bed -fo path_repeat.fa
# make total .bed for downstream analysis
cat temp5.bed temp3.bed > art_pulldown.bed
# select start and end FASTA sequence +/- base-pairs added
string_length=$(sed -n '2p' path_repeat.fa | wc -m)
COUNT=`expr $string_length - $EXT_BP`
start=$(sed -n '2p' path_repeat.fa | cut -c1-${EXT_BP})
end=$(sed -n '2p' path_repeat.fa | cut -c${COUNT}-${string_length})
# Create artificial FASTA files with pathogenic repeats to be tested, place into split directory for cleanliness
rm repeat_lengths_list
for i in $(seq 0 1 7); do
echo "scale=2;2^$i" | bc >> repeat_lengths_list
done
seq 200 100 1000 >> repeat_lengths_list
seq 1200 200 2000 >> repeat_lengths_list
for i in $(cat repeat_lengths_list); do
echo ">repeat_element" > split_dir/rg_pathrep${i}_${REPEAT_VAR}.fsa
middle=$(printf -v v "%-*s" "${i}" ""; echo "${v// /${REPEAT_VAR}}")
echo $start$middle$end >> split_dir/rg_pathrep${i}_${REPEAT_VAR}.fsa
done
########################
### ARTIFICIAL FASTQ ###
########################
# Create artificial fastq files for each FASTA file. Note: variables contained within single inverted commas cause errors in bash and so '"${ used for location variable to expand
#parallel -a fa_list --xapply -j $njobs --eta 'artfastgen -O {1.}_R -R {1} -S ">repeat_element" -RL 300 -TLM 400 -URQS true -SE true -F1 '"${ArtFastGen_location}"'/test1.fastq -F2 '"${ArtFastGen_location}"'/test2.fastq'
#parallel -a fa_list --xapply -j $njobs --eta 'artfastgen -O {1.}_R -R {1} -S ">repeat_element" -CMP 100 -CSD 0.1 -RL 301 -TLM 400 -CMGCS 1 -CMPGC 1 -URQS true -SE true -F1 '"${URQS_read1}"' -F2 '"${URQS_read2}"''
rm fa_list fsa_list
ls split_dir/*fa >> fa_list
ls split_dir/*fsa >> fsa_list
#parallel -a fa_list --xapply -j $njobs --eta 'artfastgen -O {1.}_R -R {1} -S ">" -CMP 1000000 -RL 301 -TLM 600 -GCR 20 -CMPGC 1'
# run fa and fsa seperately as artificial c9 will generate ~ 10X more reads at same depth due to GC richness
#parallel -a fa_list --xapply -j $njobs --eta 'artfastgen -O {1.}_R -R {1} -S ">" -CMP 1000000 -RL 301 -TLM 600 -GCR 20 -CMPGC 0.8'
#parallel -a fsa_list --xapply -j $njobs --eta 'artfastgen -O {1.}_R -R {1} -S ">" -CMP 100000 -RL 301 -TLM 600 -GCR 20 -CMPGC 0.8'
parallel -a fa_list --xapply -j $njobs --eta 'artfastgen -O {1.}_R -R {1} -S ">" -CMP 100000 -RL 301 -TLM 600'
parallel -a fsa_list --xapply -j $njobs --eta 'artfastgen -O {1.}_R -R {1} -S ">" -CMP 100000 -RL 301 -TLM 600'
# rename and gzip .fastq files
rename _R.1.fastq _R1.fastq split_dir/*_R.1.fastq
rename _R.2.fastq _R2.fastq split_dir/*_R.2.fastq
# sample reads to create replicate files with a portion of .fastq from each .fasta location
mkdir fastq_files
for rep_fq in $(cat repeat_lengths_list);do
rep_fq2=split_dir/rg_pathrep${rep_fq}_GGCCCC
for FQ in $(ls split_dir/chr*_R1.fastq);do
echo $FQ
for reads in {0.1,0.2,0.5,1}; do
#for reads in {5000,25000,50000,100000,200000}; do
#reads=1000
seqtk sample -s 100 ${rep_fq2}_R1.fastq $reads > fastq_files/${rep_fq}R${reads}1_R1.fastq
seqtk sample -s 100 ${rep_fq2}_R2.fastq $reads > fastq_files/${rep_fq}R${reads}1_R2.fastq
seqtk sample -s 50 ${rep_fq2}_R1.fastq $reads > fastq_files/${rep_fq}R${reads}2_R1.fastq
seqtk sample -s 50 ${rep_fq2}_R2.fastq $reads > fastq_files/${rep_fq}R${reads}2_R2.fastq
seqtk sample -s 10 ${rep_fq2}_R1.fastq $reads > fastq_files/${rep_fq}R${reads}3_R1.fastq
seqtk sample -s 10 ${rep_fq2}_R2.fastq $reads > fastq_files/${rep_fq}R${reads}3_R2.fastq
# reads=$(echo "$cell_reads * $reads_input" | bc | awk '{print int($1+0.5)}')
seqtk sample -s 100 $FQ $reads >> fastq_files/${rep_fq}R${reads/./}1_R1.fastq
seqtk sample -s 100 ${FQ%%_R1.fastq}_R2.fastq $reads >> fastq_files/${rep_fq}R${reads/./}1_R2.fastq
seqtk sample -s 50 $FQ $reads >> fastq_files/${rep_fq}R${reads/./}2_R1.fastq
seqtk sample -s 50 ${FQ%%_R1.fastq}_R2.fastq $reads >> fastq_files/${rep_fq}R${reads/./}2_R2.fastq
seqtk sample -s 10 $FQ $reads >> fastq_files/${rep_fq}R${reads/./}3_R1.fastq
seqtk sample -s 10 ${FQ%%_R1.fastq}_R2.fastq $reads >> fastq_files/${rep_fq}R${reads/./}3_R2.fastq
done
done
done