-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathREADME
89 lines (62 loc) · 4.97 KB
/
README
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
### step-by-step preprocessing miseq 300PE reads AND 250PE reads
# reference:
# doi:10.1186/2049-2618-2-6
# Fadrosh D, Ma B, Gajer P, Sengamalay N, Ott S, Brotman R, Ravel J. 2014. An improved dual-indexing approach for multiplexed 16S rRNA gene sequencing on the Illumina MiSeq platform. Microbiome 2:6
# http://www.microbiomejournal.com/content/pdf/2049-2618-2-6.pdf
# this pipeline is currently being incorporated into QIIME, stay tuned with the next version of QIIME.
#1. trim barcode
####trim barcode, form a new barcode file, this step requires fastx_trimmer from fastx toolbox
fastx_trimmer -i R1.fq -f 1 -l 12 -Q 33 -o R1_barcode.fq
fastx_trimmer -i R2.fq -f 1 -l 12 -Q 33 -o R2_barcode.fq
cat R1_barcode.fq | fq_mergelines.pl > R1_barcode_temp
cat R2_barcode.fq | fq_mergelines.pl > R2_barcode_temp
paste R1_barcode_temp R2_barcode_temp | awk -F"\t" '{print $5"\t"$2$6"\t"$3"\t"$4$8}' | fq_splitlines.pl > R1R2_barcode.fastq
####trim barcode off the sequence
seqtk trimfq -b 12 R1.fq > R1_trimmed_seq.fastq
seqtk trimfq -b 12 R2.fq > R2_trimmed_seq.fastq
#2. assemble paired end read using either pandaseq, FLASH, or SeqPrep. QIIME 1.8 version also implement the function of merging reads ends.
# note that different merging algorithm has its own issues and disadvantages, choose carefully based on your needs (efficiency, overlapping length, quality, etc.)
# await evaluating ea-utils
#a)using pandaseq, note that the overlapping length needed to be determined based on read length and amplicon size
#Run Pandaseq to assemble MiSeq1 paired-end reads
pandaseq -F -o $overlapping_length -B -f R1.fq -r R3.fq > PandaAssembly.fq
#Use a Perl script followed by a Python script to fix Pandaseq MiSeq1 output and prepare it for Qiime
touch fix_5742_pandaseq_fastq.pl
echo -e "0a\n\#\!/usr/bin/perl\n\nmy_SPACE_\$filename_SPACE_=_SPACE_<\$ARGV[0]>;\nchomp_SPACE_\$filename;\nopen_SPACE_(FASTQ,_SPACE_\$filename);\n\t{\n\tif_SPACE_(\$filename_SPACE_=~_SPACE_/(.*)\\.[^.]*/)\n\t\t{\n\t\topen_SPACE_OUT,_SPACE_\">\$1.fixed.fastq\";\n\t\t}\n\t}\n\nwhile_SPACE_(<FASTQ>)\n\t{\n\tif_SPACE_(\$__SPACE_=~_SPACE_/^\\\@(NGSC\\-005)\\:(\\d*)\\:(000000000\\-A14NC)\\:(\\d*)\\:(\\\d*)\\:(\\d*)\\:(\\d*)\\:/)\n\t\t{\n\t\tprint_SPACE_OUT_SPACE_\"\\\@\$1:\$2:\$3:\$4:\$5:\$6:\$7_SPACE_2:N:0:_SLASH__LETTERN_\";\n\t\t}\n\telse\n\t\t{\n\t\tprint_SPACE_OUT_SPACE_\$_;\n\t\t}\n\t}\n\n.\n,wq" | ed fix_5742_pandaseq_fastq.pl
sed -i 's/_SPACE_/ /g' fix_5742_pandaseq_fastq.pl
sed -i 's/_SLASH_/\\/g' fix_5742_pandaseq_fastq.pl
sed -i 's/_LETTERN_/n/g' fix_5742_pandaseq_fastq.pl
sed -i 's/\\\#\\\!/\#\!/g' fix_5742_pandaseq_fastq.pl
perl fix_5742_pandaseq_fastq.pl PandaAssembly.fq
##make a barcode file with only the entries associated with sequences in the Pandaseq assembled MiSeq1 data set
sed -n '1~4'p PandaAssembly.fixed.fastq | sed -e 's/^@//' -e 's/:$//' > PandaAssembly.keep.header
seqtk subseq R2.fq PandaAssembly.keep.header > R2_filter.fq
sed '1~4 s/:$//' PandaAssembly.fixed.fastq > assembly.fq
fastqc --nogroup --noextract -q assembly.fq
#b) using FLASH
# http://ccb.jhu.edu/software/FLASH/
#c) SeqPrep
# https://github.com/jstjohn/SeqPrep
#d) PEAR
http://sco.h-its.org/exelixis/web/software/pear/
#e) QIIME1.8 also have the assembly featuer
# join_paired_end.py
#3. match up the barcode and sequence file, script attached
fq_getPairAndOrphan1.8.py R3.fastq R1.fastq R3N_PE.fq R1N_PE.fq orphan_temp.fq
fq_getPairAndOrphan1.8.py R3N_PE.fq R1R2_barcode.fastq PE_temp.fq barcode.fq orphan_temp.fq
# Eventually assembly.fq barcode.fq are the 3 files that you want to use for split
#4. run split_library for illumina using QIIME
http://qiime.org/1.3.0/scripts/split_libraries_fastq.html
#5. post-processing to remove barcode, heterogeneous spacer, and primer
# first trim off barcode, which is 24 bases at the beginning of the sequences
# to trim stagger and primer, I use tagcleaner (http://tagcleaner.sourceforge.net) to trim off the sequences from the beginning of the reads to the part that could match to the primer.
# the structure of the reads is the order of adaptor (pre-trimmed) + barcode + heterogeneous spacer (0-7 bases) + primer + sequence.
# await evaluating cutadapt (https://github.com/marcelm/cutadapt#paired-end-adapter-trimming)
prinseq-lite -trim_left 24 -fasta $fasta -line_width 0 -out_good $out
tagcleaner -fasta $fasta -out $out_tagclean -line_width 0 -verbose -log tagclean300PE.log -nomatch 3 -tag5 $forwardPrimer -mm5 $mismatch -tag3 $reversePrimer -mm3 $mismatch -trim_within $length
#getting information on what being trimmed, this is optional and only for logging
tagcleaner -fasta $fasta -out $out -line_width 0 -nomatch 3 -info -tag5 $forwardPrimer -mm5 $mismatch -tag3 $reversePrimer -mm3 $mismatch -trim_within #length
egrep ">" $fasta_with_header_info | awk '{print $4}' | awk '{count[$1]++}END{for (i in count) print count[i]"\t"i}' > ${stripped}.stat
prinseq-lite -stats_info -stats_len -fasta $fasta
#6. for 250PE, if need to stitch, using stitching_pipeline.pl
stitching_pipeline.pl -i R1_seqs.fasta -j R2_seqs.fasta -o output_dir