-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCombine_I1_R1_R2_v3.py
72 lines (62 loc) · 3.26 KB
/
Combine_I1_R1_R2_v3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
## This script takes in the sample barcode, R1, and R2, fastq files and parses read belonging to each sample
import sys
import re
from itertools import islice
import gzip
I1_filepath = sys.argv[1]
R1_filepath = sys.argv[2]
R2_filepath = sys.argv[3]
output_file = gzip.open('/PATH/TO/WORKING/DIRECTORY/SAMPLE_Matched_reads.fastq.gz', 'wt') ### EDIT TO DESIRED PATH AND SAMPLE NAME
Barcodes_list = ['TACCACCA', 'CTAAGTTT', 'GGGTCAAG', 'ACTGTGGC'] # input 4 sample indexing barcodes
with gzip.open(I1_filepath, mode='rt') as fp1: # the rt mode is for reading gzipped or binary files as text
with gzip.open(R1_filepath, mode='rt') as fp2:
with gzip.open(R2_filepath, mode='rt') as fp3:
# line = fp1.readline()
Total_reads_count = 0
Total_matched_seq = 0
Trimmed_seq_count = 0
for I1_line in fp1:
# print(line, end='')
if re.match('^@', I1_line):
Total_reads_count += 1
I1_header = list(I1_line.split(" "))
# print(I1_header[0])
all_names = I1_header[0]
if I1_line.strip() in Barcodes_list:
matched_name = all_names
# print(matched_name)
for R1_line in fp2:
# print(matched_name)
#Total_matched_seq += 1
if re.search(matched_name, R1_line):
# print(matched_name, R1_line)
R1_seq = next(fp2, '').strip()
BC = R1_seq[0:15]
UMI = R1_seq[16:27]
break
for R2_line in fp3:
if re.search(matched_name, R2_line):
# print(matched_name, R2_line,)
Total_matched_seq += 1
mRNA = list(islice(fp3, 1))[-1]
Quality = list(islice(fp3, 2))[-1]
#print(mRNA)
x = mRNA.find('AAAAAA')
if x == (-1):
print(matched_name+'_'+BC+'_'+UMI, file=output_file)
print(mRNA, end='', file=output_file)
print('+', file=output_file)
print(Quality, end='', file=output_file)
break
else:
Trimmed_seq_count += 1
trimming_end = (x - 1)
#print(mRNA, x, trimming_end)
trimmed_seq = mRNA[0:trimming_end]
trimmed_quality = Quality[0:trimming_end]
print(matched_name+'_'+BC+'_'+UMI, file=output_file)
print(trimmed_seq, file=output_file)
print('+', file=output_file)
print(trimmed_quality, file=output_file)
break
print('Total reads:', Total_reads_count, ' # of matched sequences:', Total_matched_seq, ' # of trimmed sequences:', Trimmed_seq_count)