-
Notifications
You must be signed in to change notification settings - Fork 2
/
bssh_reads_by_run.py
executable file
·152 lines (111 loc) · 3.92 KB
/
bssh_reads_by_run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env python3
"""
Author: Erin Young
Released: 2024-07-11
Version: 0.0.1
Description:
This script isn't intended to be used on its own; it provides the functions
for downloading fastq files from basespace given a run name.
EXAMPLE:
python3 bssh_reads_by_run.py -r UT-M70330-240131 -o /Volumes/IDGenomics_NAS/pulsenet_and_arln/UT-M70330-240131/reads
"""
import argparse
import logging
import subprocess
import re
import concurrent.futures
import pandas as pd
def bs_out(bash_command):
    """
    Runs a basespace (bs) cli command and parses its table output.

    The parser is positional. It expects the printed text to consist of a
    border line, a header line, a second border line, the data rows, a
    closing border line and a final newline.

    Args:
        bash_command (str): bs cli command. It is split on whitespace, so
            individual arguments must not contain spaces.

    Returns:
        df (pd.Dataframe): Pandas dataframe of the parsed output. Column
            names are the whitespace-separated tokens of the header line and
            each cell is one whitespace-separated token of a data row.

    Raises:
        RuntimeError: If the command printed too few lines to hold a table
            (for example because the command failed).
    """
    completed = subprocess.run(bash_command.split(), stdout=subprocess.PIPE)
    if completed.returncode != 0:
        # Not fatal on its own: the output is still parsed when it is usable.
        logging.warning("Command '%s' exited with return code %s",
                        bash_command, completed.returncode)

    lines = completed.stdout.decode('utf-8').split('\n')

    # Smallest parseable output: border, header, border, closing border and
    # the empty string left behind by the final newline.
    if len(lines) < 5:
        raise RuntimeError(
            f"Command '{bash_command}' did not print a table "
            f"(return code {completed.returncode})"
        )

    # lines[0] and lines[2] are borders; the last two entries are the closing
    # border and the empty string after the final newline.
    header_line = lines[1]
    row_lines = lines[3:-2]

    columns = header_line.replace('|', '').split()
    # Rows are cut to the header width so extra tokens cannot misalign the
    # frame (with a four-column header this keeps the first four tokens).
    data = [row.replace('|', '').split()[:len(columns)] for row in row_lines]

    df = pd.DataFrame(columns=columns, data=data)
    logging.debug("bs stdout %s", df)
    return df
def download_from_bs(sample_id, out):
    """
    Downloads one basespace dataset into a directory using the bs cli.

    Args:
        sample_id (str): basespace dataset id for the fastq files
        out (str): directory reads are downloaded to
    """
    # Build the argument list directly instead of splitting a string, so an
    # output directory containing spaces stays a single argument.
    command = [
        "bs", "download", "dataset",
        "--config=bioinfo",
        f"--id={sample_id}",
        f"--output={out}/",
        "--retry",
    ]
    # stdout is captured (and discarded) rather than echoed to the console.
    completed = subprocess.run(command, stdout=subprocess.PIPE)

    if completed.returncode != 0:
        # Report the failure instead of claiming success; other downloads
        # running in parallel are left to continue.
        logging.error(
            f"Download of {sample_id} to {out} failed with return code {completed.returncode}"
        )
        return

    logging.info(f"Fastq files for {sample_id} downloaded to {out}")
def download_reads(run, out):
    """
    Downloads the fastq files of every sample in a run into a directory.

    The run name is resolved to a run id, the run id to its sample ids, and
    each sample is then downloaded in its own thread-pool task.

    Args:
        run (str): run name
        out (str): destination directory for fastq files
    """
    run_id = get_run_id(run)
    sample_ids = get_sample_ids(run_id)

    # one task per sample so the downloads overlap
    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(download_from_bs, sample_id, out)
            for sample_id in sample_ids
        ]

    # result() re-raises any exception raised inside a download task
    for job in pending:
        logging.debug(job.result())
def get_run_id(run_name):
    """
    Looks up the basespace id of a run from its experiment name.

    Args:
        run_name (str): run name, compared against the ExperimentName column
            of the `bs list runs` output

    Returns:
        idd (str): The basespace id for that run. If several listed runs
            share the name, the first one is used.

    Raises:
        IndexError: If no listed run has that experiment name.
    """
    bash_command = 'bs list --config=bioinfo runs'
    df = bs_out(bash_command)

    matches = df.loc[df['ExperimentName'] == run_name, 'Id']
    if matches.empty:
        # Same exception type as the bare .iloc[0] would raise, but with a
        # message that names the run that could not be found.
        raise IndexError(
            f"No basespace run with ExperimentName '{run_name}' was found"
        )

    idd = matches.iloc[0]
    logging.info(f"Run id is {idd}")
    return idd
def get_sample_ids(idd):
    """
    Lists the basespace dataset ids that belong to a sequencing run.

    Rows named 'Undetermined' and rows whose Id contains 'None' (or has no
    Id at all) are left out.

    Args:
        idd (str): id for sequencing run

    Returns:
        idds (list): basespace ids for each sample
    """
    listing = bs_out(f"bs list dataset --config=bioinfo --input-run={idd} ")

    # drop the undetermined reads first, then anything without a usable id
    named = listing.loc[listing['Name'] != 'Undetermined']
    usable = named.loc[named['Id'].str.contains('None').eq(False)]

    idds = usable['Id'].tolist()
    logging.info(f"Reads ids are {idds}")
    return idds
def main():
    """
    Command-line entry point.

    Sets logging to INFO, reads the run name (-r/--run, required) and the
    output directory (-o/--out, default 'reads') from the command line, and
    downloads the fastq files of that run from basespace.
    """
    logging.basicConfig(level=logging.INFO)

    cli = argparse.ArgumentParser(description='Downloads reads from basespace')
    cli.add_argument(
        '-r', '--run',
        type=str,
        required=True,
        help='Run name',
    )
    cli.add_argument(
        '-o', '--out',
        type=str,
        required=False,
        default='reads',
        help='Directory where fastq files are downloaded',
    )

    options = cli.parse_args()
    download_reads(options.run, options.out)


if __name__ == '__main__':
    main()