Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update 1 of EBI downloader #2

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 147 additions & 31 deletions standalone/EBI_SRA_Downloader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/usr/bin/env python3
# conda command to install all dependencies:
# conda create -n ebi_sra_importer pandas requests entrez-direct sra-tools xmltodict lxml -c bioconda -c conda-forge -y
#
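# pip command to install the Python dependencies (csv, glob, subprocess, sys,
# os and urllib ship with the standard library and need no installation;
# entrez-direct and sra-tools are command-line tools, see the conda command above):
# pip install pandas requests xmltodict lxml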
Expand All @@ -11,8 +13,8 @@
# -sample {name} flag allows the user to specify the sample file name
# -prep {name} flag allows the user to specify the prep file name
# -study {name} flag allows the user to specify the study info file name
# -all-seqs true flag allows the script to accept all sample types
# -all-platform true flag allows the script to accept samples from all platforms
# -all-seqs allows the script to accept all sample types
# -all-platforms allows the script to accept samples from all platforms
# -debug enters debug mode (fastq files are not downloaded)
#
# libraries used
Expand All @@ -31,7 +33,7 @@

DEBUG = False
ALL_SEQS = False
ALL_PLATFORM = False
ALL_PLATFORMS = False
handler = logging.StreamHandler()
fmt_str = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
handler.setFormatter(logging.Formatter(fmt_str))
Expand Down Expand Up @@ -82,22 +84,22 @@ def ebi_create_details_file(study_accession, file_suffix="_detail"):
continue
row_list = row.decode("utf-8").split('\t')
if not ALL_SEQS and row_list[5].upper() != "METAGENOMIC":
logger.warning(row_list[5])
logger.warning("Library source is not Metagenomic for " +
logger.warning("Library source is " + row_list[5] +
" not Metagenomic for " +
row_list[1] + ". Omitting " + row_list[1])
continue
# skip row
elif not ALL_PLATFORM and row_list[6].lower() != "illumina":
logger.warning(row_list[6])
logger.warning("Instrument platform is not Illumina for " +
elif not ALL_PLATFORMS and row_list[6].lower() != "illumina":
logger.warning("Instrument platform is " + row_list[6] +
" not Illumina for " +
row_list[1] + ". Omitting " + row_list[1])
continue
# skip row
else:
for i in range(len(row_list)):
if len(row_list[i]) == 0:
row_list[i] = "unspecified"
if ALL_PLATFORM:
if ALL_PLATFORMS:
row_string = '\t'.join(row_list)
else:
row_string = '\t'.join(row_list[:6]) + "\tIllumina\t" + \
Expand All @@ -111,7 +113,7 @@ def ebi_create_details_file(study_accession, file_suffix="_detail"):
if ALL_SEQS:
raise Exception(study_accession + " has no sample or run that" +
" is from Illumina")
elif ALL_PLATFORM:
elif ALL_PLATFORMS:
raise Exception(study_accession + " has no sample or run that" +
" is METAGENOMIC")
else:
Expand Down Expand Up @@ -194,17 +196,20 @@ def sra_create_details_file(study_accession, file_suffix="_detail"):
if not ALL_SEQS:
if line[indices['library_source']].upper() != "METAGENOMIC":
logger.warning(line[indices['library_source']])
logger.warning("Library source is not Metagenomic for " +
logger.warning("Library source is " +
line[indices['library_source']] +
" not Metagenomic for " +
line[indices['run_accession']] +
". Omitting " +
line[indices['run_accession']])
continue
elif not ALL_PLATFORM:
elif not ALL_PLATFORMS:
if line[indices['instrument_platform']].lower() != "illumina":
logger.warning(line[indices['instrument_platform']])
logger.warning("Instrument platform is not Illumina for for "
+ line[indices['run_accession']] + ". Omitting "
+ line[indices['run_accession']])
logger.warning("Instrument platform is " +
line[indices['instrument_platform']] +
" not Illumina for " +
line[indices['run_accession']] + ". Omitting "
+ line[indices['run_accession']])
continue

for key in indices:
Expand All @@ -218,7 +223,7 @@ def sra_create_details_file(study_accession, file_suffix="_detail"):
if ALL_SEQS:
raise Exception(study_accession + " has no sample or run that" +
" is from Illumina")
elif ALL_PLATFORM:
elif ALL_PLATFORMS:
raise Exception(study_accession + " has no sample or run that" +
" is METAGENOMIC")
else:
Expand Down Expand Up @@ -455,23 +460,78 @@ def xml_to_dict(xml_fp):
root = etree.parse(xml_fp).getroot()
sample = root.getchildren()[0]
metadata = {}

attributes = sample.find('SAMPLE_ATTRIBUTES')
for node in attributes.iterfind('SAMPLE_ATTRIBUTE'):
tag = node.getchildren()[0]
value = node.getchildren()[1]
if value.text is None:
metadata[tag.text.strip('" ').upper()] = 'Not provided'
metadata[tag.text.strip('" ').upper()] = 'not provided'
else:
metadata[tag.text.strip('" ').upper()] \
= value.text.strip('" ')

# Collect additional metadata: title, description, sample name and identifiers
title= sample.find('TITLE')
if title.text is None:
metadata['title'] = 'not provided'
else:
metadata['title'] = title.text.strip('" ')
description = sample.find('DESCRIPTION')
try:
if description.text is None:
metadata['description'] = 'not provided'
else:
if sep in description.text:
split_desc=description.text.strip('" ').split(sep)
counter=0
for i in split_desc:
metadata['description_field_' + str(counter)] = i
counter += 1
else:
metadata['description'] = description.text.strip('" ')
except:
metadata['description'] = 'not provided'

nameInfo = sample.find('SAMPLE_NAME')
for node in nameInfo:
tag = node.tag
value = node.text
if value is None:
metadata[tag.text.strip('" ').upper()] = 'not provided'
else:
metadata[tag.strip('" ').upper()] = value.strip('" ')

idInfo = sample.find('IDENTIFIERS')
for node in idInfo:
value = node.text
d = node.attrib
if len(d) > 0:
for k in d.keys():
tag = node.tag + "_" + d[k]
if value is None:
metadata[tag.text.strip('" ').upper()] = 'not provided'
else:
metadata[tag.strip('" ').upper()] = value.strip('" ')
else:
tag = node.tag

if value is None:
metadata[tag.text.strip('" ').upper()] = 'not provided'
else:
metadata[tag.strip('" ').upper()] = value.strip('" ')

return metadata

logger.info("downloading sample.txt file for each sample")
details_df = read_csv(study_details, sep='\t', header=None)
for row in details_df.iterrows():
library_name = row[1][0]
current_path = "./" + study_accession + "/" + library_name

sample_accession = row[1][1]
if sample_accession == 'unspecified': # and not DEBUG:
raise Exception(sample_accession + " does not contain metadata")
if path.exists(current_path + "/" + sample_accession + ".txt"):
continue

Expand Down Expand Up @@ -563,10 +623,61 @@ def xml_to_dict(xml_fp):
tag = node.getchildren()[0]
value = node.getchildren()[1]
if value.text is None:
metadata[tag.text.strip('" ').upper()] = 'Not provided'
metadata[tag.text.strip('" ').upper()] = 'not provided'
else:
metadata[tag.text.strip('" ').upper()] \
= value.text.strip('" ')

# Collect additional metadata: title, description, sample name and identifiers
title= sample.find('TITLE')
if title.text is None:
metadata['title'] = 'not provided'
else:
metadata['title'] = title.text.strip('" ')
description = sample.find('DESCRIPTION')
try:
if description.text is None:
metadata['description'] = 'not provided'
else:
if sep in description.text:
split_desc=description.text.strip('" ').split(sep)
counter=0
for i in split_desc:
metadata['description_field_' + str(counter)] = i
counter += 1
else:
metadata['description'] = description.text.strip('" ')
except:
metadata['description'] = 'not provided'

nameInfo = sample.find('SAMPLE_NAME')
for node in nameInfo:
tag = node.tag
value = node.text
if value is None:
metadata[tag.text.strip('" ').upper()] = 'not provided'
else:
metadata[tag.strip('" ').upper()] = value.strip('" ')

idInfo = sample.find('IDENTIFIERS')
for node in idInfo:
value = node.text
d = node.attrib
if len(d) > 0:
for k in d.keys():
tag = node.tag + "_" + d[k]
if value is None:
metadata[tag.text.strip('" ').upper()] = 'not provided'
else:
metadata[tag.strip('" ').upper()] = value.strip('" ')
else:
tag = node.tag

if value is None:
metadata[tag.text.strip('" ').upper()] = 'not provided'
else:
metadata[tag.strip('" ').upper()] = value.strip('" ')

return metadata

logger.info("downloading sample.txt file for each sample")
Expand Down Expand Up @@ -751,12 +862,13 @@ def sra_fetch_data_file(study_details):
"transcriptional),instrument_platform")
parser.add_argument("-study", "--study_fileName", help="Study_file" +
" that contains study information")
parser.add_argument("-debug", "--debug", help="Debug mode: don't " +
parser.add_argument("-debug", "--debug", action='store_true', help="Debug mode: don't " +
"download fastq files")
parser.add_argument("-all-seqs", "--all_seqs", help="Accept " +
parser.add_argument("-all-seqs", "--all_seqs", action='store_true', help="Accept " +
"all type of sequence samples")
parser.add_argument("-all-platform", "--all_platform", help="Accept " +
parser.add_argument("-all-platforms", "--all_platforms", action='store_true', help="Accept " +
"all platform samples")
parser.add_argument("-sep","--sep",help="separator for description, default is ';' ")
args = parser.parse_args()

if args.ebiaccession is None and args.sraaccession is None:
Expand All @@ -770,18 +882,19 @@ def sra_fetch_data_file(study_details):
files for the entered SRA accession, and download the
FASTQ files.
Optional flags:
-sample [sample_file_name]
-prep [prep_file_name]
-study [study_info_file_name]
-debug true
-all-seqs true
-all-platform true
-sample_info [sample_info_file_name]
-prep_info [prep_info_file_name]
-study_info [study_info_file_name]
-debug
-all-seqs
-all-platforms
-sep
""")
sys.exit(2)

DEBUG = True if args.debug == "true" else False
ALL_SEQS = True if args.all_seqs == "true" else False
ALL_PLATFORM = True if args.all_platform == "true" else False
DEBUG = args.debug
ALL_SEQS = args.all_seqs
ALL_PLATFORMS = args.all_platforms

if args.ebiaccession is not None:
# Output file names
Expand All @@ -791,6 +904,9 @@ def sra_fetch_data_file(study_details):
if args.prep_fileName is None else args.prep_fileName
study_file_name = args.ebiaccession + "_study_info.txt" \
if args.study_fileName is None else args.study_fileName

# Separator used to split sample descriptions into fields (defaults to ';')
sep= ';' if args.sep is None else args.sep
# Call create_details_file to generate .details.txt
study_details = ebi_create_details_file(args.ebiaccession)

Expand Down