ucsd-cmi · adswafford · May 15, 2020
diff --git a/standalone/EBI_SRA_Downloader.py b/standalone/EBI_SRA_Downloader.py
@@ -1,4 +1,6 @@
 #!/usr/bin/env python3
+# conda command to install all dependencies:
+#   conda create -n ebi_sra_importer pandas requests entrez-direct sra-tools xmltodict lxml -c bioconda -c conda-forge -y
 #
 # pip command to install all dependencies:
 #   pip install csv glob requests subprocess xmltodict sys lxml os urllib
@@ -11,8 +13,8 @@
 #       -sample {name} flag allows the user to specify the sample file name
 #       -prep {name} flag allows the user to specify the prep file name
 #       -study {name} flag allows the user to specify the study info file name
-#       -all-seqs true flag allows the script to accept all sample types
-#       -all-platform true flag allows the script to accept samples from all platforms
+#       -all-seqs flag allows the script to accept all sample types
+#       -all-platforms flag allows the script to accept samples from all platforms
 #       -debug true flag to enter debug mode (not download fastq files)
 #
 # libraries used
@@ -31,7 +33,7 @@
 
 DEBUG = False
 ALL_SEQS = False
-ALL_PLATFORM = False
+ALL_PLATFORMS = False
 handler = logging.StreamHandler()
 fmt_str = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
 handler.setFormatter(logging.Formatter(fmt_str))
@@ -82,22 +84,22 @@ def ebi_create_details_file(study_accession, file_suffix="_detail"):
             continue
         row_list = row.decode("utf-8").split('\t')
         if not ALL_SEQS and row_list[5].upper() != "METAGENOMIC":
-            logger.warning(row_list[5])
-            logger.warning("Library source is not Metagenomic for " +
+            logger.warning("Library source is " + row_list[5] +
+                            " not Metagenomic for " +
                            row_list[1] + ". Omitting " + row_list[1])
             continue
             # skip row
-        elif not ALL_PLATFORM and row_list[6].lower() != "illumina":
-            logger.warning(row_list[6])
-            logger.warning("Instrument platform is not Illumina for " +
+        elif not ALL_PLATFORMS and row_list[6].lower() != "illumina":
+            logger.warning("Instrument platform is " + row_list[6] + 
+                            " not Illumina for " +
                            row_list[1] + ". Omitting " + row_list[1])
             continue
             # skip row
         else:
             for i in range(len(row_list)):
                 if len(row_list[i]) == 0:
                     row_list[i] = "unspecified"
-            if ALL_PLATFORM:
+            if ALL_PLATFORMS:
                 row_string = '\t'.join(row_list)
             else:
                 row_string = '\t'.join(row_list[:6]) + "\tIllumina\t" + \
@@ -111,7 +113,7 @@ def ebi_create_details_file(study_accession, file_suffix="_detail"):
         if ALL_SEQS:
             raise Exception(study_accession + " has no sample or run that" +
                             " is from Illumina")
-        elif ALL_PLATFORM:
+        elif ALL_PLATFORMS:
             raise Exception(study_accession + " has no sample or run that" +
                             " is METAGENOMIC")
         else:
@@ -194,17 +196,20 @@ def sra_create_details_file(study_accession, file_suffix="_detail"):
             if not ALL_SEQS:
                 if line[indices['library_source']].upper() != "METAGENOMIC":
                     logger.warning(line[indices['library_source']])
-                    logger.warning("Library source is not Metagenomic for " +
+                    logger.warning("Library source is " +
+                                    line[indices['library_source']] +
+                                    " not Metagenomic for " +
                                    line[indices['run_accession']] +
                                    ". Omitting " +
                                    line[indices['run_accession']])
                     continue
-            elif not ALL_PLATFORM:
+            elif not ALL_PLATFORMS:
                 if line[indices['instrument_platform']].lower() != "illumina":
-                    logger.warning(line[indices['instrument_platform']])
-                    logger.warning("Instrument platform is not Illumina for for "
-                                   + line[indices['run_accession']] + ". Omitting "
-                                   + line[indices['run_accession']])
+                    logger.warning("Instrument platform is " + 
+                                    line[indices['instrument_platform']] +
+                                    " not Illumina for " +
+                                    line[indices['run_accession']] + ". Omitting "
+                                    + line[indices['run_accession']])
                     continue
 
             for key in indices:
@@ -218,7 +223,7 @@ def sra_create_details_file(study_accession, file_suffix="_detail"):
         if ALL_SEQS:
             raise Exception(study_accession + " has no sample or run that" +
                             " is from Illumina")
-        elif ALL_PLATFORM:
+        elif ALL_PLATFORMS:
             raise Exception(study_accession + " has no sample or run that" +
                             " is METAGENOMIC")
         else:
@@ -455,23 +460,78 @@ def xml_to_dict(xml_fp):
         root = etree.parse(xml_fp).getroot()
         sample = root.getchildren()[0]
         metadata = {}
+
         attributes = sample.find('SAMPLE_ATTRIBUTES')
         for node in attributes.iterfind('SAMPLE_ATTRIBUTE'):
             tag = node.getchildren()[0]
             value = node.getchildren()[1]
             if value.text is None:
-                metadata[tag.text.strip('" ').upper()] = 'Not provided'
+                metadata[tag.text.strip('" ').upper()] = 'not provided'
             else:
                 metadata[tag.text.strip('" ').upper()] \
                     = value.text.strip('" ')
+
+        # also collect the sample title, description, names and identifiers
+        title= sample.find('TITLE')
+        if title.text is None:
+            metadata['title'] = 'not provided'
+        else:
+            metadata['title'] = title.text.strip('" ')
+        description = sample.find('DESCRIPTION')
+        try:
+            if description.text is None:
+                metadata['description'] = 'not provided'
+            else:
+                if sep in description.text:
+                    split_desc=description.text.strip('" ').split(sep)
+                    counter=0
+                    for i in split_desc:
+                        metadata['description_field_' + str(counter)] = i
+                        counter += 1
+                else:
+                    metadata['description'] = description.text.strip('" ')
+        except:
+            metadata['description'] = 'not provided'
+
+        nameInfo = sample.find('SAMPLE_NAME')
+        for node in nameInfo:
+            tag = node.tag
+            value = node.text
+            if value is None:
+                metadata[tag.text.strip('" ').upper()] = 'not provided'
+            else:
+                metadata[tag.strip('" ').upper()] = value.strip('" ')
+
+        idInfo = sample.find('IDENTIFIERS')
+        for node in idInfo:
+            value = node.text
+            d = node.attrib
+            if len(d) > 0:
+                for k in d.keys():
+                    tag = node.tag + "_" + d[k]
+                    if value is None:
+                        metadata[tag.text.strip('" ').upper()] = 'not provided'
+                    else:
+                        metadata[tag.strip('" ').upper()] = value.strip('" ')
+            else:
+                tag = node.tag
+
+                if value is None:
+                    metadata[tag.text.strip('" ').upper()] = 'not provided'
+                else:
+                    metadata[tag.strip('" ').upper()] = value.strip('" ')
+
         return metadata
 
     logger.info("downloading sample.txt file for each sample")
     details_df = read_csv(study_details, sep='\t', header=None)
     for row in details_df.iterrows():
         library_name = row[1][0]
         current_path = "./" + study_accession + "/" + library_name
+
         sample_accession = row[1][1]
+        if sample_accession == 'unspecified': # and not DEBUG:
+            raise Exception(sample_accession + " does not contain metadata")
         if path.exists(current_path + "/" + sample_accession + ".txt"):
             continue
 
@@ -563,10 +623,61 @@ def xml_to_dict(xml_fp):
             tag = node.getchildren()[0]
             value = node.getchildren()[1]
             if value.text is None:
-                metadata[tag.text.strip('" ').upper()] = 'Not provided'
+                metadata[tag.text.strip('" ').upper()] = 'not provided'
             else:
                 metadata[tag.text.strip('" ').upper()] \
                     = value.text.strip('" ')
+
+        # also collect the sample title, description, names and identifiers
+        title= sample.find('TITLE')
+        if title.text is None:
+            metadata['title'] = 'not provided'
+        else:
+            metadata['title'] = title.text.strip('" ')
+        description = sample.find('DESCRIPTION')
+        try:
+            if description.text is None:
+                metadata['description'] = 'not provided'
+            else:
+                if sep in description.text:
+                    split_desc=description.text.strip('" ').split(sep)
+                    counter=0
+                    for i in split_desc:
+                        metadata['description_field_' + str(counter)] = i
+                        counter += 1
+                else:
+                    metadata['description'] = description.text.strip('" ')
+        except:
+            metadata['description'] = 'not provided'
+
+        nameInfo = sample.find('SAMPLE_NAME')
+        for node in nameInfo:
+            tag = node.tag
+            value = node.text
+            if value is None:
+                metadata[tag.text.strip('" ').upper()] = 'not provided'
+            else:
+                metadata[tag.strip('" ').upper()] = value.strip('" ')
+
+        idInfo = sample.find('IDENTIFIERS')
+        for node in idInfo:
+            value = node.text
+            d = node.attrib
+            if len(d) > 0:
+                for k in d.keys():
+                    tag = node.tag + "_" + d[k]
+                    if value is None:
+                        metadata[tag.text.strip('" ').upper()] = 'not provided'
+                    else:
+                        metadata[tag.strip('" ').upper()] = value.strip('" ')
+            else:
+                tag = node.tag
+
+                if value is None:
+                    metadata[tag.text.strip('" ').upper()] = 'not provided'
+                else:
+                    metadata[tag.strip('" ').upper()] = value.strip('" ')
+
         return metadata
 
     logger.info("downloading sample.txt file for each sample")
@@ -751,12 +862,13 @@ def sra_fetch_data_file(study_details):
                         "transcriptional),instrument_platform")
     parser.add_argument("-study", "--study_fileName", help="Study_file" +
                         " that contains study information")
-    parser.add_argument("-debug", "--debug", help="Debug mode: don't " +
+    parser.add_argument("-debug", "--debug", action='store_true', help="Debug mode: don't " +
                         "download fastq files")
-    parser.add_argument("-all-seqs", "--all_seqs", help="Accept " +
+    parser.add_argument("-all-seqs", "--all_seqs", action='store_true', help="Accept " +
                         "all type of sequence samples")
-    parser.add_argument("-all-platform", "--all_platform", help="Accept " +
+    parser.add_argument("-all-platforms", "--all_platforms", action='store_true', help="Accept " +
                         "all platform samples")
+    parser.add_argument("-sep","--sep",help="separator for description, default is ';' ")
     args = parser.parse_args()
 
     if args.ebiaccession is None and args.sraaccession is None:
@@ -770,18 +882,19 @@ def sra_fetch_data_file(study_details):
                     files for the entered SRA accession, and download the
                     FASTQ files.
                 Optional flags:
-                    -sample [sample_file_name]
-                    -prep [prep_file_name]
-                    -study [study_info_file_name]
-                    -debug true
-                    -all-seqs true
-                    -all-platform true
+                    -sample_info [sample_info_file_name]
+                    -prep_info [prep_info_file_name]
+                    -study_info [study_info_file_name]
+                    -debug
+                    -all-seqs
+                    -all-platforms
+                    -sep
                """)
         sys.exit(2)
 
-    DEBUG = True if args.debug == "true" else False
-    ALL_SEQS = True if args.all_seqs == "true" else False
-    ALL_PLATFORM = True if args.all_platform == "true" else False
+    DEBUG = args.debug
+    ALL_SEQS = args.all_seqs
+    ALL_PLATFORMS = args.all_platforms
 
     if args.ebiaccession is not None:
         # Output file names
@@ -791,6 +904,9 @@ def sra_fetch_data_file(study_details):
             if args.prep_fileName is None else args.prep_fileName
         study_file_name = args.ebiaccession + "_study_info.txt" \
             if args.study_fileName is None else args.study_fileName
+
+        # separator used to split sample descriptions into fields (default ';')
+        sep= ';' if args.sep is None else args.sep
         # Call create_details_file to generate .details.txt
         study_details = ebi_create_details_file(args.ebiaccession)