From 8e7e37e952a0d31b2e3a7bb8673c95d76c96e6f5 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Mon, 4 Jan 2021 13:15:52 -0500 Subject: [PATCH 1/2] Optimized binary datatype sniffing. --- lib/galaxy/datatypes/binary.py | 95 +++++++++++----------------------- lib/galaxy/datatypes/sniff.py | 16 ++++++ 2 files changed, 46 insertions(+), 65 deletions(-) diff --git a/lib/galaxy/datatypes/binary.py b/lib/galaxy/datatypes/binary.py index a978d8e25d48..d4374f2ee36a 100644 --- a/lib/galaxy/datatypes/binary.py +++ b/lib/galaxy/datatypes/binary.py @@ -19,7 +19,7 @@ import h5py import pysam import pysam.bcftools -from bx.seq.twobit import TWOBIT_MAGIC_NUMBER, TWOBIT_MAGIC_NUMBER_SWAP, TWOBIT_MAGIC_SIZE +from bx.seq.twobit import TWOBIT_MAGIC_NUMBER, TWOBIT_MAGIC_NUMBER_SWAP from galaxy import util from galaxy.datatypes import metadata @@ -28,6 +28,7 @@ get_file_peek, ) from galaxy.datatypes.metadata import DictParameter, ListParameter, MetadataElement, MetadataParameter +from galaxy.datatypes.sniff import build_sniff_from_prefix from galaxy.util import nice_size, sqlite from galaxy.util.checkers import is_bz2, is_gzip from . import data, dataproviders @@ -1137,6 +1138,7 @@ def display_peek(self, dataset): return "Binary Anndata file (%s)" % (nice_size(dataset.get_size())) +@build_sniff_from_prefix class GmxBinary(Binary): """ Base class for GROMACS binary files - xtc, trr, cpt @@ -1145,15 +1147,9 @@ class GmxBinary(Binary): magic_number: Optional[int] = None # variables to be overwritten in the child class file_ext = "" - def sniff(self, filename): + def sniff_prefix(self, sniff_prefix): # The first 4 bytes of any GROMACS binary file containing the magic number - try: - header = open(filename, 'rb').read(struct.calcsize('>1i')) - if struct.unpack('>1i', header)[0] == self.magic_number: - return True - return False - except Exception: - return False + return sniff_prefix.magic_header('>1i') == self.magic_number def set_peek(self, dataset, is_multi_byte=False): if not dataset.dataset.purged: @@ -1450,22 +1446,17 @@ def display_peek(self, dataset): return "Binary scf sequence file (%s)" % (nice_size(dataset.get_size())) +@build_sniff_from_prefix class Sff(Binary): """ Standard Flowgram Format (SFF) """ edam_format = "format_3284" edam_data = "data_0924" file_ext = "sff" - def sniff(self, filename): + def sniff_prefix(self, sniff_prefix): # The first 4 bytes of any sff file is '.sff', and the file is binary. For details # about the format, see http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format - try: - header = open(filename, 'rb').read(4) - if header == b'.sff': - return True - return False - except Exception: - return False + return sniff_prefix.startswith_bytes(b'.sff') def set_peek(self, dataset, is_multi_byte=False): if not dataset.dataset.purged: @@ -1482,6 +1473,7 @@ def display_peek(self, dataset): return "Binary sff file (%s)" % (nice_size(dataset.get_size())) +@build_sniff_from_prefix class BigWig(Binary): """ Accessing binary BigWig files from UCSC. @@ -1499,15 +1491,8 @@ def __init__(self, **kwd): self._magic = 0x888FFC26 self._name = "BigWig" - def _unpack(self, pattern, handle): - return struct.unpack(pattern, handle.read(struct.calcsize(pattern))) - - def sniff(self, filename): - try: - magic = self._unpack("I", open(filename, 'rb')) - return magic[0] == self._magic - except Exception: - return False + def sniff_prefix(self, sniff_prefix): + return sniff_prefix.magic_header("I") == self._magic def set_peek(self, dataset, is_multi_byte=False): if not dataset.dataset.purged: @@ -1537,23 +1522,16 @@ def __init__(self, **kwd): self._name = "BigBed" +@build_sniff_from_prefix class TwoBit(Binary): """Class describing a TwoBit format nucleotide file""" edam_format = "format_3009" edam_data = "data_0848" file_ext = "twobit" - def sniff(self, filename): - try: - # All twobit files start with a 16-byte header. If the file is smaller than 16 bytes, it's obviously not a valid twobit file. - if os.path.getsize(filename) < 16: - return False - header = open(filename, 'rb').read(TWOBIT_MAGIC_SIZE) - magic = struct.unpack(">L", header)[0] - if magic == TWOBIT_MAGIC_NUMBER or magic == TWOBIT_MAGIC_NUMBER_SWAP: - return True - except OSError: - return False + def sniff_prefix(self, sniff_prefix): + magic = sniff_prefix.magic_header(">L") + return magic == TWOBIT_MAGIC_NUMBER or magic == TWOBIT_MAGIC_NUMBER_SWAP def set_peek(self, dataset, is_multi_byte=False): if not dataset.dataset.purged: @@ -2167,22 +2145,16 @@ def display_peek(self, dataset): return "Microsoft Excel XLS file (%s)" % (data.nice_size(dataset.get_size())) +@build_sniff_from_prefix class Sra(Binary): """ Sequence Read Archive (SRA) datatype originally from mdshw5/sra-tools-galaxy""" file_ext = 'sra' - def sniff(self, filename): + def sniff_prefix(self, sniff_prefix): """ The first 8 bytes of any NCBI sra file is 'NCBI.sra', and the file is binary. For details about the format, see http://www.ncbi.nlm.nih.gov/books/n/helpsra/SRA_Overview_BK/#SRA_Overview_BK.4_SRA_Data_Structure """ - try: - header = open(filename, 'rb').read(8) - if header == b'NCBI.sra': - return True - else: - return False - except Exception: - return False + return sniff_prefix.startswith_bytes(b'NCBI.sra') def set_peek(self, dataset, is_multi_byte=False): if not dataset.dataset.purged: @@ -2579,6 +2551,7 @@ def display_peek(self, dataset): return "SearchGUI Archive, version %s" % (dataset.metadata.searchgui_version or 'unknown') +@build_sniff_from_prefix class NetCDF(Binary): """Binary data in netCDF format""" file_ext = "netcdf" @@ -2599,15 +2572,8 @@ def display_peek(self, dataset): except Exception: return "Binary netCDF file (%s)" % (nice_size(dataset.get_size())) - def sniff(self, filename): - try: - with open(filename, 'rb') as f: - header = f.read(3) - if header == b'CDF': - return True - return False - except Exception: - return False + def sniff_prefix(self, sniff_prefix): + return sniff_prefix.startswith_bytes(b'CDF') class Dcd(Binary): @@ -2713,6 +2679,7 @@ def display_peek(self, dataset): return "Binary CHARMM velocity file (%s)" % (nice_size(dataset.get_size())) +@build_sniff_from_prefix class DAA(Binary): """ Class describing an DAA (diamond alignment archive) file @@ -2730,12 +2697,12 @@ def __init__(self, **kwd): super().__init__(**kwd) self._magic = binascii.unhexlify("6be33e6d47530e3c") - def sniff(self, filename): + def sniff_prefix(self, sniff_prefix): # The first 8 bytes of any daa file are 0x3c0e53476d3ee36b - with open(filename, 'rb') as f: - return f.read(8) == self._magic + return sniff_prefix.startswith_bytes(self._magic) +@build_sniff_from_prefix class RMA6(Binary): """ Class describing an RMA6 (MEGAN6 read-match archive) file @@ -2753,12 +2720,11 @@ def __init__(self, **kwd): super().__init__(**kwd) self._magic = binascii.unhexlify("000003f600000006") - def sniff(self, filename): - # The first 8 bytes of any daa file are 0x3c0e53476d3ee36b - with open(filename, 'rb') as f: - return f.read(8) == self._magic + def sniff_prefix(self, sniff_prefix): + return sniff_prefix.startswith_bytes(self._magic) +@build_sniff_from_prefix class DMND(Binary): """ Class describing an DMND file @@ -2776,10 +2742,9 @@ def __init__(self, **kwd): super().__init__(**kwd) self._magic = binascii.unhexlify("6d18ee15a4f84a02") - def sniff(self, filename): + def sniff_prefix(self, sniff_prefix): # The first 8 bytes of any dmnd file are 0x24af8a415ee186d - with open(filename, 'rb') as f: - return f.read(8) == self._magic + return sniff_prefix.startswith_bytes(self._magic) class ICM(Binary): diff --git a/lib/galaxy/datatypes/sniff.py b/lib/galaxy/datatypes/sniff.py index 0eb3c2b8f675..5f0d88045e1b 100644 --- a/lib/galaxy/datatypes/sniff.py +++ b/lib/galaxy/datatypes/sniff.py @@ -10,6 +10,7 @@ import os import re import shutil +import struct import sys import tempfile import urllib.request @@ -597,6 +598,21 @@ def search(self, pattern): def search_str(self, query_str): return query_str in self.contents_header + def unpack_header(self, pattern): + size = struct.calcsize(pattern) + header_bytes = self.contents_header_bytes[0:size] + if len(header_bytes) < size: + return None + return struct.unpack(pattern, header_bytes) + + def magic_header(self, pattern): + # unpack header and get first element + unpacked = self.unpack_header(pattern) + return None if not unpacked else unpacked[0] + + def startswith_bytes(self, test_bytes): + return self.contents_header_bytes[0:len(test_bytes)] == test_bytes + def build_sniff_from_prefix(klass): # Build and attach a sniff function to this class (klass) from the sniff_prefix function From 06be029f68611b76609c17bc597d1039608d3362 Mon Sep 17 00:00:00 2001 From: John Chilton Date: Tue, 5 Jan 2021 11:36:08 -0500 Subject: [PATCH 2/2] Apply suggestions from code review Co-authored-by: Nicola Soranzo --- lib/galaxy/datatypes/sniff.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/lib/galaxy/datatypes/sniff.py b/lib/galaxy/datatypes/sniff.py index 5f0d88045e1b..00ce45133067 100644 --- a/lib/galaxy/datatypes/sniff.py +++ b/lib/galaxy/datatypes/sniff.py @@ -598,20 +598,18 @@ def search(self, pattern): def search_str(self, query_str): return query_str in self.contents_header - def unpack_header(self, pattern): + def magic_header(self, pattern): + """ + Unpack header and get first element + """ size = struct.calcsize(pattern) - header_bytes = self.contents_header_bytes[0:size] + header_bytes = self.contents_header_bytes[:size] if len(header_bytes) < size: return None - return struct.unpack(pattern, header_bytes) - - def magic_header(self, pattern): - # unpack header and get first element - unpacked = self.unpack_header(pattern) - return None if not unpacked else unpacked[0] + return struct.unpack(pattern, header_bytes)[0] def startswith_bytes(self, test_bytes): - return self.contents_header_bytes[0:len(test_bytes)] == test_bytes + return self.contents_header_bytes.startswith(test_bytes) def build_sniff_from_prefix(klass):