Skip to content

Commit

Permalink
Merge pull request #11049 from jmchilton/binary_sniffing_prefixes
Browse files Browse the repository at this point in the history
  • Loading branch information
nsoranzo authored Jan 5, 2021
2 parents c3072d1 + 06be029 commit c9ffab3
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 65 deletions.
95 changes: 30 additions & 65 deletions lib/galaxy/datatypes/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import h5py
import pysam
import pysam.bcftools
from bx.seq.twobit import TWOBIT_MAGIC_NUMBER, TWOBIT_MAGIC_NUMBER_SWAP, TWOBIT_MAGIC_SIZE
from bx.seq.twobit import TWOBIT_MAGIC_NUMBER, TWOBIT_MAGIC_NUMBER_SWAP

from galaxy import util
from galaxy.datatypes import metadata
Expand All @@ -28,6 +28,7 @@
get_file_peek,
)
from galaxy.datatypes.metadata import DictParameter, ListParameter, MetadataElement, MetadataParameter
from galaxy.datatypes.sniff import build_sniff_from_prefix
from galaxy.util import nice_size, sqlite
from galaxy.util.checkers import is_bz2, is_gzip
from . import data, dataproviders
Expand Down Expand Up @@ -1137,6 +1138,7 @@ def display_peek(self, dataset):
return "Binary Anndata file (%s)" % (nice_size(dataset.get_size()))


@build_sniff_from_prefix
class GmxBinary(Binary):
"""
Base class for GROMACS binary files - xtc, trr, cpt
Expand All @@ -1145,15 +1147,9 @@ class GmxBinary(Binary):
magic_number: Optional[int] = None # variables to be overwritten in the child class
file_ext = ""

def sniff(self, filename):
def sniff_prefix(self, sniff_prefix):
# The first 4 bytes of any GROMACS binary file containing the magic number
try:
header = open(filename, 'rb').read(struct.calcsize('>1i'))
if struct.unpack('>1i', header)[0] == self.magic_number:
return True
return False
except Exception:
return False
return sniff_prefix.magic_header('>1i') == self.magic_number

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
Expand Down Expand Up @@ -1450,22 +1446,17 @@ def display_peek(self, dataset):
return "Binary scf sequence file (%s)" % (nice_size(dataset.get_size()))


@build_sniff_from_prefix
class Sff(Binary):
""" Standard Flowgram Format (SFF) """
edam_format = "format_3284"
edam_data = "data_0924"
file_ext = "sff"

def sniff(self, filename):
def sniff_prefix(self, sniff_prefix):
# The first 4 bytes of any sff file is '.sff', and the file is binary. For details
# about the format, see http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format
try:
header = open(filename, 'rb').read(4)
if header == b'.sff':
return True
return False
except Exception:
return False
return sniff_prefix.startswith_bytes(b'.sff')

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
Expand All @@ -1482,6 +1473,7 @@ def display_peek(self, dataset):
return "Binary sff file (%s)" % (nice_size(dataset.get_size()))


@build_sniff_from_prefix
class BigWig(Binary):
"""
Accessing binary BigWig files from UCSC.
Expand All @@ -1499,15 +1491,8 @@ def __init__(self, **kwd):
self._magic = 0x888FFC26
self._name = "BigWig"

def _unpack(self, pattern, handle):
return struct.unpack(pattern, handle.read(struct.calcsize(pattern)))

def sniff(self, filename):
try:
magic = self._unpack("I", open(filename, 'rb'))
return magic[0] == self._magic
except Exception:
return False
def sniff_prefix(self, sniff_prefix):
return sniff_prefix.magic_header("I") == self._magic

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
Expand Down Expand Up @@ -1537,23 +1522,16 @@ def __init__(self, **kwd):
self._name = "BigBed"


@build_sniff_from_prefix
class TwoBit(Binary):
"""Class describing a TwoBit format nucleotide file"""
edam_format = "format_3009"
edam_data = "data_0848"
file_ext = "twobit"

def sniff(self, filename):
try:
# All twobit files start with a 16-byte header. If the file is smaller than 16 bytes, it's obviously not a valid twobit file.
if os.path.getsize(filename) < 16:
return False
header = open(filename, 'rb').read(TWOBIT_MAGIC_SIZE)
magic = struct.unpack(">L", header)[0]
if magic == TWOBIT_MAGIC_NUMBER or magic == TWOBIT_MAGIC_NUMBER_SWAP:
return True
except OSError:
return False
def sniff_prefix(self, sniff_prefix):
magic = sniff_prefix.magic_header(">L")
return magic == TWOBIT_MAGIC_NUMBER or magic == TWOBIT_MAGIC_NUMBER_SWAP

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
Expand Down Expand Up @@ -2167,22 +2145,16 @@ def display_peek(self, dataset):
return "Microsoft Excel XLS file (%s)" % (data.nice_size(dataset.get_size()))


@build_sniff_from_prefix
class Sra(Binary):
""" Sequence Read Archive (SRA) datatype originally from mdshw5/sra-tools-galaxy"""
file_ext = 'sra'

def sniff(self, filename):
def sniff_prefix(self, sniff_prefix):
""" The first 8 bytes of any NCBI sra file is 'NCBI.sra', and the file is binary.
For details about the format, see http://www.ncbi.nlm.nih.gov/books/n/helpsra/SRA_Overview_BK/#SRA_Overview_BK.4_SRA_Data_Structure
"""
try:
header = open(filename, 'rb').read(8)
if header == b'NCBI.sra':
return True
else:
return False
except Exception:
return False
return sniff_prefix.startswith_bytes(b'NCBI.sra')

def set_peek(self, dataset, is_multi_byte=False):
if not dataset.dataset.purged:
Expand Down Expand Up @@ -2579,6 +2551,7 @@ def display_peek(self, dataset):
return "SearchGUI Archive, version %s" % (dataset.metadata.searchgui_version or 'unknown')


@build_sniff_from_prefix
class NetCDF(Binary):
"""Binary data in netCDF format"""
file_ext = "netcdf"
Expand All @@ -2599,15 +2572,8 @@ def display_peek(self, dataset):
except Exception:
return "Binary netCDF file (%s)" % (nice_size(dataset.get_size()))

def sniff(self, filename):
try:
with open(filename, 'rb') as f:
header = f.read(3)
if header == b'CDF':
return True
return False
except Exception:
return False
def sniff_prefix(self, sniff_prefix):
return sniff_prefix.startswith_bytes(b'CDF')


class Dcd(Binary):
Expand Down Expand Up @@ -2713,6 +2679,7 @@ def display_peek(self, dataset):
return "Binary CHARMM velocity file (%s)" % (nice_size(dataset.get_size()))


@build_sniff_from_prefix
class DAA(Binary):
"""
Class describing a DAA (diamond alignment archive) file
Expand All @@ -2730,12 +2697,12 @@ def __init__(self, **kwd):
super().__init__(**kwd)
self._magic = binascii.unhexlify("6be33e6d47530e3c")

def sniff(self, filename):
def sniff_prefix(self, sniff_prefix):
# The first 8 bytes of any daa file are 0x3c0e53476d3ee36b
with open(filename, 'rb') as f:
return f.read(8) == self._magic
return sniff_prefix.startswith_bytes(self._magic)


@build_sniff_from_prefix
class RMA6(Binary):
"""
Class describing an RMA6 (MEGAN6 read-match archive) file
Expand All @@ -2753,12 +2720,11 @@ def __init__(self, **kwd):
super().__init__(**kwd)
self._magic = binascii.unhexlify("000003f600000006")

def sniff(self, filename):
# The first 8 bytes of any rma6 file are 0x000003f600000006
with open(filename, 'rb') as f:
return f.read(8) == self._magic
def sniff_prefix(self, sniff_prefix):
return sniff_prefix.startswith_bytes(self._magic)


@build_sniff_from_prefix
class DMND(Binary):
"""
Class describing a DMND file
Expand All @@ -2776,10 +2742,9 @@ def __init__(self, **kwd):
super().__init__(**kwd)
self._magic = binascii.unhexlify("6d18ee15a4f84a02")

def sniff(self, filename):
def sniff_prefix(self, sniff_prefix):
# The first 8 bytes of any dmnd file are 0x24af8a415ee186d
with open(filename, 'rb') as f:
return f.read(8) == self._magic
return sniff_prefix.startswith_bytes(self._magic)


class ICM(Binary):
Expand Down
14 changes: 14 additions & 0 deletions lib/galaxy/datatypes/sniff.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import os
import re
import shutil
import struct
import sys
import tempfile
import urllib.request
Expand Down Expand Up @@ -597,6 +598,19 @@ def search(self, pattern):
def search_str(self, query_str):
return query_str in self.contents_header

def magic_header(self, pattern):
    """
    Decode the start of the header bytes with the ``struct`` format
    ``pattern`` and return the first unpacked field.

    Returns ``None`` when fewer bytes are available than the format needs.
    """
    needed = struct.calcsize(pattern)
    prefix = self.contents_header_bytes[:needed]
    if len(prefix) >= needed:
        # Enough data for the whole format: only the first field is of interest.
        return struct.unpack(pattern, prefix)[0]
    return None

def startswith_bytes(self, test_bytes):
    """Return whether the raw header bytes begin with ``test_bytes``."""
    header = self.contents_header_bytes
    return header.startswith(test_bytes)


def build_sniff_from_prefix(klass):
# Build and attach a sniff function to this class (klass) from the sniff_prefix function
Expand Down

0 comments on commit c9ffab3

Please sign in to comment.