Imported stockholm from @bgruening
bgruening authored and Eric Rasche committed May 5, 2015
1 parent 7ef0d4e commit 92ff779
Showing 1 changed file with 105 additions and 0 deletions.
lib/galaxy/datatypes/msa.py (+105, -0)
@@ -1,11 +1,33 @@
from galaxy.datatypes.data import Text
from galaxy.datatypes.data import get_file_peek
from galaxy.datatypes.data import nice_size
from galaxy.datatypes.metadata import MetadataElement
import subprocess
import os


import logging
log = logging.getLogger(__name__)


def count_special_lines( word, filename, invert=False ):
    """
    Count the lines in a file that match the given pattern ('word'),
    using grep to speed up searching and counting. Returns the number
    of hits, or 0 if grep fails; with invert=True, lines *not*
    matching the pattern are counted instead.
    """
    try:
        cmd = ["grep", "-c"]
        if invert:
            cmd.append('-v')
        cmd.extend([word, filename])
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        return int(proc.communicate()[0].split()[0])
    except Exception:
        pass
    return 0
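# Example usage (hypothetical filenames, not part of this commit):
#   count_special_lines('^>', 'proteins.fasta')        -> number of FASTA header lines
#   count_special_lines('^$', 'proteins.fasta', True)  -> number of non-empty lines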


class Hmmer3( Text ):
    file_ext = "hmm"

@@ -29,3 +51,86 @@ def display_peek(self, dataset):
            return dataset.peek
        except Exception:
            return "HMMER3 database (%s)" % ( nice_size( dataset.get_size() ) )


class Stockholm_1_0( Text ):
    file_ext = "stockholm"

    MetadataElement( name="number_of_alignments", default=0, desc="Number of multiple alignments", readonly=True, visible=True, optional=True, no_value=0 )

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.number_of_alignments == 1:
                dataset.blurb = "1 alignment"
            else:
                dataset.blurb = "%s alignments" % dataset.metadata.number_of_alignments
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'

    def sniff( self, filename ):
        return count_special_lines('^#[[:space:]]\+STOCKHOLM[[:space:]]\+1\.0', filename) > 0

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of alignments in dataset.
        """
        dataset.metadata.number_of_alignments = count_special_lines('^#[[:space:]]\+STOCKHOLM[[:space:]]\+1\.0', dataset.file_name)

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input file by alignment records ('//'-terminated blocks).
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception("STOCKHOLM-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for STOCKHOLM-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _read_stockholm_records( filename ):
            # Yield one alignment record at a time; each record is the list
            # of lines up to and including its '//' terminator.
            lines = []
            with open(filename) as handle:
                for line in handle:
                    lines.append( line )
                    if line.strip() == '//':
                        yield lines
                        lines = []

        def _write_part_stockholm_file( accumulated_lines ):
            part_dir = subdir_generator_function()
            part_path = os.path.join( part_dir, os.path.basename( input_files[0] ) )
            with open( part_path, 'w' ) as part_file:
                part_file.writelines( accumulated_lines )

        try:
            stockholm_records = _read_stockholm_records( input_files[0] )
            stockholm_lines_accumulated = []
            for counter, stockholm_record in enumerate( stockholm_records, start=1 ):
                stockholm_lines_accumulated.extend( stockholm_record )
                if counter % chunk_size == 0:
                    _write_part_stockholm_file( stockholm_lines_accumulated )
                    stockholm_lines_accumulated = []
            if stockholm_lines_accumulated:
                _write_part_stockholm_file( stockholm_lines_accumulated )
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
    split = classmethod(split)
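
For reference, sniff() and set_meta() key on the Stockholm convention that each alignment begins with a "# STOCKHOLM 1.0" header line, while split() keys on the "//" record terminator. A minimal sketch of a file this datatype recognizes (the path and sequences below are made up for illustration):

sample = """# STOCKHOLM 1.0
seq1  ACDEFGHIKLMNPQRSTVWY
seq2  ACDEFGHIKLMNPQRSTVWY
//
# STOCKHOLM 1.0
seqA  ACDEF-HIKLMNPQRSTVWY
seqB  ACDEFGHIKLMNPQRSTVW-
//
"""
with open('/tmp/example.stockholm', 'w') as handle:
    handle.write(sample)

# Same grep pattern used by sniff()/set_meta(); expected result: 2 alignments.
print(count_special_lines('^#[[:space:]]\+STOCKHOLM[[:space:]]\+1\.0', '/tmp/example.stockholm'))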

2 comments on commit 92ff779

@nsoranzo
Is there no "Pythonic" way to do count_special_lines() instead of calling grep?

@bgruening
Author


@nsoranzo Oh, there are, but all of them are way slower :)
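
For reference, a pure-Python equivalent is only a few lines, roughly the sketch below (the helper name is hypothetical), but scanning a large file line by line in the interpreter is usually noticeably slower than a single grep process:

import re

def count_matching_lines(pattern, filename, invert=False):
    # Hypothetical pure-Python stand-in for count_special_lines().
    # Note: POSIX classes such as [[:space:]] are grep syntax; with
    # Python's re module the pattern would use \s instead.
    regex = re.compile(pattern)
    count = 0
    with open(filename) as handle:
        for line in handle:
            if bool(regex.search(line)) != invert:
                count += 1
    return count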
