Skip to content

Commit

Permalink
implement ignore_sequences option
Browse files Browse the repository at this point in the history
  • Loading branch information
Giorgio Gonnella committed May 13, 2022
1 parent 12b31da commit eaa8de8
Show file tree
Hide file tree
Showing 10 changed files with 97 additions and 36 deletions.
4 changes: 4 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
== 1.2.4 ==

- implement option 'ignore_sequences' when reading a GFA file

== 1.2.3 ==

- make it possible to count input header lines correctly
Expand Down
6 changes: 6 additions & 0 deletions doc/tutorial/gfa.rst
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ segment followed by a GFA2 segment).
gfapy.error.VersionError: Version: 1.0 (None)
...

Finally, when creating a Gfa from a file, the boolean argument
``ignore_sequences`` is accepted; its default value is ``False``. If it is set
to ``True``, the sequences in segment lines are not loaded and are replaced by
the placeholder (``*``). This is useful for very large files.


Collections of lines
~~~~~~~~~~~~~~~~~~~~

Expand Down
13 changes: 9 additions & 4 deletions gfapy/gfa.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,11 +189,13 @@ def to_gfa2(self):

# TODO: implement clone (see how clone for lines was implemented)

def read_file(self, filename):
def read_file(self, filename, ignore_sequences = False):
"""Read GFA data from a file and load it into the Gfa instance.
Parameters:
filename (str)
ignore_sequences (bool, default: False): replace sequences in S lines
with a placeholder ('*')
"""
if self._progress:
linecount = 0
Expand All @@ -206,7 +208,7 @@ def read_file(self, filename):
" containing {} lines".format(linecount))
with open(filename) as f:
for line in f:
self.add_line(line.rstrip('\r\n'))
self.add_line(line.rstrip('\r\n'), ignore_sequences=ignore_sequences)
if self._progress:
self._progress_log("read_file")
if self._line_queue:
Expand All @@ -219,20 +221,23 @@ def read_file(self, filename):
return self

@classmethod
def from_file(cls, filename, vlevel = 1, version = None, dialect="standard"):
def from_file(cls, filename, vlevel = 1, version = None, dialect="standard",
ignore_sequences = False):
"""Create a Gfa instance from the contents of a GFA file.
Parameters:
filename (str)
vlevel (int) : the validation level
version (str) : the GFA version ('gfa1' or 'gfa2'; default:
determine version automatically)
ignore_sequences (bool, default: False): replace sequences in S lines
with a placeholder ('*')
Returns:
gfapy.Gfa
"""
gfa = cls(vlevel = vlevel, version = version, dialect = dialect)
gfa.read_file(filename)
gfa.read_file(filename, ignore_sequences = ignore_sequences)
return gfa

def to_file(self, filename):
Expand Down
5 changes: 3 additions & 2 deletions gfapy/line/comment/construction.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import gfapy

class Construction:
def _initialize_positional_fields(self, strings):
self._init_field_value("content", "comment", strings[1], errmsginfo = strings)
def _initialize_positional_fields(self, strings, **kwargs):
    """Initialize the two positional fields of a comment line.

    Parameters:
      strings (list of str): the split line; element 1 is stored as the
        ``content`` field and element 2, if present, as the ``spacer``
        field (a single space is used when it is missing)
      **kwargs: accepted for signature compatibility and ignored
    """
    if len(strings) > 2:
        spacer = strings[2]
    else:
        spacer = " "
    # both fields are stored with the "comment" datatype
    for fieldname, value in (("content", strings[1]), ("spacer", spacer)):
        self._init_field_value(fieldname, "comment", value,
                               errmsginfo = strings)

Expand Down
11 changes: 7 additions & 4 deletions gfapy/line/common/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,15 @@ class Construction:
"""

def __new__(cls, data, vlevel = 1, virtual = False, dialect = "standard",
version = None):
version = None, ignore_sequences = False):
if isinstance(data, str):
data = data.split("\t")
if isinstance(data, list) and cls.RECORD_TYPE == None:
cls = gfapy.Line._subclass(data, version = version)
return object.__new__(cls)

def __init__(self, data, vlevel = 1, virtual = False,
version = None, dialect = "standard"):
version = None, dialect = "standard", ignore_sequences = False):
self._dialect = dialect.lower()
self.vlevel = vlevel
self._virtual = virtual
Expand All @@ -76,7 +76,8 @@ def __init__(self, data, vlevel = 1, virtual = False,
self._compute_version(data[0])
else:
self._validate_version()
self._initialize_positional_fields(data)
self._initialize_positional_fields(data, ignore_sequences =
ignore_sequences)
self._initialize_tags(data)
if self.vlevel >= 1:
self._validate_record_type_specific_info()
Expand Down Expand Up @@ -143,7 +144,7 @@ def _init_field_value(self, n ,t, s, errmsginfo = None):
fieldname = n, line = errmsginfo)
self._data[n] = s

def _initialize_positional_fields(self, strings):
def _initialize_positional_fields(self, strings, ignore_sequences = False):
if strings[0] != self.RECORD_TYPE and self.RECORD_TYPE != "\n":
raise gfapy.FormatError("Record type of records of "+
"class {} must be {} ({} found)".format(self.__class__,
Expand All @@ -156,6 +157,8 @@ def _initialize_positional_fields(self, strings):
raise gfapy.FormatError(
"{} positional fields expected, ".format(self._n_positional_fields) +
"{} found\n{}".format(len(strings)-1, repr(strings)))
if self.RECORD_TYPE == 'S' and ignore_sequences:
strings[self.POSFIELDS.index('sequence')+1] = '*'
for i, n in enumerate(self.POSFIELDS):
self._init_field_value(n, self.__class__.DATATYPE[n], strings[i+1],
errmsginfo = strings)
Expand Down
2 changes: 1 addition & 1 deletion gfapy/line/custom_record/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def tagnames(self):
if (not x in self.positional_fieldnames) \
and (x != "record_type")]

def _initialize_positional_fields(self, strings):
def _initialize_positional_fields(self, strings, **kwargs):
    """No-op: the initialization of the positional fields is delayed.

    ``strings`` and any keyword arguments are accepted and ignored here.

    NOTE(review): the original note refers to
    ``#delayed_inizialize_positional_fields`` (sic) as the place where the
    work is done; that method is not visible in this chunk -- confirm its
    actual name.
    """
    pass

Expand Down
26 changes: 15 additions & 11 deletions gfapy/lines/creators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

class Creators:

def add_line(self, gfa_line):
def add_line(self, gfa_line, ignore_sequences = False):
"""Add a line to a GFA instance.
Note:
Expand All @@ -12,6 +12,8 @@ def add_line(self, gfa_line):
gfa_line (str, Line): a line instance or a string, containing a line
of a GFA file (if a string, a line instance is constructed using
the string)
ignore_sequences (bool, default: False): replace sequences in S lines
with the placeholder ('*')
Raises:
gfapy.error.VersionError : If a wrong line type is used, for the GFA
Expand All @@ -22,11 +24,12 @@ def add_line(self, gfa_line):
if gfa_line is None:
return
if self._version == "gfa1":
self.__add_line_GFA1(gfa_line)
self.__add_line_GFA1(gfa_line, ignore_sequences = ignore_sequences)
elif self._version == "gfa2":
self.__add_line_GFA2(gfa_line)
self.__add_line_GFA2(gfa_line, ignore_sequences = ignore_sequences)
elif self._version is None:
self.__add_line_unknown_version(gfa_line)
self.__add_line_unknown_version(gfa_line, ignore_sequences =
ignore_sequences)
else:
raise gfapy.AssertionError("This point should never be reached")

Expand Down Expand Up @@ -74,7 +77,7 @@ def _register_line(self, gfa_line):
self._records[gfa_line.record_type] = {}
self._records[gfa_line.record_type][id(gfa_line)] = gfa_line

def __add_line_unknown_version(self, gfa_line):
def __add_line_unknown_version(self, gfa_line, ignore_sequences = False):
if isinstance(gfa_line, str):
rt = gfa_line[0]
elif isinstance(gfa_line, gfapy.Line):
Expand All @@ -84,7 +87,8 @@ def __add_line_unknown_version(self, gfa_line):
"Only strings and gfapy.Line instances can be added")
if rt == "#":
if isinstance(gfa_line, str):
gfa_line = gfapy.Line(gfa_line, dialect=self._dialect)
gfa_line = gfapy.Line(gfa_line, dialect=self._dialect,
ignore_sequences = ignore_sequences)
gfa_line.connect(self)
elif rt == "H":
self._n_input_header_lines += 1
Expand All @@ -106,7 +110,7 @@ def __add_line_unknown_version(self, gfa_line):
elif rt == "S":
if isinstance(gfa_line, str):
gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel,
dialect=self._dialect)
dialect=self._dialect, ignore_sequences = ignore_sequences)
self._version = gfa_line.version
self._version_explanation = \
"implied by: syntax of S {} line".format(gfa_line.name)
Expand All @@ -126,11 +130,11 @@ def __add_line_unknown_version(self, gfa_line):
else:
self._line_queue.append(gfa_line)

def __add_line_GFA1(self, gfa_line):
def __add_line_GFA1(self, gfa_line, ignore_sequences = False):
if isinstance(gfa_line, str):
if gfa_line[0] == "S":
gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel,
dialect=self._dialect)
dialect=self._dialect, ignore_sequences = ignore_sequences)
else:
gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel,
dialect=self._dialect, version="gfa1")
Expand Down Expand Up @@ -160,11 +164,11 @@ def __add_line_GFA1(self, gfa_line):
raise gfapy.AssertionError(
"Invalid record type {}. This should never happen".format(rt))

def __add_line_GFA2(self, gfa_line):
def __add_line_GFA2(self, gfa_line, ignore_sequences = False):
if isinstance(gfa_line, str):
if gfa_line[0] == "S":
gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel,
dialect=self._dialect)
dialect=self._dialect, ignore_sequences = ignore_sequences)
else:
gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel,
version="gfa2", dialect=self._dialect)
Expand Down
45 changes: 31 additions & 14 deletions tests/test_api_gfa_basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,38 @@ def test_to_s(self):
for l in lines: gfa.append(l)
self.assertEqual(set(lines), set(str(gfa).split("\n")))

## def test_from_file(self):
## filename = "tests/testdata/example1.gfa"
## gfa = gfapy.Gfa.from_file(filename)
## assert(gfa)
## with open(filename) as f:
## txt = f.read()
## self.assertEqual(txt, str(gfa))
def test_from_file(self):
    """A Gfa read from example1.gfa serializes back to the file's lines.

    Each line of the input file (with trailing whitespace stripped) is
    compared with the line at the same position in ``str(gfa)``.
    """
    path = "tests/testdata/example1.gfa"
    gfa = gfapy.Gfa.from_file(path)
    assert(gfa)
    serialized = str(gfa).split("\n")
    with open(path) as handle:
        for idx, raw in enumerate(handle):
            self.assertEqual(serialized[idx], raw.rstrip())

## def test_to_file(self):
## filename = "tests/testdata/example1.gfa"
## gfa = gfapy.Gfa.from_file(filename)
## tmp = Tempfile("example1")
## gfa.to_file(tmp.path)
## tmp.rewind
## self.assertEqual(IO.read(filename), IO.read(tmp))
def test_from_file_ignore_sequences(self):
    """Reading sample.gfa with ``ignore_sequences = True`` must yield the
    lines of sample_wo_seqs.gfa.

    Each line of the expected file (with trailing whitespace stripped) is
    compared with the line at the same position in ``str(gfa)``.
    """
    source_path = "tests/testdata/sample.gfa"
    expected_path = "tests/testdata/sample_wo_seqs.gfa"
    gfa = gfapy.Gfa.from_file(source_path, ignore_sequences = True)
    assert(gfa)
    serialized = str(gfa).split("\n")
    with open(expected_path) as handle:
        for idx, raw in enumerate(handle):
            self.assertEqual(serialized[idx], raw.rstrip())

#def test_to_file(self):
# filename = "tests/testdata/example1.gfa"
# gfa = gfapy.Gfa.from_file(filename)
# tmp = Tempfile("example1")
# gfa.to_file(tmp.path)
# tmp.rewind
# self.assertEqual(IO.read(filename), IO.read(tmp))

def test_from_string(self):
lines = ["H\tVN:Z:1.0","S\t1\t*","S\t2\t*","S\t3\t*",
Expand Down
9 changes: 9 additions & 0 deletions tests/test_gfapy_line_segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ def test_from_string(self):
f=["S","2","*","LN:i:3"]
gfapy.Line("\t".join(f))

def test_from_string_ignore_sequences(self):
    """The sequence of an S line is kept by default and is a placeholder
    when the line is created with ``ignore_sequences = True``."""
    segment_str = "\t".join(["S","1","ACGTCACANNN","RC:i:1232","LN:i:11",
                             "ab:Z:abcd","FC:i:2321","KC:i:1212"])
    # default: the sequence is loaded as given
    with_seq = gfapy.Line(segment_str)
    self.assertEqual("ACGTCACANNN", with_seq.sequence)
    # ignore_sequences: the sequence field holds the placeholder instead
    without_seq = gfapy.Line(segment_str, ignore_sequences = True)
    self.assertEqual(gfapy.is_placeholder(without_seq.sequence), True)

def test_forbidden_segment_names(self):
gfapy.Line("S\tA+B\t*")
gfapy.Line("S\tA-B\t*")
Expand Down
12 changes: 12 additions & 0 deletions tests/testdata/sample_wo_seqs.gfa
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
H VN:Z:1.0
H ul:Z:https://github.com/sjackman/assembly-graph/blob/master/sample.gfa
S 1 *
S 2 *
S 3 * RC:i:4
S 4 *
S 5 *
S 6 *
L 1 + 2 + 5M
L 3 + 2 + 0M
L 3 + 4 - 1M1D2M1S
L 4 - 5 + 0M

0 comments on commit eaa8de8

Please sign in to comment.