diff --git a/CHANGES.txt b/CHANGES.txt index 58aef77..540d999 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,7 @@ +== 1.2.4 == + +- implement option 'ignore_sequences' when reading a GFA file + == 1.2.3 == - make it possible to count input header lines correctly diff --git a/doc/tutorial/gfa.rst b/doc/tutorial/gfa.rst index 8bdca2c..16f9577 100644 --- a/doc/tutorial/gfa.rst +++ b/doc/tutorial/gfa.rst @@ -101,6 +101,12 @@ segment followed by a GFA2 segment). gfapy.error.VersionError: Version: 1.0 (None) ... +Finally, when creating a Gfa from file, a boolean argument ``ignore_sequences`` +is accepted, whose default value is ``False``. If set to ``True``, sequences in +segment lines are replaced by the placeholder (i.e. are not loaded). This is +useful for very large files. + + Collections of lines ~~~~~~~~~~~~~~~~~~~~ diff --git a/gfapy/gfa.py b/gfapy/gfa.py index 96e5304..5687128 100644 --- a/gfapy/gfa.py +++ b/gfapy/gfa.py @@ -189,11 +189,13 @@ def to_gfa2(self): # TODO: implement clone (see how clone for lines was implemented) - def read_file(self, filename): + def read_file(self, filename, ignore_sequences = False): """Read GFA data from a file and load it into the Gfa instance. Parameters: filename (str) + ignore_sequences (bool, default: False): replace sequences in S lines + with a placeholder ('*') """ if self._progress: linecount = 0 @@ -206,7 +208,7 @@ def read_file(self, filename): " containing {} lines".format(linecount)) with open(filename) as f: for line in f: - self.add_line(line.rstrip('\r\n')) + self.add_line(line.rstrip('\r\n'), ignore_sequences=ignore_sequences) if self._progress: self._progress_log("read_file") if self._line_queue: @@ -219,7 +221,8 @@ def read_file(self, filename): return self @classmethod - def from_file(cls, filename, vlevel = 1, version = None, dialect="standard"): + def from_file(cls, filename, vlevel = 1, version = None, dialect="standard", + ignore_sequences = False): """Create a Gfa instance from the contents of a GFA file. Parameters: @@ -227,12 +230,14 @@ def from_file(cls, filename, vlevel = 1, version = None, dialect="standard"): vlevel (int) : the validation level version (str) : the GFA version ('gfa1' or 'gfa2'; default: determine version automatically) + ignore_sequences (bool, default: False): replace sequences in S lines + with a placeholder ('*') Returns: gfapy.Gfa """ gfa = cls(vlevel = vlevel, version = version, dialect = dialect) - gfa.read_file(filename) + gfa.read_file(filename, ignore_sequences = ignore_sequences) return gfa def to_file(self, filename): diff --git a/gfapy/line/comment/construction.py b/gfapy/line/comment/construction.py index e4a39bc..e33330f 100644 --- a/gfapy/line/comment/construction.py +++ b/gfapy/line/comment/construction.py @@ -1,8 +1,9 @@ import gfapy class Construction: - def _initialize_positional_fields(self, strings): - self._init_field_value("content", "comment", strings[1], errmsginfo = strings) + def _initialize_positional_fields(self, strings, **kwargs): + self._init_field_value("content", "comment", strings[1], + errmsginfo = strings) sp = strings[2] if len(strings) > 2 else " " self._init_field_value("spacer", "comment", sp, errmsginfo = strings) diff --git a/gfapy/line/common/construction.py b/gfapy/line/common/construction.py index fb11fe0..a80bae7 100644 --- a/gfapy/line/common/construction.py +++ b/gfapy/line/common/construction.py @@ -44,7 +44,7 @@ class Construction: """ def __new__(cls, data, vlevel = 1, virtual = False, dialect = "standard", - version = None): + version = None, ignore_sequences = False): if isinstance(data, str): data = data.split("\t") if isinstance(data, list) and cls.RECORD_TYPE == None: @@ -52,7 +52,7 @@ def __new__(cls, data, vlevel = 1, virtual = False, dialect = "standard", return object.__new__(cls) def __init__(self, data, vlevel = 1, virtual = False, - version = None, dialect = "standard"): + version = None, dialect = "standard", ignore_sequences = False): self._dialect = dialect.lower() self.vlevel = vlevel self._virtual = virtual @@ -76,7 +76,8 @@ def __init__(self, data, vlevel = 1, virtual = False, self._compute_version(data[0]) else: self._validate_version() - self._initialize_positional_fields(data) + self._initialize_positional_fields(data, ignore_sequences = + ignore_sequences) self._initialize_tags(data) if self.vlevel >= 1: self._validate_record_type_specific_info() @@ -143,7 +144,7 @@ def _init_field_value(self, n ,t, s, errmsginfo = None): fieldname = n, line = errmsginfo) self._data[n] = s - def _initialize_positional_fields(self, strings): + def _initialize_positional_fields(self, strings, ignore_sequences = False): if strings[0] != self.RECORD_TYPE and self.RECORD_TYPE != "\n": raise gfapy.FormatError("Record type of records of "+ "class {} must be {} ({} found)".format(self.__class__, @@ -156,6 +157,8 @@ def _initialize_positional_fields(self, strings): raise gfapy.FormatError( "{} positional fields expected, ".format(self._n_positional_fields) + "{} found\n{}".format(len(strings)-1, repr(strings))) + if self.RECORD_TYPE == 'S' and ignore_sequences: + strings[self.POSFIELDS.index('sequence')+1] = '*' for i, n in enumerate(self.POSFIELDS): self._init_field_value(n, self.__class__.DATATYPE[n], strings[i+1], errmsginfo = strings) diff --git a/gfapy/line/custom_record/construction.py b/gfapy/line/custom_record/construction.py index 46cb919..3a8cbd6 100644 --- a/gfapy/line/custom_record/construction.py +++ b/gfapy/line/custom_record/construction.py @@ -29,7 +29,7 @@ def tagnames(self): if (not x in self.positional_fieldnames) \ and (x != "record_type")] - def _initialize_positional_fields(self, strings): + def _initialize_positional_fields(self, strings, **kwargs): """delayed, see #delayed_inizialize_positional_fields""" pass diff --git a/gfapy/lines/creators.py b/gfapy/lines/creators.py index 26893f3..87b92e3 100644 --- a/gfapy/lines/creators.py +++ b/gfapy/lines/creators.py @@ -2,7 +2,7 @@ class Creators: - def add_line(self, gfa_line): + def add_line(self, gfa_line, ignore_sequences = False): """Add a line to a GFA instance. Note: @@ -12,6 +12,8 @@ def add_line(self, gfa_line): gfa_line (str, Line): a line instance or a string, containing a line of a GFA file (if a string, a line instance is constructed using the string) + ignore_sequences (bool, default: False): replace sequences in S lines + with the placeholder ('*') Raises: gfapy.error.VersionError : If a wrong line type is used, for the GFA @@ -22,11 +24,12 @@ def add_line(self, gfa_line): if gfa_line is None: return if self._version == "gfa1": - self.__add_line_GFA1(gfa_line) + self.__add_line_GFA1(gfa_line, ignore_sequences = ignore_sequences) elif self._version == "gfa2": - self.__add_line_GFA2(gfa_line) + self.__add_line_GFA2(gfa_line, ignore_sequences = ignore_sequences) elif self._version is None: - self.__add_line_unknown_version(gfa_line) + self.__add_line_unknown_version(gfa_line, ignore_sequences = + ignore_sequences) else: raise gfapy.AssertionError("This point should never be reached") @@ -74,7 +77,7 @@ def _register_line(self, gfa_line): self._records[gfa_line.record_type] = {} self._records[gfa_line.record_type][id(gfa_line)] = gfa_line - def __add_line_unknown_version(self, gfa_line): + def __add_line_unknown_version(self, gfa_line, ignore_sequences = False): if isinstance(gfa_line, str): rt = gfa_line[0] elif isinstance(gfa_line, gfapy.Line): @@ -84,7 +87,8 @@ def __add_line_unknown_version(self, gfa_line): "Only strings and gfapy.Line instances can be added") if rt == "#": if isinstance(gfa_line, str): - gfa_line = gfapy.Line(gfa_line, dialect=self._dialect) + gfa_line = gfapy.Line(gfa_line, dialect=self._dialect, + ignore_sequences = ignore_sequences) gfa_line.connect(self) elif rt == "H": self._n_input_header_lines += 1 @@ -106,7 +110,7 @@ def __add_line_unknown_version(self, gfa_line): elif rt == "S": if isinstance(gfa_line, str): gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel, - dialect=self._dialect) + dialect=self._dialect, ignore_sequences = ignore_sequences) self._version = gfa_line.version self._version_explanation = \ "implied by: syntax of S {} line".format(gfa_line.name) @@ -126,11 +130,11 @@ def __add_line_unknown_version(self, gfa_line): else: self._line_queue.append(gfa_line) - def __add_line_GFA1(self, gfa_line): + def __add_line_GFA1(self, gfa_line, ignore_sequences = False): if isinstance(gfa_line, str): if gfa_line[0] == "S": gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel, - dialect=self._dialect) + dialect=self._dialect, ignore_sequences = ignore_sequences) else: gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel, dialect=self._dialect, version="gfa1") @@ -160,11 +164,11 @@ def __add_line_GFA1(self, gfa_line): raise gfapy.AssertionError( "Invalid record type {}. This should never happen".format(rt)) - def __add_line_GFA2(self, gfa_line): + def __add_line_GFA2(self, gfa_line, ignore_sequences = False): if isinstance(gfa_line, str): if gfa_line[0] == "S": gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel, - dialect=self._dialect) + dialect=self._dialect, ignore_sequences = ignore_sequences) else: gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel, version="gfa2", dialect=self._dialect) diff --git a/tests/test_api_gfa_basics.py b/tests/test_api_gfa_basics.py index 29b9901..b6c72b2 100644 --- a/tests/test_api_gfa_basics.py +++ b/tests/test_api_gfa_basics.py @@ -39,21 +39,38 @@ def test_to_s(self): for l in lines: gfa.append(l) self.assertEqual(set(lines), set(str(gfa).split("\n"))) - ## def test_from_file(self): - ## filename = "tests/testdata/example1.gfa" - ## gfa = gfapy.Gfa.from_file(filename) - ## assert(gfa) - ## with open(filename) as f: - ## txt = f.read() - ## self.assertEqual(txt, str(gfa)) + def test_from_file(self): + filename = "tests/testdata/example1.gfa" + gfa = gfapy.Gfa.from_file(filename) + assert(gfa) + gfa_lines = str(gfa).split("\n") + line_no = 0 + with open(filename) as f: + for line in f: + line = line.rstrip() + self.assertEqual(gfa_lines[line_no], line) + line_no += 1 - ## def test_to_file(self): - ## filename = "tests/testdata/example1.gfa" - ## gfa = gfapy.Gfa.from_file(filename) - ## tmp = Tempfile("example1") - ## gfa.to_file(tmp.path) - ## tmp.rewind - ## self.assertEqual(IO.read(filename), IO.read(tmp)) + def test_from_file_ignore_sequences(self): + filename1 = "tests/testdata/sample.gfa" + filename2 = "tests/testdata/sample_wo_seqs.gfa" + gfa = gfapy.Gfa.from_file(filename1, ignore_sequences = True) + assert(gfa) + gfa_lines = str(gfa).split("\n") + line_no = 0 + with open(filename2) as f: + for line in f: + line = line.rstrip() + self.assertEqual(gfa_lines[line_no], line) + line_no += 1 + + #def test_to_file(self): + # filename = "tests/testdata/example1.gfa" + # gfa = gfapy.Gfa.from_file(filename) + # tmp = Tempfile("example1") + # gfa.to_file(tmp.path) + # tmp.rewind + # self.assertEqual(IO.read(filename), IO.read(tmp)) def test_from_string(self): lines = ["H\tVN:Z:1.0","S\t1\t*","S\t2\t*","S\t3\t*", diff --git a/tests/test_gfapy_line_segment.py b/tests/test_gfapy_line_segment.py index 99174f0..b3a91a7 100644 --- a/tests/test_gfapy_line_segment.py +++ b/tests/test_gfapy_line_segment.py @@ -37,6 +37,15 @@ def test_from_string(self): f=["S","2","*","LN:i:3"] gfapy.Line("\t".join(f)) + def test_from_string_ignore_sequences(self): + fields = ["S","1","ACGTCACANNN","RC:i:1232","LN:i:11","ab:Z:abcd", + "FC:i:2321","KC:i:1212"] + string = "\t".join(fields) + l = gfapy.Line(string) + self.assertEqual("ACGTCACANNN", l.sequence) + l = gfapy.Line(string, ignore_sequences = True) + self.assertEqual(gfapy.is_placeholder(l.sequence), True) + def test_forbidden_segment_names(self): gfapy.Line("S\tA+B\t*") gfapy.Line("S\tA-B\t*") diff --git a/tests/testdata/sample_wo_seqs.gfa b/tests/testdata/sample_wo_seqs.gfa new file mode 100644 index 0000000..f404b69 --- /dev/null +++ b/tests/testdata/sample_wo_seqs.gfa @@ -0,0 +1,12 @@ +H VN:Z:1.0 +H ul:Z:https://github.com/sjackman/assembly-graph/blob/master/sample.gfa +S 1 * +S 2 * +S 3 * RC:i:4 +S 4 * +S 5 * +S 6 * +L 1 + 2 + 5M +L 3 + 2 + 0M +L 3 + 4 - 1M1D2M1S +L 4 - 5 + 0M