implement ignore_sequences option

ggonnella · May 13, 2022 · eaa8de8 · eaa8de8
1 parent 12b31da
commit eaa8de8
Show file tree

Hide file tree

Showing 10 changed files with 97 additions and 36 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,3 +1,7 @@
+== 1.2.4 ==
+
+- implement option 'ignore_sequences' when reading a GFA file
+
 == 1.2.3 ==
 
 - make it possible to count input header lines correctly

diff --git a/doc/tutorial/gfa.rst b/doc/tutorial/gfa.rst
@@ -101,6 +101,12 @@ segment followed by a GFA2 segment).
     gfapy.error.VersionError: Version: 1.0 (None)
     ...
 
+Finally, when creating a Gfa from a file, a boolean argument ``ignore_sequences``
+is accepted (default: ``False``). If it is set to ``True``, the sequences in
+segment lines are not loaded; each is replaced by the placeholder (``*``). This
+is useful for very large files.
+
+
 Collections of lines
 ~~~~~~~~~~~~~~~~~~~~
 

diff --git a/gfapy/gfa.py b/gfapy/gfa.py
@@ -189,11 +189,13 @@ def to_gfa2(self):
 
   # TODO: implement clone (see how clone for lines was implemented)
 
-  def read_file(self, filename):
+  def read_file(self, filename, ignore_sequences = False):
     """Read GFA data from a file and load it into the Gfa instance.
 
     Parameters:
       filename (str)
+      ignore_sequences (bool, default: False): replace sequences in S lines
+        with a placeholder ('*')
     """
     if self._progress:
       linecount = 0
@@ -206,7 +208,7 @@ def read_file(self, filename):
                               " containing {} lines".format(linecount))
     with open(filename) as f:
       for line in f:
-        self.add_line(line.rstrip('\r\n'))
+        self.add_line(line.rstrip('\r\n'), ignore_sequences=ignore_sequences)
         if self._progress:
           self._progress_log("read_file")
     if self._line_queue:
@@ -219,20 +221,23 @@ def read_file(self, filename):
     return self
 
   @classmethod
-  def from_file(cls, filename, vlevel = 1, version = None, dialect="standard"):
+  def from_file(cls, filename, vlevel = 1, version = None, dialect="standard",
+                ignore_sequences = False):
     """Create a Gfa instance from the contents of a GFA file.
 
     Parameters:
       filename (str)
       vlevel (int) : the validation level
       version (str) : the GFA version ('gfa1' or 'gfa2'; default:
           determine version automatically)
+      ignore_sequences (bool, default: False): replace sequences in S lines
+        with a placeholder ('*')
 
     Returns:
       gfapy.Gfa
     """
     gfa = cls(vlevel = vlevel, version = version, dialect = dialect)
-    gfa.read_file(filename)
+    gfa.read_file(filename, ignore_sequences = ignore_sequences)
     return gfa
 
   def to_file(self, filename):

diff --git a/gfapy/line/comment/construction.py b/gfapy/line/comment/construction.py
@@ -1,8 +1,9 @@
 import gfapy
 
 class Construction:
-  def _initialize_positional_fields(self, strings):
-    self._init_field_value("content", "comment", strings[1], errmsginfo = strings)
+  def _initialize_positional_fields(self, strings, **kwargs):
+    self._init_field_value("content", "comment", strings[1],
+        errmsginfo = strings)
     sp = strings[2] if len(strings) > 2 else " "
     self._init_field_value("spacer", "comment", sp, errmsginfo = strings)
 

diff --git a/gfapy/line/common/construction.py b/gfapy/line/common/construction.py
@@ -44,15 +44,15 @@ class Construction:
   """
 
   def __new__(cls, data, vlevel = 1, virtual = False, dialect = "standard",
-      version = None):
+      version = None, ignore_sequences = False):
     if isinstance(data, str):
       data = data.split("\t")
     if isinstance(data, list) and cls.RECORD_TYPE == None:
       cls = gfapy.Line._subclass(data, version = version)
     return object.__new__(cls)
 
   def __init__(self, data, vlevel = 1, virtual = False,
-               version = None, dialect = "standard"):
+               version = None, dialect = "standard", ignore_sequences = False):
     self._dialect = dialect.lower()
     self.vlevel = vlevel
     self._virtual = virtual
@@ -76,7 +76,8 @@ def __init__(self, data, vlevel = 1, virtual = False,
         self._compute_version(data[0])
       else:
         self._validate_version()
-      self._initialize_positional_fields(data)
+      self._initialize_positional_fields(data, ignore_sequences =
+          ignore_sequences)
       self._initialize_tags(data)
       if self.vlevel >= 1:
         self._validate_record_type_specific_info()
@@ -143,7 +144,7 @@ def _init_field_value(self, n ,t, s, errmsginfo = None):
             fieldname = n, line = errmsginfo)
     self._data[n] = s
 
-  def _initialize_positional_fields(self, strings):
+  def _initialize_positional_fields(self, strings, ignore_sequences = False):
     if strings[0] != self.RECORD_TYPE and self.RECORD_TYPE != "\n":
       raise gfapy.FormatError("Record type of records of "+
           "class {} must be {} ({} found)".format(self.__class__,
@@ -156,6 +157,8 @@ def _initialize_positional_fields(self, strings):
       raise gfapy.FormatError(
         "{} positional fields expected, ".format(self._n_positional_fields) +
         "{} found\n{}".format(len(strings)-1, repr(strings)))
+    if self.RECORD_TYPE == 'S' and ignore_sequences:
+      strings[self.POSFIELDS.index('sequence')+1] = '*'
     for i, n in enumerate(self.POSFIELDS):
       self._init_field_value(n, self.__class__.DATATYPE[n], strings[i+1],
                        errmsginfo = strings)

diff --git a/gfapy/line/custom_record/construction.py b/gfapy/line/custom_record/construction.py
@@ -29,7 +29,7 @@ def tagnames(self):
              if (not x in self.positional_fieldnames) \
                  and (x != "record_type")]
 
-  def _initialize_positional_fields(self, strings):
+  def _initialize_positional_fields(self, strings, **kwargs):
     """delayed, see #delayed_inizialize_positional_fields"""
     pass
 

diff --git a/gfapy/lines/creators.py b/gfapy/lines/creators.py
@@ -2,7 +2,7 @@
 
 class Creators:
 
-  def add_line(self, gfa_line):
+  def add_line(self, gfa_line, ignore_sequences = False):
     """Add a line to a GFA instance.
 
     Note:
@@ -12,6 +12,8 @@ def add_line(self, gfa_line):
       gfa_line (str, Line): a line instance or a string, containing a line
         of a GFA file (if a string, a line instance is constructed using
         the string)
+      ignore_sequences (bool, default: False): replace sequences in S lines
+        with the placeholder ('*')
 
     Raises:
       gfapy.error.VersionError : If a wrong line type is used, for the GFA
@@ -22,11 +24,12 @@ def add_line(self, gfa_line):
     if gfa_line is None:
       return
     if self._version == "gfa1":
-      self.__add_line_GFA1(gfa_line)
+      self.__add_line_GFA1(gfa_line, ignore_sequences = ignore_sequences)
     elif self._version == "gfa2":
-      self.__add_line_GFA2(gfa_line)
+      self.__add_line_GFA2(gfa_line, ignore_sequences = ignore_sequences)
     elif self._version is None:
-      self.__add_line_unknown_version(gfa_line)
+      self.__add_line_unknown_version(gfa_line, ignore_sequences =
+          ignore_sequences)
     else:
       raise gfapy.AssertionError("This point should never be reached")
 
@@ -74,7 +77,7 @@ def _register_line(self, gfa_line):
         self._records[gfa_line.record_type] = {}
       self._records[gfa_line.record_type][id(gfa_line)] = gfa_line
 
-  def __add_line_unknown_version(self, gfa_line):
+  def __add_line_unknown_version(self, gfa_line, ignore_sequences = False):
     if isinstance(gfa_line, str):
       rt = gfa_line[0]
     elif isinstance(gfa_line, gfapy.Line):
@@ -84,7 +87,8 @@ def __add_line_unknown_version(self, gfa_line):
           "Only strings and gfapy.Line instances can be added")
     if rt == "#":
       if isinstance(gfa_line, str):
-        gfa_line = gfapy.Line(gfa_line, dialect=self._dialect)
+        gfa_line = gfapy.Line(gfa_line, dialect=self._dialect,
+            ignore_sequences = ignore_sequences)
       gfa_line.connect(self)
     elif rt == "H":
       self._n_input_header_lines += 1
@@ -106,7 +110,7 @@ def __add_line_unknown_version(self, gfa_line):
     elif rt == "S":
       if isinstance(gfa_line, str):
         gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel,
-            dialect=self._dialect)
+            dialect=self._dialect, ignore_sequences = ignore_sequences)
       self._version = gfa_line.version
       self._version_explanation = \
           "implied by: syntax of S {} line".format(gfa_line.name)
@@ -126,11 +130,11 @@ def __add_line_unknown_version(self, gfa_line):
     else:
       self._line_queue.append(gfa_line)
 
-  def __add_line_GFA1(self, gfa_line):
+  def __add_line_GFA1(self, gfa_line, ignore_sequences = False):
     if isinstance(gfa_line, str):
       if gfa_line[0] == "S":
         gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel,
-            dialect=self._dialect)
+            dialect=self._dialect, ignore_sequences = ignore_sequences)
       else:
         gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel,
             dialect=self._dialect, version="gfa1")
@@ -160,11 +164,11 @@ def __add_line_GFA1(self, gfa_line):
       raise gfapy.AssertionError(
         "Invalid record type {}. This should never happen".format(rt))
 
-  def __add_line_GFA2(self, gfa_line):
+  def __add_line_GFA2(self, gfa_line, ignore_sequences = False):
     if isinstance(gfa_line, str):
       if gfa_line[0] == "S":
         gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel,
-            dialect=self._dialect)
+            dialect=self._dialect, ignore_sequences = ignore_sequences)
       else:
         gfa_line = gfapy.Line(gfa_line, vlevel=self._vlevel,
                                         version="gfa2", dialect=self._dialect)

diff --git a/tests/test_api_gfa_basics.py b/tests/test_api_gfa_basics.py
@@ -39,21 +39,38 @@ def test_to_s(self):
     for l in lines: gfa.append(l)
     self.assertEqual(set(lines), set(str(gfa).split("\n")))
 
-  ## def test_from_file(self):
-  ##   filename = "tests/testdata/example1.gfa"
-  ##   gfa = gfapy.Gfa.from_file(filename)
-  ##   assert(gfa)
-  ##   with open(filename) as f:
-  ##     txt = f.read()
-  ##   self.assertEqual(txt, str(gfa))
+  def test_from_file(self):
+    filename = "tests/testdata/example1.gfa"
+    gfa = gfapy.Gfa.from_file(filename)
+    assert(gfa)
+    gfa_lines = str(gfa).split("\n")
+    line_no = 0
+    with open(filename) as f:
+      for line in f:
+        line = line.rstrip()
+        self.assertEqual(gfa_lines[line_no], line)
+        line_no += 1
 
-  ## def test_to_file(self):
-  ##   filename = "tests/testdata/example1.gfa"
-  ##   gfa = gfapy.Gfa.from_file(filename)
-  ##   tmp = Tempfile("example1")
-  ##   gfa.to_file(tmp.path)
-  ##   tmp.rewind
-  ##   self.assertEqual(IO.read(filename), IO.read(tmp))
+  def test_from_file_ignore_sequences(self):
+    filename1 = "tests/testdata/sample.gfa"
+    filename2 = "tests/testdata/sample_wo_seqs.gfa"
+    gfa = gfapy.Gfa.from_file(filename1, ignore_sequences = True)
+    assert(gfa)
+    gfa_lines = str(gfa).split("\n")
+    line_no = 0
+    with open(filename2) as f:
+      for line in f:
+        line = line.rstrip()
+        self.assertEqual(gfa_lines[line_no], line)
+        line_no += 1
+
+  #def test_to_file(self):
+  #  filename = "tests/testdata/example1.gfa"
+  #  gfa = gfapy.Gfa.from_file(filename)
+  #  tmp = Tempfile("example1")
+  #  gfa.to_file(tmp.path)
+  #  tmp.rewind
+  #  self.assertEqual(IO.read(filename), IO.read(tmp))
 
   def test_from_string(self):
     lines = ["H\tVN:Z:1.0","S\t1\t*","S\t2\t*","S\t3\t*",

diff --git a/tests/test_gfapy_line_segment.py b/tests/test_gfapy_line_segment.py
@@ -37,6 +37,15 @@ def test_from_string(self):
     f=["S","2","*","LN:i:3"]
     gfapy.Line("\t".join(f))
 
+  def test_from_string_ignore_sequences(self):
+    fields = ["S","1","ACGTCACANNN","RC:i:1232","LN:i:11","ab:Z:abcd",
+            "FC:i:2321","KC:i:1212"]
+    string = "\t".join(fields)
+    l = gfapy.Line(string)
+    self.assertEqual("ACGTCACANNN", l.sequence)
+    l = gfapy.Line(string, ignore_sequences = True)
+    self.assertEqual(gfapy.is_placeholder(l.sequence), True)
+
   def test_forbidden_segment_names(self):
     gfapy.Line("S\tA+B\t*")
     gfapy.Line("S\tA-B\t*")

diff --git a/tests/testdata/sample_wo_seqs.gfa b/tests/testdata/sample_wo_seqs.gfa
@@ -0,0 +1,12 @@
+H	VN:Z:1.0
+H	ul:Z:https://github.com/sjackman/assembly-graph/blob/master/sample.gfa
+S	1	*
+S	2	*
+S	3	*	RC:i:4
+S	4	*
+S	5	*
+S	6	*
+L	1	+	2	+	5M
+L	3	+	2	+	0M
+L	3	+	4	-	1M1D2M1S
+L	4	-	5	+	0M