malonge · Neato-Nick · Oct 13, 2022 · Oct 13, 2022 · Oct 14, 2022 · Oct 14, 2022
diff --git a/ragtag.py b/ragtag.py
@@ -64,6 +64,7 @@ def main():
       delta2paf       delta to PAF file conversion
       paf2delta       PAF to delta file conversion
       updategff       update gff intervals
+      manualsplits    split an assembly at manually identified misjoins
 
 
     options:
@@ -129,6 +130,10 @@ def main():
         elif cmd == "paf2delta":
             subcmd = ["ragtag_paf2delta.py"] + sys.argv[2:]
             subprocess.call(subcmd)
+
+        elif cmd == "manualsplits":
+            subcmd = ["ragtag_manualsplits.py"] + sys.argv[2:]
+            subprocess.call(subcmd)
 
         else:
             print(description)

diff --git a/ragtag_manualsplits.py b/ragtag_manualsplits.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+
+"""
+MIT License
+
+Copyright (c) 2021 Michael Alonge <[email protected]>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+import os
+import re
+import sys
+import argparse
+
+import pysam
+
+from ragtag_correct import make_gff_interval_tree
+
+from ragtag_utilities.utilities import get_ragtag_version
+from ragtag_utilities.utilities import log
+from ragtag_utilities.AGPFile import AGPFile
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Split assembly at specified gaps', usage="ragtag.py manualsplits --bed <splits.bed> <asm.fa>")
+    parser.add_argument("asm", metavar="<asm.fa>", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
+    parser.add_argument("--bed", metavar="<splits.bed.gz>", type=str, default="", help="bgzipped and Tabix-indexed BED file of where to break sequences.\nIf translating from 1-based coordinates e.g. SAM/BAM/VCF:\nuse the coordinate to be split on as the End of each BED region, and Start should be End-1.")
+    parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.manualsplits.agp", help="output AGP file path [./ragtag.splitasm.agp]")
+    parser.add_argument("--gff", metavar="<features.gff>", type=str, default="", help="don't break sequences within gff intervals [null]")
+
+    # Parse the command line arguments
+    args = parser.parse_args()
+    if not args.asm:
+        parser.print_help()
+        print("\n** The assembly FASTA file is required **")
+        sys.exit()
+    if not args.bed:
+        parser.print_help()
+        print("\n** BED file of  is required **")
+
+    asm_fn = args.asm
+    agp_fn = args.o
+    gff_file = args.gff
+    bed_file = args.bed
+    # Initialize Tabix-indexed BED file
+    if bed_file:
+        log("INFO", "Breaking across positions from BED file")
+        bed = pysam.TabixFile(bed_file)
+        bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())]
+    if gff_file:
+        gff_file = os.path.abspath(gff_file)
+        it = make_gff_interval_tree(gff_file)
+        log("INFO", "Avoiding breaks within GFF intervals")
+    # Initialize the AGP file
+    agp = AGPFile(agp_fn, mode="w")
+    agp.add_pragma()
+    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))
+
+    # Process the FASTA file
+    new_header_idx = 0
+    fai = pysam.FastaFile(asm_fn)
+    for header in sorted(fai.references):
+        seq = fai.fetch(header).upper()
+        seq_len = fai.get_reference_length(header)
+        if bed_file and header in bed_contigs:
+            splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len]
+        else:
+            splits_pos = []
+        # Remove coordinates overlapping gff features
+        if gff_file:
+            #non_gff_breaks = dict()
+            if splits_pos:
+                new_splits = []
+                for split in splits_pos:
+                    if it[header][split[1]]:
+                        log("INFO", "Avoiding breaking %s at %d. This point intersects a feature in the gff file."
+                                    % (header, split[1]))
+                    else:
+                        new_splits.append(split)
+                if new_splits:
+                    splits_pos = new_splits
+
+        if not splits_pos:
+            new_header = "seq{0:08}".format(new_header_idx)
+            new_header_idx += 1
+            agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+")
+        else:
+            pid = 1
+            # The sequence doesn't start with a gap
+            new_header = "seq{0:08}".format(new_header_idx)
+            # if there's a split coordinate before the end, that needs to be the "new" end
+            # Make any/all splits necessary
+            if splits_pos:
+                for i in range(0, len(splits_pos)):
+                    # Only first iteration starts at 1 (object; component always starts at 1)
+                    if i == 0:
+                        obj_start=1
+                        obj_end=splits_pos[i][1]
+                        cmp_end=obj_end
+                        log("INFO", "Found first split in %s at %s" % (header, obj_end))
+                    else:
+                        obj_start=obj_end+1
+                        obj_end=splits_pos[i][1]
+                        cmp_end=obj_end - obj_start + 1
+                        log("INFO", "Found another split in %s at %s" % (header, obj_end))
+                    agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
+                    new_header_idx += 1
+                    new_header = "seq{0:08}".format(new_header_idx)
+                    pid += 1
+                # Add in final component to object - from last break till end of the sequence
+                obj_start=obj_end+1
+                obj_end=seq_len
+                cmp_end=obj_end - obj_start + 1
+                agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
+            else:  
+                agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+")
+            new_header_idx += 1
+            pid += 1
+
+        agp.write()
+
+    # Iterate over the AGP file and print the sequences
+    agp = AGPFile(agp_fn, mode="r")
+    for line in agp.iterate_lines():
+        if not line.is_gap:
+            obj, comp, obj_beg, obj_end = line.obj, line.comp, line.obj_beg, line.obj_end
+            print(">" + comp)
+            print(fai.fetch(obj, obj_beg-1, obj_end))
+
+    fai.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ragtag_splitasm.py b/ragtag_splitasm.py
@@ -24,21 +24,26 @@
 SOFTWARE.
 """
 
+import os
 import re
 import sys
 import argparse
 
 import pysam
 
+from ragtag_correct import make_gff_interval_tree
+
 from ragtag_utilities.utilities import get_ragtag_version
+from ragtag_utilities.utilities import log
 from ragtag_utilities.AGPFile import AGPFile
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Split sequencs at gaps', usage="ragtag.py splitasm <asm.fa>")
+    parser = argparse.ArgumentParser(description='Split sequences at gaps', usage="ragtag.py splitasm <asm.fa>")
     parser.add_argument("asm", metavar="<asm.fa>", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
     parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]")
     parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]")
+    parser.add_argument("--gff", metavar="<features.gff>", type=str, default="", help="don't break sequences within gff intervals [null]")
 
     # Parse the command line arguments
     args = parser.parse_args()
@@ -50,7 +55,11 @@ def main():
     asm_fn = args.asm
     min_gap_size = args.n
     agp_fn = args.o
-
+    gff_file = args.gff
+    if gff_file:
+        gff_file = os.path.abspath(gff_file)
+        it = make_gff_interval_tree(gff_file)
+        log("INFO", "Avoiding breaks within GFF intervals")
     # Initialize the AGP file
     agp = AGPFile(agp_fn, mode="w")
     agp.add_pragma()
@@ -63,8 +72,21 @@ def main():
         seq = fai.fetch(header).upper()
         seq_len = fai.get_reference_length(header)
         gap_coords = [(i.start(), i.end()) for i in re.finditer(r'N+', seq) if i.end() - i.start() > min_gap_size]
-
-        if not gap_coords:
+        # Remove coordinates overlapping gff features
+        if gff_file:
+            #non_gff_breaks = dict()
+            new_breaks = []
+            for gap in gap_coords:
+                if it[header][gap[0]] or it[header][gap[1]]:
+                    log("INFO", "Avoiding breaking %s between %d-%d. This point intersects a feature in the gff file."
+                                % (header, gap[0], gap[1]))
+                else:
+                    new_breaks.append(gap)
+            if new_breaks:
+                #non_gff_breaks[header] = new_breaks
+                gap_coords = new_breaks
+
+        if not gap_coords and not splits_pos:
             new_header = "seq{0:08}".format(new_header_idx)
             new_header_idx += 1
             agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+")

diff --git a/ragtag_update_gff.py b/ragtag_update_gff.py
@@ -28,60 +28,92 @@
 import sys
 import argparse
 from collections import defaultdict
+import re
 
 from intervaltree import IntervalTree
 
 from ragtag_utilities.utilities import log, get_ragtag_version
 from ragtag_utilities.AGPFile import AGPFile
 
 
-def sub_update(gff_file, agp_file):
+def sub_update(gff_file, agp_file, is_split):
     # Make a dictionary associating each original sequence with an interval tree of component sequences
     trans = defaultdict(IntervalTree)
     agp = AGPFile(agp_file, mode="r")
     for agp_line in agp.iterate_lines():
 
         # Check that the agp file looks correct for this task
-        if agp_line.orientation == "-":
-            raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.")
-        if not agp_line.comp_type == "W":
+        if not agp_line.comp_type == "W" and not is_split:
             raise ValueError("The placement BED file is not formatted correctly. All lines should be WGS contig (W).")
-        if agp_line.is_gap:
+        if agp_line.is_gap and not is_split:
             raise ValueError("There should be no gaps in the correction AGP file.")
+        # gaps don't have the orientation attribute so this check should come last
+        if not is_split:
+            if agp_line.orientation == "-":
+                raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.")
 
-        start, end = agp_line.obj_beg - 1, agp_line.obj_end
-        trans[agp_line.obj][start:end] = agp_line.comp
+        # Cant adjust gap comps because gaps aren't assembled into object
+        if agp_line.comp_type == "W":
+            start, end = agp_line.obj_beg - 1, agp_line.obj_end
+            trans[agp_line.obj][start:end] = agp_line.comp
 
     # Iterate through the gff intervals and update them according to trans
     with open(gff_file, "r") as f:
+        attr_id = re.compile('(?<=ID=).*')
+        attr_parent = re.compile('(?<=Parent=).*')
+        ovlp_ids = []
         for line in f:
             line = line.rstrip()
             if line.startswith("#"):
                 print(line)  # Print this comment line
             else:
                 fields = line.split("\t")
                 h, s, e = fields[0], int(fields[3]), int(fields[4])
+                attributes = fields[8]
+                parent = None
+                feat_id = None
+                for attr in attributes.split(";"):
+                    feat_id_matches = attr_id.findall(attr)
+                    parent_matches = attr_parent.findall(attr)
+                    if feat_id_matches:
+                        feat_id = feat_id_matches
+                    if parent_matches:
+                        parent = parent_matches
+
+
                 s -= 1  # Keep everything zero-indexed
 
                 if h not in trans:
                     raise ValueError("Inconsistent input files.")
 
                 ovlps = trans[h][s:e]
                 if len(ovlps) > 1:
-                    raise ValueError(
-                        "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
-                    )
+                    #raise ValueError(
+                    #    "%s:%d-%d in the gff file overlaps two sub-sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
+                    #)
+                    log("WARNING", "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Skipping %s and any child features. To retain feature, include '--gff' and re-run 'correct' or 'splitasm'" % (h, s, e, feat_id))
+                    if feat_id:
+                        ovlp_ids.append(feat_id)
+                    continue
                 if len(ovlps) < 1:
                     raise ValueError("The placement BED file is not formatted correctly.")
 
-                # Get the data from the overlapping interval and print the new line
-                o = list(ovlps)[0]
-                new_s = s - o.begin
-                new_e = e - o.begin
-                fields[0] = o.data
-                fields[3] = str(new_s + 1)  # back to one-based indexing for gff format
-                fields[4] = str(new_e)
-                print("\t".join(fields))
+                # Check if feat needs to be skipped because of ID or parent match,
+                # complex solution ensuring orphan feats aren't added e.g. parent gene overlaps but not its stop codon
+                if parent and parent in ovlp_ids:
+                    log("WARNING", "Parent %s already excluded for overlapping two sub sequences. Skipping %s." % (parent, feat_id))
+                    # To access any potential level 3 feats, add parent to ID exclusion list
+                    ovlp_ids.append(feat_id)
+                    continue
+                else:
+                    # Get the data from the overlapping interval and print the new line
+                    o = list(ovlps)[0]
+                    new_s = s - o.begin
+                    new_e = e - o.begin
+                    fields[0] = o.data
+                    fields[3] = str(new_s + 1)  # back to one-based indexing for gff format
+                    fields[4] = str(new_e)
+                    print("\t".join(fields))
 
 
 def sup_update(gff_file, agp_file):
@@ -132,10 +164,11 @@ def sup_update(gff_file, agp_file):
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Update gff intervals given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
+    parser = argparse.ArgumentParser(description="Update gff intervals from given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
     parser.add_argument("gff", nargs='?', default="", metavar="<genes.gff>", type=str, help="gff file")
     parser.add_argument("agp", nargs='?', default="", metavar="<ragtag.*.agp>", type=str, help="agp file")
     parser.add_argument("-c", action="store_true", default=False, help="update for misassembly correction (ragtag.correction.agp)")
+    parser.add_argument("-s", action="store_true", default=False, help="update for assembly splitting from RagTag")
 
     args = parser.parse_args()
 
@@ -149,9 +182,10 @@ def main():
     gff_file = os.path.abspath(args.gff)
     agp_file = os.path.abspath(args.agp)
     is_sub = args.c
+    is_split = args.s
 
-    if is_sub:
-        sub_update(gff_file, agp_file)
+    if is_sub or is_split:
+        sub_update(gff_file, agp_file, is_split)
     else:
         sup_update(gff_file, agp_file)