From 8650119e0812bc69333596c21b084b1fa3d816aa Mon Sep 17 00:00:00 2001
From: Nick Carleson <nick.carleson@gmail.com>
Date: Thu, 13 Oct 2022 14:22:34 -0700
Subject: [PATCH 1/9] Can update a gff from splitasm if no feats ovlp gaps

---
 ragtag_update_gff.py | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)
 mode change 100644 => 100755 ragtag_update_gff.py

diff --git a/ragtag_update_gff.py b/ragtag_update_gff.py
old mode 100644
new mode 100755
index 0c5cccc..6a99dab
--- a/ragtag_update_gff.py
+++ b/ragtag_update_gff.py
@@ -28,6 +28,7 @@
 import sys
 import argparse
 from collections import defaultdict
+import warnings
 
 from intervaltree import IntervalTree
 
@@ -35,25 +36,30 @@
 from ragtag_utilities.AGPFile import AGPFile
 
 
-def sub_update(gff_file, agp_file):
+def sub_update(gff_file, agp_file, is_split):
     # Make a dictionary associating each original sequence with an interval tree of component sequences
     trans = defaultdict(IntervalTree)
     agp = AGPFile(agp_file, mode="r")
     for agp_line in agp.iterate_lines():
 
         # Check that the agp file looks correct for this task
-        if agp_line.orientation == "-":
-            raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.")
-        if not agp_line.comp_type == "W":
+        if not agp_line.comp_type == "W" and not is_split:
             raise ValueError("The placement BED file is not formatted correctly. All lines should be WGS contig (W).")
-        if agp_line.is_gap:
+        if agp_line.is_gap and not is_split:
             raise ValueError("There should be no gaps in the correction AGP file.")
+        # gaps don't have the orientation attribute so this check should come last
+        if not is_split:
+            if agp_line.orientation == "-":
+                raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.")
 
-        start, end = agp_line.obj_beg - 1, agp_line.obj_end
-        trans[agp_line.obj][start:end] = agp_line.comp
+        # Cant adjust gap comps because gaps aren't assembled into object
+        if agp_line.comp_type == "W":
+            start, end = agp_line.obj_beg - 1, agp_line.obj_end
+            trans[agp_line.obj][start:end] = agp_line.comp
 
     # Iterate through the gff intervals and update them according to trans
     with open(gff_file, "r") as f:
+        print(list(trans.keys())[0])
         for line in f:
             line = line.rstrip()
             if line.startswith("#"):
@@ -61,6 +67,9 @@ def sub_update(gff_file, agp_file):
             else:
                 fields = line.split("\t")
                 h, s, e = fields[0], int(fields[3]), int(fields[4])
+                #all_attributes = fields[8]
+                #attributes = all_attributes.split(";")
+
                 s -= 1  # Keep everything zero-indexed
 
                 if h not in trans:
@@ -68,9 +77,13 @@ def sub_update(gff_file, agp_file):
 
                 ovlps = trans[h][s:e]
                 if len(ovlps) > 1:
+                    #for ovlp in ovlps:
+                    #    print(ovlp)
+                    #warnings.warn("%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Ignoring feature. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e))
                     raise ValueError(
                         "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
                     )
+                    #skipped_parent_feats =
                 if len(ovlps) < 1:
                     raise ValueError("The placement BED file is not formatted correctly.")
 
@@ -84,7 +97,7 @@ def sub_update(gff_file, agp_file):
                 print("\t".join(fields))
 
 
-def sup_update(gff_file, agp_file):
+def sup_update(gff_file, agp_file, is_split):
     # Make a dictionary associating each original sequence with the destination sequence
     trans = {}
     strands = {}
@@ -132,10 +145,11 @@ def sup_update(gff_file, agp_file):
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Update gff intervals given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
+    parser = argparse.ArgumentParser(description="Update gff intervals from `correct`, `scaffold`, or `splitasm` given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
     parser.add_argument("gff", nargs='?', default="", metavar="<genes.gff>", type=str, help="gff file")
     parser.add_argument("agp", nargs='?', default="", metavar="<ragtag.*.agp>", type=str, help="agp file")
     parser.add_argument("-c", action="store_true", default=False, help="update for misassembly correction (ragtag.correction.agp)")
+    parser.add_argument("-s", action="store_true", default=False, help="update for assembly splitting from RagTag")
 
     args = parser.parse_args()
 
@@ -149,9 +163,10 @@ def main():
     gff_file = os.path.abspath(args.gff)
     agp_file = os.path.abspath(args.agp)
     is_sub = args.c
+    is_split = args.s
 
-    if is_sub:
-        sub_update(gff_file, agp_file)
+    if is_sub or is_split:
+        sub_update(gff_file, agp_file, is_split)
     else:
         sup_update(gff_file, agp_file)
 

From cc73bf2e630eb9002874a2ac409f40b6514c1781 Mon Sep 17 00:00:00 2001
From: Nick Carleson <nick.carleson@gmail.com>
Date: Thu, 13 Oct 2022 16:17:30 -0700
Subject: [PATCH 2/9] Skips features that span gaps in AGP

---
 ragtag_update_gff.py | 58 ++++++++++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 18 deletions(-)

diff --git a/ragtag_update_gff.py b/ragtag_update_gff.py
index 6a99dab..30b2241 100755
--- a/ragtag_update_gff.py
+++ b/ragtag_update_gff.py
@@ -29,6 +29,7 @@
 import argparse
 from collections import defaultdict
 import warnings
+import re
 
 from intervaltree import IntervalTree
 
@@ -59,7 +60,9 @@ def sub_update(gff_file, agp_file, is_split):
 
     # Iterate through the gff intervals and update them according to trans
     with open(gff_file, "r") as f:
-        print(list(trans.keys())[0])
+        attr_id = re.compile('(?<=ID=).*')
+        attr_parent = re.compile('(?<=Parent=).*')
+        ovlp_ids = []
         for line in f:
             line = line.rstrip()
             if line.startswith("#"):
@@ -67,8 +70,15 @@ def sub_update(gff_file, agp_file, is_split):
             else:
                 fields = line.split("\t")
                 h, s, e = fields[0], int(fields[3]), int(fields[4])
-                #all_attributes = fields[8]
-                #attributes = all_attributes.split(";")
+                attributes = fields[8]
+                for attr in attributes.split(";"):
+                    feat_id_matches = attr_id.findall(attr)
+                    parent_matches = attr_parent.findall(attr)
+                    if feat_id_matches:
+                        feat_id = feat_id_matches
+                    if parent_matches:
+                        parent = parent_matches
+
 
                 s -= 1  # Keep everything zero-indexed
 
@@ -77,24 +87,36 @@ def sub_update(gff_file, agp_file, is_split):
 
                 ovlps = trans[h][s:e]
                 if len(ovlps) > 1:
-                    #for ovlp in ovlps:
-                    #    print(ovlp)
-                    #warnings.warn("%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Ignoring feature. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e))
-                    raise ValueError(
-                        "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
-                    )
-                    #skipped_parent_feats =
+                    warnings.warn("%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e))
+                    #raise ValueError(
+                    #    "%s:%d-%d in the gff file overlaps two sub-sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
+                    #)
+                    if feat_id:
+                        ovlp_ids.append(feat_id)
                 if len(ovlps) < 1:
                     raise ValueError("The placement BED file is not formatted correctly.")
 
-                # Get the data from the overlapping interval and print the new line
-                o = list(ovlps)[0]
-                new_s = s - o.begin
-                new_e = e - o.begin
-                fields[0] = o.data
-                fields[3] = str(new_s + 1)  # back to one-based indexing for gff format
-                fields[4] = str(new_e)
-                print("\t".join(fields))
+                # Check if feat needs to be skipped because of ID or parent match,
+                # complex solution ensuring orphan feats aren't added e.g. parent gene overlaps but not its stop codon
+                if (feat_id in ovlp_ids) or (parent_matches and parent in ovlp_ids):
+                    #if (feat_id in ovlp_ids):
+                    #    print("ID %s found in list" % (feat_id))
+                    #if (parent in ovlp_ids):
+                    #    print("Parent %s found in list" % (parent))
+                    #print("Ignoring %s spanning a gap between two sub sequences %s:%d-%d" % (feat_id, h, s, e))
+                    # To access level 3 feats, add parent to ID exclusion list
+                    if parent:
+                        ovlp_ids.append(feat_id)
+                    continue
+                else:
+                    # Get the data from the overlapping interval and print the new line
+                    o = list(ovlps)[0]
+                    new_s = s - o.begin
+                    new_e = e - o.begin
+                    fields[0] = o.data
+                    fields[3] = str(new_s + 1)  # back to one-based indexing for gff format
+                    fields[4] = str(new_e)
+                    print("\t".join(fields))
 
 
 def sup_update(gff_file, agp_file, is_split):

From 228004d306f444c145cf16a3c81ae4fe1a2dd7e8 Mon Sep 17 00:00:00 2001
From: Nick Carleson <nick.carleson@gmail.com>
Date: Thu, 13 Oct 2022 17:08:27 -0700
Subject: [PATCH 3/9] Bug fix -s not required for sup_update

---
 ragtag_update_gff.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/ragtag_update_gff.py b/ragtag_update_gff.py
index 30b2241..b38e960 100755
--- a/ragtag_update_gff.py
+++ b/ragtag_update_gff.py
@@ -28,7 +28,6 @@
 import sys
 import argparse
 from collections import defaultdict
-import warnings
 import re
 
 from intervaltree import IntervalTree
@@ -87,10 +86,10 @@ def sub_update(gff_file, agp_file, is_split):
 
                 ovlps = trans[h][s:e]
                 if len(ovlps) > 1:
-                    warnings.warn("%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e))
                     #raise ValueError(
                     #    "%s:%d-%d in the gff file overlaps two sub-sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
                     #)
+                    log("WARNING", "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Skipping %s. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e, feat_id))
                     if feat_id:
                         ovlp_ids.append(feat_id)
                 if len(ovlps) < 1:
@@ -119,7 +118,7 @@ def sub_update(gff_file, agp_file, is_split):
                     print("\t".join(fields))
 
 
-def sup_update(gff_file, agp_file, is_split):
+def sup_update(gff_file, agp_file):
     # Make a dictionary associating each original sequence with the destination sequence
     trans = {}
     strands = {}
@@ -167,7 +166,7 @@ def sup_update(gff_file, agp_file, is_split):
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Update gff intervals from `correct`, `scaffold`, or `splitasm` given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
+    parser = argparse.ArgumentParser(description="Update gff intervals from given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
     parser.add_argument("gff", nargs='?', default="", metavar="<genes.gff>", type=str, help="gff file")
     parser.add_argument("agp", nargs='?', default="", metavar="<ragtag.*.agp>", type=str, help="agp file")
     parser.add_argument("-c", action="store_true", default=False, help="update for misassembly correction (ragtag.correction.agp)")

From cf0bdb23d54a95cac2ad9980ab175e0489a69f4f Mon Sep 17 00:00:00 2001
From: Nick Carleson <nick.carleson@gmail.com>
Date: Thu, 13 Oct 2022 17:11:03 -0700
Subject: [PATCH 4/9] Added option to not split assembly across GFF features

---
 ragtag_splitasm.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 ragtag_splitasm.py

diff --git a/ragtag_splitasm.py b/ragtag_splitasm.py
old mode 100644
new mode 100755
index d050ac8..9b34fd9
--- a/ragtag_splitasm.py
+++ b/ragtag_splitasm.py
@@ -24,13 +24,17 @@
 SOFTWARE.
 """
 
+import os
 import re
 import sys
 import argparse
 
 import pysam
 
+from ragtag_correct import make_gff_interval_tree
+
 from ragtag_utilities.utilities import get_ragtag_version
+from ragtag_utilities.utilities import log
 from ragtag_utilities.AGPFile import AGPFile
 
 
@@ -39,6 +43,7 @@ def main():
     parser.add_argument("asm", metavar="<asm.fa>", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
     parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]")
     parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]")
+    parser.add_argument("--gff", metavar="<features.gff>", type=str, default="", help="don't break sequences within gff intervals [null]")
 
     # Parse the command line arguments
     args = parser.parse_args()
@@ -50,7 +55,11 @@ def main():
     asm_fn = args.asm
     min_gap_size = args.n
     agp_fn = args.o
-
+    gff_file = args.gff
+    if gff_file:
+        gff_file = os.path.abspath(gff_file)
+        it = make_gff_interval_tree(gff_file)
+        log("INFO", "Avoiding breaks within GFF intervals")
     # Initialize the AGP file
     agp = AGPFile(agp_fn, mode="w")
     agp.add_pragma()
@@ -63,6 +72,19 @@ def main():
         seq = fai.fetch(header).upper()
         seq_len = fai.get_reference_length(header)
         gap_coords = [(i.start(), i.end()) for i in re.finditer(r'N+', seq) if i.end() - i.start() > min_gap_size]
+        # Remove coordinates overlapping gff features
+        if gff_file:
+            #non_gff_breaks = dict()
+            new_breaks = []
+            for gap in gap_coords:
+                if it[header][gap[0]] or it[header][gap[1]]:
+                    log("INFO", "Avoiding breaking %s between %d-%d. This point intersects a feature in the gff file."
+                                % (header, gap[0], gap[1]))
+                else:
+                    new_breaks.append(gap)
+            if new_breaks:
+                #non_gff_breaks[header] = new_breaks
+                gap_coords = new_breaks
 
         if not gap_coords:
             new_header = "seq{0:08}".format(new_header_idx)

From 4d8991fb1e37f7487c904d51ee39a6a7b036cc80 Mon Sep 17 00:00:00 2001
From: Nick Carleson <nick.carleson@gmail.com>
Date: Tue, 25 Oct 2022 16:29:14 -0700
Subject: [PATCH 5/9] Fix: child features with parents that were skipped did
 not get skipped

---
 ragtag_update_gff.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/ragtag_update_gff.py b/ragtag_update_gff.py
index b38e960..18ca364 100755
--- a/ragtag_update_gff.py
+++ b/ragtag_update_gff.py
@@ -70,6 +70,8 @@ def sub_update(gff_file, agp_file, is_split):
                 fields = line.split("\t")
                 h, s, e = fields[0], int(fields[3]), int(fields[4])
                 attributes = fields[8]
+                parent = None
+                feat_id = None
                 for attr in attributes.split(";"):
                     feat_id_matches = attr_id.findall(attr)
                     parent_matches = attr_parent.findall(attr)
@@ -89,23 +91,19 @@ def sub_update(gff_file, agp_file, is_split):
                     #raise ValueError(
                     #    "%s:%d-%d in the gff file overlaps two sub-sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
                     #)
-                    log("WARNING", "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Skipping %s. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e, feat_id))
+                    log("WARNING", "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Skipping %s and any child features. To retain feature, include '--gff' and re-run 'correct' or 'splitasm'" % (h, s, e, feat_id))
                     if feat_id:
                         ovlp_ids.append(feat_id)
+                    continue
                 if len(ovlps) < 1:
                     raise ValueError("The placement BED file is not formatted correctly.")
 
                 # Check if feat needs to be skipped because of ID or parent match,
                 # complex solution ensuring orphan feats aren't added e.g. parent gene overlaps but not its stop codon
-                if (feat_id in ovlp_ids) or (parent_matches and parent in ovlp_ids):
-                    #if (feat_id in ovlp_ids):
-                    #    print("ID %s found in list" % (feat_id))
-                    #if (parent in ovlp_ids):
-                    #    print("Parent %s found in list" % (parent))
-                    #print("Ignoring %s spanning a gap between two sub sequences %s:%d-%d" % (feat_id, h, s, e))
-                    # To access level 3 feats, add parent to ID exclusion list
-                    if parent:
-                        ovlp_ids.append(feat_id)
+                if parent and parent in ovlp_ids:
+                    log("WARNING", "Parent %s already excluded for overlapping two sub sequences. Skipping %s." % (parent, feat_id))
+                    # To access any potential level 3 feats, add parent to ID exclusion list
+                    ovlp_ids.append(feat_id)
                     continue
                 else:
                     # Get the data from the overlapping interval and print the new line

From 081857d9f9821e996c705c3586b12c170dc63cf6 Mon Sep 17 00:00:00 2001
From: Nick Carleson <nick.carleson@gmail.com>
Date: Wed, 30 Nov 2022 11:35:31 -0800
Subject: [PATCH 6/9] Splits contigs before any gaps using BED file. Ensure no
 new gaps will be created

---
 ragtag_splitasm.py | 67 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 64 insertions(+), 3 deletions(-)

diff --git a/ragtag_splitasm.py b/ragtag_splitasm.py
index 9b34fd9..26cf07b 100755
--- a/ragtag_splitasm.py
+++ b/ragtag_splitasm.py
@@ -39,11 +39,12 @@
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Split sequencs at gaps', usage="ragtag.py splitasm <asm.fa>")
+    parser = argparse.ArgumentParser(description='Split sequences at gaps', usage="ragtag.py splitasm <asm.fa>")
     parser.add_argument("asm", metavar="<asm.fa>", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
     parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]")
     parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]")
     parser.add_argument("--gff", metavar="<features.gff>", type=str, default="", help="don't break sequences within gff intervals [null]")
+    parser.add_argument("--bed", metavar="<splits.bed>", type=str, default="", help="BED file of where to break sequences [null]")
 
     # Parse the command line arguments
     args = parser.parse_args()
@@ -56,6 +57,17 @@ def main():
     min_gap_size = args.n
     agp_fn = args.o
     gff_file = args.gff
+    bed_file = args.bed
+    # Initialize Tabix-indexed BED file
+    if bed_file:
+        log("INFO", "Breaking across positions from BED file")
+        bed = pysam.TabixFile(bed_file)
+        bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())]
+        #print(bed_contigs)
+        # Fetch will break if sequence is absent
+        #for row in bed.fetch("seq00000000", parser = pysam.asBed()):
+        #    print(row)
+            #log("INFO", row.contig)
     if gff_file:
         gff_file = os.path.abspath(gff_file)
         it = make_gff_interval_tree(gff_file)
@@ -65,6 +77,8 @@ def main():
     agp.add_pragma()
     agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))
 
+
+
     # Process the FASTA file
     new_header_idx = 0
     fai = pysam.FastaFile(asm_fn)
@@ -72,6 +86,12 @@ def main():
         seq = fai.fetch(header).upper()
         seq_len = fai.get_reference_length(header)
         gap_coords = [(i.start(), i.end()) for i in re.finditer(r'N+', seq) if i.end() - i.start() > min_gap_size]
+        if bed_file and header in bed_contigs:
+            splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len]
+        else:
+            splits_pos = []
+        #log("INFO", "Splitting %s at %s" % (header, splits_pos))
+        #print("Splitting %s at %s" % (header, splits_pos))
         # Remove coordinates overlapping gff features
         if gff_file:
             #non_gff_breaks = dict()
@@ -82,11 +102,22 @@ def main():
                                 % (header, gap[0], gap[1]))
                 else:
                     new_breaks.append(gap)
+            if splits_pos:
+                new_splits = []
+                for split in splits_pos:
+                    if it[header][split[1]]:
+                        log("INFO", "Avoiding breaking %s at %d. This point intersects a feature in the gff file."
+                                    % (header, split[1]))
+                    else:
+                        new_splits.append(split)
+                if new_splits:
+                    splits_pos = new_splits
             if new_breaks:
                 #non_gff_breaks[header] = new_breaks
                 gap_coords = new_breaks
+            
 
-        if not gap_coords:
+        if not gap_coords and not splits_pos:
             new_header = "seq{0:08}".format(new_header_idx)
             new_header_idx += 1
             agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+")
@@ -96,7 +127,37 @@ def main():
             if gap_coords[0][0]:
                 # The sequence doesn't start with a gap
                 new_header = "seq{0:08}".format(new_header_idx)
-                agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+")
+                # if there's a split coordinate before the end, that needs to be the "new" end
+                # First, if/ELSE allows current behavior to be the same
+                # Make any/all splits necessary before the *first* gap sequence
+                # *later*: will need to check for split coords between every gap coord
+                if splits_pos:
+                    log("INFO", "Should break %s %s times at gaps %s" % (header, len(splits_pos), str(splits_pos)))
+                    for i in range(0, len(splits_pos)):
+                        log("INFO", "Breaking %s at gap %s" % (header, str(splits_pos)))
+                        if splits_pos[i][1] < gap_coords[0][0]:
+                            # Only first iteration starts at 1 (object; component always starts at 1)
+                            if i == 0:
+                                obj_start=1
+                                obj_end=splits_pos[i][1]
+                                cmp_end=obj_end
+                                log("INFO", "Found first split in %s at %s" % (header, obj_end))
+                            else:
+                                obj_start=obj_end+1
+                                obj_end=splits_pos[i][1]
+                                cmp_end=obj_end - obj_start + 1
+                                log("INFO", "Found another split in %s at %s producing seq of length %s" % (header, obj_end, cmp_end))
+                            agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
+                            new_header_idx += 1
+                            new_header = "seq{0:08}".format(new_header_idx)
+                            pid += 1
+                    # Add in final component to object - from last break till end of the sequence
+                    obj_start=obj_end+1
+                    obj_end=seq_len
+                    cmp_end=obj_end - obj_start + 1
+                    agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
+                else:  
+                    agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+")
                 new_header_idx += 1
                 pid += 1
 

From 80b78fbdf40c76aab1407eb1ca4a8b35753197a6 Mon Sep 17 00:00:00 2001
From: Nick Carleson <nick.carleson@gmail.com>
Date: Wed, 30 Nov 2022 12:13:37 -0800
Subject: [PATCH 7/9] Separate manual split functionality into new utility

---
 ragtag.py              |   5 ++
 ragtag_manualsplits.py | 160 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 165 insertions(+)
 mode change 100644 => 100755 ragtag.py
 create mode 100755 ragtag_manualsplits.py

diff --git a/ragtag.py b/ragtag.py
old mode 100644
new mode 100755
index 66dee59..d0d9d92
--- a/ragtag.py
+++ b/ragtag.py
@@ -64,6 +64,7 @@ def main():
       delta2paf       delta to PAF file conversion
       paf2delta       PAF to delta file conversion
       updategff       update gff intervals
+      manualsplits    split an assembly at manually identified misjoins
       
 
     options:
@@ -129,6 +130,10 @@ def main():
         elif cmd == "paf2delta":
             subcmd = ["ragtag_paf2delta.py"] + sys.argv[2:]
             subprocess.call(subcmd)
+            
+        elif cmd == "manualsplits":
+            subcmd = ["ragtag_manualsplits.py"] + sys.argv[2:]
+            subprocess.call(subcmd)
 
         else:
             print(description)
diff --git a/ragtag_manualsplits.py b/ragtag_manualsplits.py
new file mode 100755
index 0000000..dfea756
--- /dev/null
+++ b/ragtag_manualsplits.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+"""
+MIT License
+
+Copyright (c) 2021 Michael Alonge <malonge11@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+
+import os
+import re
+import sys
+import argparse
+
+import pysam
+
+from ragtag_correct import make_gff_interval_tree
+
+from ragtag_utilities.utilities import get_ragtag_version
+from ragtag_utilities.utilities import log
+from ragtag_utilities.AGPFile import AGPFile
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Split assembly at specified gaps', usage="ragtag.py manualsplits --bed <splits.bed> <asm.fa>")
+    parser.add_argument("asm", metavar="<asm.fa>", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
+    parser.add_argument("--bed", metavar="<splits.bed.gz>", type=str, default="", help="bgzipped and Tabix-indexed BED file of where to break sequences.\nIf translating from 1-based coordinates e.g. SAM/BAM/VCF:\nuse the coordinate to be split on as the End of each BED region, and Start should be End-1.")
+    parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.manualsplits.agp", help="output AGP file path [./ragtag.splitasm.agp]")
+    parser.add_argument("--gff", metavar="<features.gff>", type=str, default="", help="don't break sequences within gff intervals [null]")
+
+    # Parse the command line arguments
+    args = parser.parse_args()
+    if not args.asm:
+        parser.print_help()
+        print("\n** The assembly FASTA file is required **")
+        sys.exit()
+    if not args.bed:
+        parser.print_help()
+        print("\n** BED file of  is required **")
+
+    asm_fn = args.asm
+    agp_fn = args.o
+    gff_file = args.gff
+    bed_file = args.bed
+    # Initialize Tabix-indexed BED file
+    if bed_file:
+        log("INFO", "Breaking across positions from BED file")
+        bed = pysam.TabixFile(bed_file)
+        bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())]
+        #print(bed_contigs)
+        # Fetch will break if sequence is absent
+        #for row in bed.fetch("seq00000000", parser = pysam.asBed()):
+        #    print(row)
+            #log("INFO", row.contig)
+    if gff_file:
+        gff_file = os.path.abspath(gff_file)
+        it = make_gff_interval_tree(gff_file)
+        log("INFO", "Avoiding breaks within GFF intervals")
+    # Initialize the AGP file
+    agp = AGPFile(agp_fn, mode="w")
+    agp.add_pragma()
+    agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))
+
+    # Process the FASTA file
+    new_header_idx = 0
+    fai = pysam.FastaFile(asm_fn)
+    for header in sorted(fai.references):
+        seq = fai.fetch(header).upper()
+        seq_len = fai.get_reference_length(header)
+        if bed_file and header in bed_contigs:
+            splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len]
+        else:
+            splits_pos = []
+        #log("INFO", "Splitting %s at %s" % (header, splits_pos))
+        #print("Splitting %s at %s" % (header, splits_pos))
+        # Remove coordinates overlapping gff features
+        if gff_file:
+            #non_gff_breaks = dict()
+            if splits_pos:
+                new_splits = []
+                for split in splits_pos:
+                    if it[header][split[1]]:
+                        log("INFO", "Avoiding breaking %s at %d. This point intersects a feature in the gff file."
+                                    % (header, split[1]))
+                    else:
+                        new_splits.append(split)
+                if new_splits:
+                    splits_pos = new_splits
+            
+        if not splits_pos:
+            new_header = "seq{0:08}".format(new_header_idx)
+            new_header_idx += 1
+            agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+")
+        else:
+            pid = 1
+            # The sequence doesn't start with a gap
+            new_header = "seq{0:08}".format(new_header_idx)
+            # if there's a split coordinate before the end, that needs to be the "new" end
+            # Make any/all splits necessary
+            if splits_pos:
+                log("INFO", "Should break %s %s times at gaps %s" % (header, len(splits_pos), str(splits_pos)))
+                for i in range(0, len(splits_pos)):
+                    log("INFO", "Breaking %s at gap %s" % (header, str(splits_pos)))
+                    # Only first iteration starts at 1 (object; component always starts at 1)
+                    if i == 0:
+                        obj_start=1
+                        obj_end=splits_pos[i][1]
+                        cmp_end=obj_end
+                        log("INFO", "Found first split in %s at %s" % (header, obj_end))
+                    else:
+                        obj_start=obj_end+1
+                        obj_end=splits_pos[i][1]
+                        cmp_end=obj_end - obj_start + 1
+                        log("INFO", "Found another split in %s at %s producing seq of length %s" % (header, obj_end, cmp_end))
+                    agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
+                    new_header_idx += 1
+                    new_header = "seq{0:08}".format(new_header_idx)
+                    pid += 1
+                # Add in final component to object - from last break till end of the sequence
+                obj_start=obj_end+1
+                obj_end=seq_len
+                cmp_end=obj_end - obj_start + 1
+                agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
+            else:  
+                agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+")
+            new_header_idx += 1
+            pid += 1
+
+        agp.write()
+
+    # Iterate over the AGP file and print the sequences
+    agp = AGPFile(agp_fn, mode="r")
+    for line in agp.iterate_lines():
+        if not line.is_gap:
+            obj, comp, obj_beg, obj_end = line.obj, line.comp, line.obj_beg, line.obj_end
+            print(">" + comp)
+            print(fai.fetch(obj, obj_beg-1, obj_end))
+
+    fai.close()
+
+
+if __name__ == "__main__":
+    main()

From 1c295a323c271fc2d3565c00b835b80a5ea08f25 Mon Sep 17 00:00:00 2001
From: Nick Carleson <nick.carleson@gmail.com>
Date: Wed, 30 Nov 2022 12:22:33 -0800
Subject: [PATCH 8/9] Revert manual-split functionality out of splitasm

---
 ragtag_splitasm.py | 63 +---------------------------------------------
 1 file changed, 1 insertion(+), 62 deletions(-)

diff --git a/ragtag_splitasm.py b/ragtag_splitasm.py
index 26cf07b..1e63f39 100755
--- a/ragtag_splitasm.py
+++ b/ragtag_splitasm.py
@@ -44,7 +44,6 @@ def main():
     parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]")
     parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]")
     parser.add_argument("--gff", metavar="<features.gff>", type=str, default="", help="don't break sequences within gff intervals [null]")
-    parser.add_argument("--bed", metavar="<splits.bed>", type=str, default="", help="BED file of where to break sequences [null]")
 
     # Parse the command line arguments
     args = parser.parse_args()
@@ -57,17 +56,6 @@ def main():
     min_gap_size = args.n
     agp_fn = args.o
     gff_file = args.gff
-    bed_file = args.bed
-    # Initialize Tabix-indexed BED file
-    if bed_file:
-        log("INFO", "Breaking across positions from BED file")
-        bed = pysam.TabixFile(bed_file)
-        bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())]
-        #print(bed_contigs)
-        # Fetch will break if sequence is absent
-        #for row in bed.fetch("seq00000000", parser = pysam.asBed()):
-        #    print(row)
-            #log("INFO", row.contig)
     if gff_file:
         gff_file = os.path.abspath(gff_file)
         it = make_gff_interval_tree(gff_file)
@@ -77,8 +65,6 @@ def main():
     agp.add_pragma()
     agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))
 
-
-
     # Process the FASTA file
     new_header_idx = 0
     fai = pysam.FastaFile(asm_fn)
@@ -86,12 +72,6 @@ def main():
         seq = fai.fetch(header).upper()
         seq_len = fai.get_reference_length(header)
         gap_coords = [(i.start(), i.end()) for i in re.finditer(r'N+', seq) if i.end() - i.start() > min_gap_size]
-        if bed_file and header in bed_contigs:
-            splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len]
-        else:
-            splits_pos = []
-        #log("INFO", "Splitting %s at %s" % (header, splits_pos))
-        #print("Splitting %s at %s" % (header, splits_pos))
         # Remove coordinates overlapping gff features
         if gff_file:
             #non_gff_breaks = dict()
@@ -102,20 +82,9 @@ def main():
                                 % (header, gap[0], gap[1]))
                 else:
                     new_breaks.append(gap)
-            if splits_pos:
-                new_splits = []
-                for split in splits_pos:
-                    if it[header][split[1]]:
-                        log("INFO", "Avoiding breaking %s at %d. This point intersects a feature in the gff file."
-                                    % (header, split[1]))
-                    else:
-                        new_splits.append(split)
-                if new_splits:
-                    splits_pos = new_splits
             if new_breaks:
                 #non_gff_breaks[header] = new_breaks
                 gap_coords = new_breaks
-            
 
         if not gap_coords and not splits_pos:
             new_header = "seq{0:08}".format(new_header_idx)
@@ -127,37 +96,7 @@ def main():
             if gap_coords[0][0]:
                 # The sequence doesn't start with a gap
                 new_header = "seq{0:08}".format(new_header_idx)
-                # if there's a split coordinate before the end, that needs to be the "new" end
-                # First, if/ELSE allows current behavior to be the same
-                # Make any/all splits necessary before the *first* gap sequence
-                # *later*: will need to check for split coords between every gap coord
-                if splits_pos:
-                    log("INFO", "Should break %s %s times at gaps %s" % (header, len(splits_pos), str(splits_pos)))
-                    for i in range(0, len(splits_pos)):
-                        log("INFO", "Breaking %s at gap %s" % (header, str(splits_pos)))
-                        if splits_pos[i][1] < gap_coords[0][0]:
-                            # Only first iteration starts at 1 (object; component always starts at 1)
-                            if i == 0:
-                                obj_start=1
-                                obj_end=splits_pos[i][1]
-                                cmp_end=obj_end
-                                log("INFO", "Found first split in %s at %s" % (header, obj_end))
-                            else:
-                                obj_start=obj_end+1
-                                obj_end=splits_pos[i][1]
-                                cmp_end=obj_end - obj_start + 1
-                                log("INFO", "Found another split in %s at %s producing seq of length %s" % (header, obj_end, cmp_end))
-                            agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
-                            new_header_idx += 1
-                            new_header = "seq{0:08}".format(new_header_idx)
-                            pid += 1
-                    # Add in final component to object - from last break till end of the sequence
-                    obj_start=obj_end+1
-                    obj_end=seq_len
-                    cmp_end=obj_end - obj_start + 1
-                    agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
-                else:  
-                    agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+")
+                agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+")
                 new_header_idx += 1
                 pid += 1
 

From e5fac60a004388be981d54e1c4f6d5679a584f43 Mon Sep 17 00:00:00 2001
From: Nick Carleson <nick.carleson@gmail.com>
Date: Wed, 30 Nov 2022 12:35:25 -0800
Subject: [PATCH 9/9] Less logging for quieter runs

---
 ragtag_manualsplits.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/ragtag_manualsplits.py b/ragtag_manualsplits.py
index dfea756..9c07c41 100755
--- a/ragtag_manualsplits.py
+++ b/ragtag_manualsplits.py
@@ -64,11 +64,6 @@ def main():
         log("INFO", "Breaking across positions from BED file")
         bed = pysam.TabixFile(bed_file)
         bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())]
-        #print(bed_contigs)
-        # Fetch will break if sequence is absent
-        #for row in bed.fetch("seq00000000", parser = pysam.asBed()):
-        #    print(row)
-            #log("INFO", row.contig)
     if gff_file:
         gff_file = os.path.abspath(gff_file)
         it = make_gff_interval_tree(gff_file)
@@ -88,8 +83,6 @@ def main():
             splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len]
         else:
             splits_pos = []
-        #log("INFO", "Splitting %s at %s" % (header, splits_pos))
-        #print("Splitting %s at %s" % (header, splits_pos))
         # Remove coordinates overlapping gff features
         if gff_file:
             #non_gff_breaks = dict()
@@ -115,9 +108,7 @@ def main():
             # if there's a split coordinate before the end, that needs to be the "new" end
             # Make any/all splits necessary
             if splits_pos:
-                log("INFO", "Should break %s %s times at gaps %s" % (header, len(splits_pos), str(splits_pos)))
                 for i in range(0, len(splits_pos)):
-                    log("INFO", "Breaking %s at gap %s" % (header, str(splits_pos)))
                     # Only first iteration starts at 1 (object; component always starts at 1)
                     if i == 0:
                         obj_start=1
@@ -128,7 +119,7 @@ def main():
                         obj_start=obj_end+1
                         obj_end=splits_pos[i][1]
                         cmp_end=obj_end - obj_start + 1
-                        log("INFO", "Found another split in %s at %s producing seq of length %s" % (header, obj_end, cmp_end))
+                        log("INFO", "Found another split in %s at %s" % (header, obj_end))
                     agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
                     new_header_idx += 1
                     new_header = "seq{0:08}".format(new_header_idx)