From 8650119e0812bc69333596c21b084b1fa3d816aa Mon Sep 17 00:00:00 2001 From: Nick Carleson Date: Thu, 13 Oct 2022 14:22:34 -0700 Subject: [PATCH 1/9] Can update a gff from splitasm if no feats ovlp gaps --- ragtag_update_gff.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) mode change 100644 => 100755 ragtag_update_gff.py diff --git a/ragtag_update_gff.py b/ragtag_update_gff.py old mode 100644 new mode 100755 index 0c5cccc..6a99dab --- a/ragtag_update_gff.py +++ b/ragtag_update_gff.py @@ -28,6 +28,7 @@ import sys import argparse from collections import defaultdict +import warnings from intervaltree import IntervalTree @@ -35,25 +36,30 @@ from ragtag_utilities.AGPFile import AGPFile -def sub_update(gff_file, agp_file): +def sub_update(gff_file, agp_file, is_split): # Make a dictionary associating each original sequence with an interval tree of component sequences trans = defaultdict(IntervalTree) agp = AGPFile(agp_file, mode="r") for agp_line in agp.iterate_lines(): # Check that the agp file looks correct for this task - if agp_line.orientation == "-": - raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.") - if not agp_line.comp_type == "W": + if not agp_line.comp_type == "W" and not is_split: raise ValueError("The placement BED file is not formatted correctly. All lines should be WGS contig (W).") - if agp_line.is_gap: + if agp_line.is_gap and not is_split: raise ValueError("There should be no gaps in the correction AGP file.") + # gaps don't have the orientation attribute so this check should come last + if not is_split: + if agp_line.orientation == "-": + raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.") - start, end = agp_line.obj_beg - 1, agp_line.obj_end - trans[agp_line.obj][start:end] = agp_line.comp + # Cant adjust gap comps because gaps aren't assembled into object + if agp_line.comp_type == "W": + start, end = agp_line.obj_beg - 1, agp_line.obj_end + trans[agp_line.obj][start:end] = agp_line.comp # Iterate through the gff intervals and update them according to trans with open(gff_file, "r") as f: + print(list(trans.keys())[0]) for line in f: line = line.rstrip() if line.startswith("#"): @@ -61,6 +67,9 @@ def sub_update(gff_file, agp_file): else: fields = line.split("\t") h, s, e = fields[0], int(fields[3]), int(fields[4]) + #all_attributes = fields[8] + #attributes = all_attributes.split(";") + s -= 1 # Keep everything zero-indexed if h not in trans: @@ -68,9 +77,13 @@ def sub_update(gff_file, agp_file): ovlps = trans[h][s:e] if len(ovlps) > 1: + #for ovlp in ovlps: + # print(ovlp) + #warnings.warn("%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Ignoring feature. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e)) raise ValueError( "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e) ) + #skipped_parent_feats = if len(ovlps) < 1: raise ValueError("The placement BED file is not formatted correctly.") @@ -84,7 +97,7 @@ def sub_update(gff_file, agp_file): print("\t".join(fields)) -def sup_update(gff_file, agp_file): +def sup_update(gff_file, agp_file, is_split): # Make a dictionary associating each original sequence with the destination sequence trans = {} strands = {} @@ -132,10 +145,11 @@ def sup_update(gff_file, agp_file): def main(): - parser = argparse.ArgumentParser(description="Update gff intervals given a RagTag AGP file", usage="ragtag.py updategff [-c] ") + parser = argparse.ArgumentParser(description="Update gff intervals from `correct`, `scaffold`, or `splitasm` given a RagTag AGP file", usage="ragtag.py updategff [-c] ") parser.add_argument("gff", nargs='?', default="", metavar="", type=str, help="gff file") parser.add_argument("agp", nargs='?', default="", metavar="", type=str, help="agp file") parser.add_argument("-c", action="store_true", default=False, help="update for misassembly correction (ragtag.correction.agp)") + parser.add_argument("-s", action="store_true", default=False, help="update for assembly splitting from RagTag") args = parser.parse_args() @@ -149,9 +163,10 @@ def main(): gff_file = os.path.abspath(args.gff) agp_file = os.path.abspath(args.agp) is_sub = args.c + is_split = args.s - if is_sub: - sub_update(gff_file, agp_file) + if is_sub or is_split: + sub_update(gff_file, agp_file, is_split) else: sup_update(gff_file, agp_file) From cc73bf2e630eb9002874a2ac409f40b6514c1781 Mon Sep 17 00:00:00 2001 From: Nick Carleson Date: Thu, 13 Oct 2022 16:17:30 -0700 Subject: [PATCH 2/9] Skips features that span gaps in AGP --- ragtag_update_gff.py | 58 ++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/ragtag_update_gff.py b/ragtag_update_gff.py index 6a99dab..30b2241 100755 --- a/ragtag_update_gff.py +++ b/ragtag_update_gff.py @@ -29,6 +29,7 @@ import argparse from collections import defaultdict import warnings +import re from intervaltree import IntervalTree @@ -59,7 +60,9 @@ def sub_update(gff_file, agp_file, is_split): # Iterate through the gff intervals and update them according to trans with open(gff_file, "r") as f: - print(list(trans.keys())[0]) + attr_id = re.compile('(?<=ID=).*') + attr_parent = re.compile('(?<=Parent=).*') + ovlp_ids = [] for line in f: line = line.rstrip() if line.startswith("#"): @@ -67,8 +70,15 @@ def sub_update(gff_file, agp_file, is_split): else: fields = line.split("\t") h, s, e = fields[0], int(fields[3]), int(fields[4]) - #all_attributes = fields[8] - #attributes = all_attributes.split(";") + attributes = fields[8] + for attr in attributes.split(";"): + feat_id_matches = attr_id.findall(attr) + parent_matches = attr_parent.findall(attr) + if feat_id_matches: + feat_id = feat_id_matches + if parent_matches: + parent = parent_matches + s -= 1 # Keep everything zero-indexed @@ -77,24 +87,36 @@ def sub_update(gff_file, agp_file, is_split): ovlps = trans[h][s:e] if len(ovlps) > 1: - #for ovlp in ovlps: - # print(ovlp) - #warnings.warn("%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Ignoring feature. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e)) - raise ValueError( - "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e) - ) - #skipped_parent_feats = + warnings.warn("%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e)) + #raise ValueError( + # "%s:%d-%d in the gff file overlaps two sub-sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e) + #) + if feat_id: + ovlp_ids.append(feat_id) if len(ovlps) < 1: raise ValueError("The placement BED file is not formatted correctly.") - # Get the data from the overlapping interval and print the new line - o = list(ovlps)[0] - new_s = s - o.begin - new_e = e - o.begin - fields[0] = o.data - fields[3] = str(new_s + 1) # back to one-based indexing for gff format - fields[4] = str(new_e) - print("\t".join(fields)) + # Check if feat needs to be skipped because of ID or parent match, + # complex solution ensuring orphan feats aren't added e.g. parent gene overlaps but not its stop codon + if (feat_id in ovlp_ids) or (parent_matches and parent in ovlp_ids): + #if (feat_id in ovlp_ids): + # print("ID %s found in list" % (feat_id)) + #if (parent in ovlp_ids): + # print("Parent %s found in list" % (parent)) + #print("Ignoring %s spanning a gap between two sub sequences %s:%d-%d" % (feat_id, h, s, e)) + # To access level 3 feats, add parent to ID exclusion list + if parent: + ovlp_ids.append(feat_id) + continue + else: + # Get the data from the overlapping interval and print the new line + o = list(ovlps)[0] + new_s = s - o.begin + new_e = e - o.begin + fields[0] = o.data + fields[3] = str(new_s + 1) # back to one-based indexing for gff format + fields[4] = str(new_e) + print("\t".join(fields)) def sup_update(gff_file, agp_file, is_split): From 228004d306f444c145cf16a3c81ae4fe1a2dd7e8 Mon Sep 17 00:00:00 2001 From: Nick Carleson Date: Thu, 13 Oct 2022 17:08:27 -0700 Subject: [PATCH 3/9] Bug fix -s not required for sup_update --- ragtag_update_gff.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ragtag_update_gff.py b/ragtag_update_gff.py index 30b2241..b38e960 100755 --- a/ragtag_update_gff.py +++ b/ragtag_update_gff.py @@ -28,7 +28,6 @@ import sys import argparse from collections import defaultdict -import warnings import re from intervaltree import IntervalTree @@ -87,10 +86,10 @@ def sub_update(gff_file, agp_file, is_split): ovlps = trans[h][s:e] if len(ovlps) > 1: - warnings.warn("%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e)) #raise ValueError( # "%s:%d-%d in the gff file overlaps two sub-sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e) #) + log("WARNING", "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Skipping %s. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e, feat_id)) if feat_id: ovlp_ids.append(feat_id) if len(ovlps) < 1: @@ -119,7 +118,7 @@ def sub_update(gff_file, agp_file, is_split): print("\t".join(fields)) -def sup_update(gff_file, agp_file, is_split): +def sup_update(gff_file, agp_file): # Make a dictionary associating each original sequence with the destination sequence trans = {} strands = {} @@ -167,7 +166,7 @@ def sup_update(gff_file, agp_file, is_split): def main(): - parser = argparse.ArgumentParser(description="Update gff intervals from `correct`, `scaffold`, or `splitasm` given a RagTag AGP file", usage="ragtag.py updategff [-c] ") + parser = argparse.ArgumentParser(description="Update gff intervals from given a RagTag AGP file", usage="ragtag.py updategff [-c] ") parser.add_argument("gff", nargs='?', default="", metavar="", type=str, help="gff file") parser.add_argument("agp", nargs='?', default="", metavar="", type=str, help="agp file") parser.add_argument("-c", action="store_true", default=False, help="update for misassembly correction (ragtag.correction.agp)") From cf0bdb23d54a95cac2ad9980ab175e0489a69f4f Mon Sep 17 00:00:00 2001 From: Nick Carleson Date: Thu, 13 Oct 2022 17:11:03 -0700 Subject: [PATCH 4/9] Added option to not split assembly across GFF features --- ragtag_splitasm.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) mode change 100644 => 100755 ragtag_splitasm.py diff --git a/ragtag_splitasm.py b/ragtag_splitasm.py old mode 100644 new mode 100755 index d050ac8..9b34fd9 --- a/ragtag_splitasm.py +++ b/ragtag_splitasm.py @@ -24,13 +24,17 @@ SOFTWARE. """ +import os import re import sys import argparse import pysam +from ragtag_correct import make_gff_interval_tree + from ragtag_utilities.utilities import get_ragtag_version +from ragtag_utilities.utilities import log from ragtag_utilities.AGPFile import AGPFile @@ -39,6 +43,7 @@ def main(): parser.add_argument("asm", metavar="", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)") parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]") parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]") + parser.add_argument("--gff", metavar="", type=str, default="", help="don't break sequences within gff intervals [null]") # Parse the command line arguments args = parser.parse_args() @@ -50,7 +55,11 @@ def main(): asm_fn = args.asm min_gap_size = args.n agp_fn = args.o - + gff_file = args.gff + if gff_file: + gff_file = os.path.abspath(gff_file) + it = make_gff_interval_tree(gff_file) + log("INFO", "Avoiding breaks within GFF intervals") # Initialize the AGP file agp = AGPFile(agp_fn, mode="w") agp.add_pragma() @@ -63,6 +72,19 @@ def main(): seq = fai.fetch(header).upper() seq_len = fai.get_reference_length(header) gap_coords = [(i.start(), i.end()) for i in re.finditer(r'N+', seq) if i.end() - i.start() > min_gap_size] + # Remove coordinates overlapping gff features + if gff_file: + #non_gff_breaks = dict() + new_breaks = [] + for gap in gap_coords: + if it[header][gap[0]] or it[header][gap[1]]: + log("INFO", "Avoiding breaking %s between %d-%d. This point intersects a feature in the gff file." + % (header, gap[0], gap[1])) + else: + new_breaks.append(gap) + if new_breaks: + #non_gff_breaks[header] = new_breaks + gap_coords = new_breaks if not gap_coords: new_header = "seq{0:08}".format(new_header_idx) From 4d8991fb1e37f7487c904d51ee39a6a7b036cc80 Mon Sep 17 00:00:00 2001 From: Nick Carleson Date: Tue, 25 Oct 2022 16:29:14 -0700 Subject: [PATCH 5/9] Fix: child features with parents that were skipped did not get skipped --- ragtag_update_gff.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/ragtag_update_gff.py b/ragtag_update_gff.py index b38e960..18ca364 100755 --- a/ragtag_update_gff.py +++ b/ragtag_update_gff.py @@ -70,6 +70,8 @@ def sub_update(gff_file, agp_file, is_split): fields = line.split("\t") h, s, e = fields[0], int(fields[3]), int(fields[4]) attributes = fields[8] + parent = None + feat_id = None for attr in attributes.split(";"): feat_id_matches = attr_id.findall(attr) parent_matches = attr_parent.findall(attr) @@ -89,23 +91,19 @@ def sub_update(gff_file, agp_file, is_split): #raise ValueError( # "%s:%d-%d in the gff file overlaps two sub-sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e) #) - log("WARNING", "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Skipping %s. Make sure to run 'correct' or 'splitasm' with '--gff'" % (h, s, e, feat_id)) + log("WARNING", "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Skipping %s and any child features. To retain feature, include '--gff' and re-run 'correct' or 'splitasm'" % (h, s, e, feat_id)) if feat_id: ovlp_ids.append(feat_id) + continue if len(ovlps) < 1: raise ValueError("The placement BED file is not formatted correctly.") # Check if feat needs to be skipped because of ID or parent match, # complex solution ensuring orphan feats aren't added e.g. parent gene overlaps but not its stop codon - if (feat_id in ovlp_ids) or (parent_matches and parent in ovlp_ids): - #if (feat_id in ovlp_ids): - # print("ID %s found in list" % (feat_id)) - #if (parent in ovlp_ids): - # print("Parent %s found in list" % (parent)) - #print("Ignoring %s spanning a gap between two sub sequences %s:%d-%d" % (feat_id, h, s, e)) - # To access level 3 feats, add parent to ID exclusion list - if parent: - ovlp_ids.append(feat_id) + if parent and parent in ovlp_ids: + log("WARNING", "Parent %s already excluded for overlapping two sub sequences. Skipping %s." % (parent, feat_id)) + # To access any potential level 3 feats, add parent to ID exclusion list + ovlp_ids.append(feat_id) continue else: # Get the data from the overlapping interval and print the new line From 081857d9f9821e996c705c3586b12c170dc63cf6 Mon Sep 17 00:00:00 2001 From: Nick Carleson Date: Wed, 30 Nov 2022 11:35:31 -0800 Subject: [PATCH 6/9] Splits contigs before any gaps using BED file. Ensure no new gaps will be created --- ragtag_splitasm.py | 67 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/ragtag_splitasm.py b/ragtag_splitasm.py index 9b34fd9..26cf07b 100755 --- a/ragtag_splitasm.py +++ b/ragtag_splitasm.py @@ -39,11 +39,12 @@ def main(): - parser = argparse.ArgumentParser(description='Split sequencs at gaps', usage="ragtag.py splitasm ") + parser = argparse.ArgumentParser(description='Split sequences at gaps', usage="ragtag.py splitasm ") parser.add_argument("asm", metavar="", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)") parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]") parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]") parser.add_argument("--gff", metavar="", type=str, default="", help="don't break sequences within gff intervals [null]") + parser.add_argument("--bed", metavar="", type=str, default="", help="BED file of where to break sequences [null]") # Parse the command line arguments args = parser.parse_args() @@ -56,6 +57,17 @@ def main(): min_gap_size = args.n agp_fn = args.o gff_file = args.gff + bed_file = args.bed + # Initialize Tabix-indexed BED file + if bed_file: + log("INFO", "Breaking across positions from BED file") + bed = pysam.TabixFile(bed_file) + bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())] + #print(bed_contigs) + # Fetch will break if sequence is absent + #for row in bed.fetch("seq00000000", parser = pysam.asBed()): + # print(row) + #log("INFO", row.contig) if gff_file: gff_file = os.path.abspath(gff_file) it = make_gff_interval_tree(gff_file) @@ -65,6 +77,8 @@ def main(): agp.add_pragma() agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version())) + + # Process the FASTA file new_header_idx = 0 fai = pysam.FastaFile(asm_fn) @@ -72,6 +86,12 @@ def main(): seq = fai.fetch(header).upper() seq_len = fai.get_reference_length(header) gap_coords = [(i.start(), i.end()) for i in re.finditer(r'N+', seq) if i.end() - i.start() > min_gap_size] + if bed_file and header in bed_contigs: + splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len] + else: + splits_pos = [] + #log("INFO", "Splitting %s at %s" % (header, splits_pos)) + #print("Splitting %s at %s" % (header, splits_pos)) # Remove coordinates overlapping gff features if gff_file: #non_gff_breaks = dict() @@ -82,11 +102,22 @@ def main(): % (header, gap[0], gap[1])) else: new_breaks.append(gap) + if splits_pos: + new_splits = [] + for split in splits_pos: + if it[header][split[1]]: + log("INFO", "Avoiding breaking %s at %d. This point intersects a feature in the gff file." + % (header, split[1])) + else: + new_splits.append(split) + if new_splits: + splits_pos = new_splits if new_breaks: #non_gff_breaks[header] = new_breaks gap_coords = new_breaks + - if not gap_coords: + if not gap_coords and not splits_pos: new_header = "seq{0:08}".format(new_header_idx) new_header_idx += 1 agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+") @@ -96,7 +127,37 @@ def main(): if gap_coords[0][0]: # The sequence doesn't start with a gap new_header = "seq{0:08}".format(new_header_idx) - agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+") + # if there's a split coordinate before the end, that needs to be the "new" end + # First, if/ELSE allows current behavior to be the same + # Make any/all splits necessary before the *first* gap sequence + # *later*: will need to check for split coords between every gap coord + if splits_pos: + log("INFO", "Should break %s %s times at gaps %s" % (header, len(splits_pos), str(splits_pos))) + for i in range(0, len(splits_pos)): + log("INFO", "Breaking %s at gap %s" % (header, str(splits_pos))) + if splits_pos[i][1] < gap_coords[0][0]: + # Only first iteration starts at 1 (object; component always starts at 1) + if i == 0: + obj_start=1 + obj_end=splits_pos[i][1] + cmp_end=obj_end + log("INFO", "Found first split in %s at %s" % (header, obj_end)) + else: + obj_start=obj_end+1 + obj_end=splits_pos[i][1] + cmp_end=obj_end - obj_start + 1 + log("INFO", "Found another split in %s at %s producing seq of length %s" % (header, obj_end, cmp_end)) + agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+") + new_header_idx += 1 + new_header = "seq{0:08}".format(new_header_idx) + pid += 1 + # Add in final component to object - from last break till end of the sequence + obj_start=obj_end+1 + obj_end=seq_len + cmp_end=obj_end - obj_start + 1 + agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+") + else: + agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+") new_header_idx += 1 pid += 1 From 80b78fbdf40c76aab1407eb1ca4a8b35753197a6 Mon Sep 17 00:00:00 2001 From: Nick Carleson Date: Wed, 30 Nov 2022 12:13:37 -0800 Subject: [PATCH 7/9] Separate manual split functionality into new utility --- ragtag.py | 5 ++ ragtag_manualsplits.py | 160 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) mode change 100644 => 100755 ragtag.py create mode 100755 ragtag_manualsplits.py diff --git a/ragtag.py b/ragtag.py old mode 100644 new mode 100755 index 66dee59..d0d9d92 --- a/ragtag.py +++ b/ragtag.py @@ -64,6 +64,7 @@ def main(): delta2paf delta to PAF file conversion paf2delta PAF to delta file conversion updategff update gff intervals + manualsplits split an assembly at manually identified misjoins options: @@ -129,6 +130,10 @@ def main(): elif cmd == "paf2delta": subcmd = ["ragtag_paf2delta.py"] + sys.argv[2:] subprocess.call(subcmd) + + elif cmd == "manualsplits": + subcmd = ["ragtag_manualsplits.py"] + sys.argv[2:] + subprocess.call(subcmd) else: print(description) diff --git a/ragtag_manualsplits.py b/ragtag_manualsplits.py new file mode 100755 index 0000000..dfea756 --- /dev/null +++ b/ragtag_manualsplits.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python + +""" +MIT License + +Copyright (c) 2021 Michael Alonge + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +import os +import re +import sys +import argparse + +import pysam + +from ragtag_correct import make_gff_interval_tree + +from ragtag_utilities.utilities import get_ragtag_version +from ragtag_utilities.utilities import log +from ragtag_utilities.AGPFile import AGPFile + + +def main(): + parser = argparse.ArgumentParser(description='Split assembly at specified gaps', usage="ragtag.py manualsplits --bed ") + parser.add_argument("asm", metavar="", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)") + parser.add_argument("--bed", metavar="", type=str, default="", help="bgzipped and Tabix-indexed BED file of where to break sequences.\nIf translating from 1-based coordinates e.g. SAM/BAM/VCF:\nuse the coordinate to be split on as the End of each BED region, and Start should be End-1.") + parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.manualsplits.agp", help="output AGP file path [./ragtag.splitasm.agp]") + parser.add_argument("--gff", metavar="", type=str, default="", help="don't break sequences within gff intervals [null]") + + # Parse the command line arguments + args = parser.parse_args() + if not args.asm: + parser.print_help() + print("\n** The assembly FASTA file is required **") + sys.exit() + if not args.bed: + parser.print_help() + print("\n** BED file of is required **") + + asm_fn = args.asm + agp_fn = args.o + gff_file = args.gff + bed_file = args.bed + # Initialize Tabix-indexed BED file + if bed_file: + log("INFO", "Breaking across positions from BED file") + bed = pysam.TabixFile(bed_file) + bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())] + #print(bed_contigs) + # Fetch will break if sequence is absent + #for row in bed.fetch("seq00000000", parser = pysam.asBed()): + # print(row) + #log("INFO", row.contig) + if gff_file: + gff_file = os.path.abspath(gff_file) + it = make_gff_interval_tree(gff_file) + log("INFO", "Avoiding breaks within GFF intervals") + # Initialize the AGP file + agp = AGPFile(agp_fn, mode="w") + agp.add_pragma() + agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version())) + + # Process the FASTA file + new_header_idx = 0 + fai = pysam.FastaFile(asm_fn) + for header in sorted(fai.references): + seq = fai.fetch(header).upper() + seq_len = fai.get_reference_length(header) + if bed_file and header in bed_contigs: + splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len] + else: + splits_pos = [] + #log("INFO", "Splitting %s at %s" % (header, splits_pos)) + #print("Splitting %s at %s" % (header, splits_pos)) + # Remove coordinates overlapping gff features + if gff_file: + #non_gff_breaks = dict() + if splits_pos: + new_splits = [] + for split in splits_pos: + if it[header][split[1]]: + log("INFO", "Avoiding breaking %s at %d. This point intersects a feature in the gff file." + % (header, split[1])) + else: + new_splits.append(split) + if new_splits: + splits_pos = new_splits + + if not splits_pos: + new_header = "seq{0:08}".format(new_header_idx) + new_header_idx += 1 + agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+") + else: + pid = 1 + # The sequence doesn't start with a gap + new_header = "seq{0:08}".format(new_header_idx) + # if there's a split coordinate before the end, that needs to be the "new" end + # Make any/all splits necessary + if splits_pos: + log("INFO", "Should break %s %s times at gaps %s" % (header, len(splits_pos), str(splits_pos))) + for i in range(0, len(splits_pos)): + log("INFO", "Breaking %s at gap %s" % (header, str(splits_pos))) + # Only first iteration starts at 1 (object; component always starts at 1) + if i == 0: + obj_start=1 + obj_end=splits_pos[i][1] + cmp_end=obj_end + log("INFO", "Found first split in %s at %s" % (header, obj_end)) + else: + obj_start=obj_end+1 + obj_end=splits_pos[i][1] + cmp_end=obj_end - obj_start + 1 + log("INFO", "Found another split in %s at %s producing seq of length %s" % (header, obj_end, cmp_end)) + agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+") + new_header_idx += 1 + new_header = "seq{0:08}".format(new_header_idx) + pid += 1 + # Add in final component to object - from last break till end of the sequence + obj_start=obj_end+1 + obj_end=seq_len + cmp_end=obj_end - obj_start + 1 + agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+") + else: + agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+") + new_header_idx += 1 + pid += 1 + + agp.write() + + # Iterate over the AGP file and print the sequences + agp = AGPFile(agp_fn, mode="r") + for line in agp.iterate_lines(): + if not line.is_gap: + obj, comp, obj_beg, obj_end = line.obj, line.comp, line.obj_beg, line.obj_end + print(">" + comp) + print(fai.fetch(obj, obj_beg-1, obj_end)) + + fai.close() + + +if __name__ == "__main__": + main() From 1c295a323c271fc2d3565c00b835b80a5ea08f25 Mon Sep 17 00:00:00 2001 From: Nick Carleson Date: Wed, 30 Nov 2022 12:22:33 -0800 Subject: [PATCH 8/9] Revert manual-split functionality out of splitasm --- ragtag_splitasm.py | 63 +--------------------------------------------- 1 file changed, 1 insertion(+), 62 deletions(-) diff --git a/ragtag_splitasm.py b/ragtag_splitasm.py index 26cf07b..1e63f39 100755 --- a/ragtag_splitasm.py +++ b/ragtag_splitasm.py @@ -44,7 +44,6 @@ def main(): parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]") parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]") parser.add_argument("--gff", metavar="", type=str, default="", help="don't break sequences within gff intervals [null]") - parser.add_argument("--bed", metavar="", type=str, default="", help="BED file of where to break sequences [null]") # Parse the command line arguments args = parser.parse_args() @@ -57,17 +56,6 @@ def main(): min_gap_size = args.n agp_fn = args.o gff_file = args.gff - bed_file = args.bed - # Initialize Tabix-indexed BED file - if bed_file: - log("INFO", "Breaking across positions from BED file") - bed = pysam.TabixFile(bed_file) - bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())] - #print(bed_contigs) - # Fetch will break if sequence is absent - #for row in bed.fetch("seq00000000", parser = pysam.asBed()): - # print(row) - #log("INFO", row.contig) if gff_file: gff_file = os.path.abspath(gff_file) it = make_gff_interval_tree(gff_file) @@ -77,8 +65,6 @@ def main(): agp.add_pragma() agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version())) - - # Process the FASTA file new_header_idx = 0 fai = pysam.FastaFile(asm_fn) @@ -86,12 +72,6 @@ def main(): seq = fai.fetch(header).upper() seq_len = fai.get_reference_length(header) gap_coords = [(i.start(), i.end()) for i in re.finditer(r'N+', seq) if i.end() - i.start() > min_gap_size] - if bed_file and header in bed_contigs: - splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len] - else: - splits_pos = [] - #log("INFO", "Splitting %s at %s" % (header, splits_pos)) - #print("Splitting %s at %s" % (header, splits_pos)) # Remove coordinates overlapping gff features if gff_file: #non_gff_breaks = dict() @@ -102,20 +82,9 @@ def main(): % (header, gap[0], gap[1])) else: new_breaks.append(gap) - if splits_pos: - new_splits = [] - for split in splits_pos: - if it[header][split[1]]: - log("INFO", "Avoiding breaking %s at %d. This point intersects a feature in the gff file." - % (header, split[1])) - else: - new_splits.append(split) - if new_splits: - splits_pos = new_splits if new_breaks: #non_gff_breaks[header] = new_breaks gap_coords = new_breaks - if not gap_coords and not splits_pos: new_header = "seq{0:08}".format(new_header_idx) @@ -127,37 +96,7 @@ def main(): if gap_coords[0][0]: # The sequence doesn't start with a gap new_header = "seq{0:08}".format(new_header_idx) - # if there's a split coordinate before the end, that needs to be the "new" end - # First, if/ELSE allows current behavior to be the same - # Make any/all splits necessary before the *first* gap sequence - # *later*: will need to check for split coords between every gap coord - if splits_pos: - log("INFO", "Should break %s %s times at gaps %s" % (header, len(splits_pos), str(splits_pos))) - for i in range(0, len(splits_pos)): - log("INFO", "Breaking %s at gap %s" % (header, str(splits_pos))) - if splits_pos[i][1] < gap_coords[0][0]: - # Only first iteration starts at 1 (object; component always starts at 1) - if i == 0: - obj_start=1 - obj_end=splits_pos[i][1] - cmp_end=obj_end - log("INFO", "Found first split in %s at %s" % (header, obj_end)) - else: - obj_start=obj_end+1 - obj_end=splits_pos[i][1] - cmp_end=obj_end - obj_start + 1 - log("INFO", "Found another split in %s at %s producing seq of length %s" % (header, obj_end, cmp_end)) - agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+") - new_header_idx += 1 - new_header = "seq{0:08}".format(new_header_idx) - pid += 1 - # Add in final component to object - from last break till end of the sequence - obj_start=obj_end+1 - obj_end=seq_len - cmp_end=obj_end - obj_start + 1 - agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+") - else: - agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+") + agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+") new_header_idx += 1 pid += 1 From e5fac60a004388be981d54e1c4f6d5679a584f43 Mon Sep 17 00:00:00 2001 From: Nick Carleson Date: Wed, 30 Nov 2022 12:35:25 -0800 Subject: [PATCH 9/9] Less logging for quieter runs --- ragtag_manualsplits.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/ragtag_manualsplits.py b/ragtag_manualsplits.py index dfea756..9c07c41 100755 --- a/ragtag_manualsplits.py +++ b/ragtag_manualsplits.py @@ -64,11 +64,6 @@ def main(): log("INFO", "Breaking across positions from BED file") bed = pysam.TabixFile(bed_file) bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())] - #print(bed_contigs) - # Fetch will break if sequence is absent - #for row in bed.fetch("seq00000000", parser = pysam.asBed()): - # print(row) - #log("INFO", row.contig) if gff_file: gff_file = os.path.abspath(gff_file) it = make_gff_interval_tree(gff_file) @@ -88,8 +83,6 @@ def main(): splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len] else: splits_pos = [] - #log("INFO", "Splitting %s at %s" % (header, splits_pos)) - #print("Splitting %s at %s" % (header, splits_pos)) # Remove coordinates overlapping gff features if gff_file: #non_gff_breaks = dict() @@ -115,9 +108,7 @@ def main(): # if there's a split coordinate before the end, that needs to be the "new" end # Make any/all splits necessary if splits_pos: - log("INFO", "Should break %s %s times at gaps %s" % (header, len(splits_pos), str(splits_pos))) for i in range(0, len(splits_pos)): - log("INFO", "Breaking %s at gap %s" % (header, str(splits_pos))) # Only first iteration starts at 1 (object; component always starts at 1) if i == 0: obj_start=1 @@ -128,7 +119,7 @@ def main(): obj_start=obj_end+1 obj_end=splits_pos[i][1] cmp_end=obj_end - obj_start + 1 - log("INFO", "Found another split in %s at %s producing seq of length %s" % (header, obj_end, cmp_end)) + log("INFO", "Found another split in %s at %s" % (header, obj_end)) agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+") new_header_idx += 1 new_header = "seq{0:08}".format(new_header_idx)