diff --git a/ragtag.py b/ragtag.py old mode 100644 new mode 100755 index 66dee59..d0d9d92 --- a/ragtag.py +++ b/ragtag.py @@ -64,6 +64,7 @@ def main(): delta2paf delta to PAF file conversion paf2delta PAF to delta file conversion updategff update gff intervals + manualsplits split an assembly at manually identified misjoins options: @@ -129,6 +130,10 @@ def main(): elif cmd == "paf2delta": subcmd = ["ragtag_paf2delta.py"] + sys.argv[2:] subprocess.call(subcmd) + + elif cmd == "manualsplits": + subcmd = ["ragtag_manualsplits.py"] + sys.argv[2:] + subprocess.call(subcmd) else: print(description) diff --git a/ragtag_manualsplits.py b/ragtag_manualsplits.py new file mode 100755 index 0000000..9c07c41 --- /dev/null +++ b/ragtag_manualsplits.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python + +""" +MIT License + +Copyright (c) 2021 Michael Alonge + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +import os +import re +import sys +import argparse + +import pysam + +from ragtag_correct import make_gff_interval_tree + +from ragtag_utilities.utilities import get_ragtag_version +from ragtag_utilities.utilities import log +from ragtag_utilities.AGPFile import AGPFile + + +def main(): + parser = argparse.ArgumentParser(description='Split assembly at specified gaps', usage="ragtag.py manualsplits --bed ") + parser.add_argument("asm", metavar="", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)") + parser.add_argument("--bed", metavar="", type=str, default="", help="bgzipped and Tabix-indexed BED file of where to break sequences.\nIf translating from 1-based coordinates e.g. SAM/BAM/VCF:\nuse the coordinate to be split on as the End of each BED region, and Start should be End-1.") + parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.manualsplits.agp", help="output AGP file path [./ragtag.splitasm.agp]") + parser.add_argument("--gff", metavar="", type=str, default="", help="don't break sequences within gff intervals [null]") + + # Parse the command line arguments + args = parser.parse_args() + if not args.asm: + parser.print_help() + print("\n** The assembly FASTA file is required **") + sys.exit() + if not args.bed: + parser.print_help() + print("\n** BED file of is required **") + + asm_fn = args.asm + agp_fn = args.o + gff_file = args.gff + bed_file = args.bed + # Initialize Tabix-indexed BED file + if bed_file: + log("INFO", "Breaking across positions from BED file") + bed = pysam.TabixFile(bed_file) + bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())] + if gff_file: + gff_file = os.path.abspath(gff_file) + it = make_gff_interval_tree(gff_file) + log("INFO", "Avoiding breaks within GFF intervals") + # Initialize the AGP file + agp = AGPFile(agp_fn, mode="w") + agp.add_pragma() + agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version())) + + # Process the FASTA file + new_header_idx = 0 + fai = pysam.FastaFile(asm_fn) + for header in sorted(fai.references): + seq = fai.fetch(header).upper() + seq_len = fai.get_reference_length(header) + if bed_file and header in bed_contigs: + splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len] + else: + splits_pos = [] + # Remove coordinates overlapping gff features + if gff_file: + #non_gff_breaks = dict() + if splits_pos: + new_splits = [] + for split in splits_pos: + if it[header][split[1]]: + log("INFO", "Avoiding breaking %s at %d. This point intersects a feature in the gff file." + % (header, split[1])) + else: + new_splits.append(split) + if new_splits: + splits_pos = new_splits + + if not splits_pos: + new_header = "seq{0:08}".format(new_header_idx) + new_header_idx += 1 + agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+") + else: + pid = 1 + # The sequence doesn't start with a gap + new_header = "seq{0:08}".format(new_header_idx) + # if there's a split coordinate before the end, that needs to be the "new" end + # Make any/all splits necessary + if splits_pos: + for i in range(0, len(splits_pos)): + # Only first iteration starts at 1 (object; component always starts at 1) + if i == 0: + obj_start=1 + obj_end=splits_pos[i][1] + cmp_end=obj_end + log("INFO", "Found first split in %s at %s" % (header, obj_end)) + else: + obj_start=obj_end+1 + obj_end=splits_pos[i][1] + cmp_end=obj_end - obj_start + 1 + log("INFO", "Found another split in %s at %s" % (header, obj_end)) + agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+") + new_header_idx += 1 + new_header = "seq{0:08}".format(new_header_idx) + pid += 1 + # Add in final component to object - from last break till end of the sequence + obj_start=obj_end+1 + obj_end=seq_len + cmp_end=obj_end - obj_start + 1 + agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+") + else: + agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+") + new_header_idx += 1 + pid += 1 + + agp.write() + + # Iterate over the AGP file and print the sequences + agp = AGPFile(agp_fn, mode="r") + for line in agp.iterate_lines(): + if not line.is_gap: + obj, comp, obj_beg, obj_end = line.obj, line.comp, line.obj_beg, line.obj_end + print(">" + comp) + print(fai.fetch(obj, obj_beg-1, obj_end)) + + fai.close() + + +if __name__ == "__main__": + main() diff --git a/ragtag_splitasm.py b/ragtag_splitasm.py old mode 100644 new mode 100755 index d050ac8..1e63f39 --- a/ragtag_splitasm.py +++ b/ragtag_splitasm.py @@ -24,21 +24,26 @@ SOFTWARE. """ +import os import re import sys import argparse import pysam +from ragtag_correct import make_gff_interval_tree + from ragtag_utilities.utilities import get_ragtag_version +from ragtag_utilities.utilities import log from ragtag_utilities.AGPFile import AGPFile def main(): - parser = argparse.ArgumentParser(description='Split sequencs at gaps', usage="ragtag.py splitasm ") + parser = argparse.ArgumentParser(description='Split sequences at gaps', usage="ragtag.py splitasm ") parser.add_argument("asm", metavar="", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)") parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]") parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]") + parser.add_argument("--gff", metavar="", type=str, default="", help="don't break sequences within gff intervals [null]") # Parse the command line arguments args = parser.parse_args() @@ -50,7 +55,11 @@ def main(): asm_fn = args.asm min_gap_size = args.n agp_fn = args.o - + gff_file = args.gff + if gff_file: + gff_file = os.path.abspath(gff_file) + it = make_gff_interval_tree(gff_file) + log("INFO", "Avoiding breaks within GFF intervals") # Initialize the AGP file agp = AGPFile(agp_fn, mode="w") agp.add_pragma() @@ -63,8 +72,21 @@ def main(): seq = fai.fetch(header).upper() seq_len = fai.get_reference_length(header) gap_coords = [(i.start(), i.end()) for i in re.finditer(r'N+', seq) if i.end() - i.start() > min_gap_size] - - if not gap_coords: + # Remove coordinates overlapping gff features + if gff_file: + #non_gff_breaks = dict() + new_breaks = [] + for gap in gap_coords: + if it[header][gap[0]] or it[header][gap[1]]: + log("INFO", "Avoiding breaking %s between %d-%d. This point intersects a feature in the gff file." + % (header, gap[0], gap[1])) + else: + new_breaks.append(gap) + if new_breaks: + #non_gff_breaks[header] = new_breaks + gap_coords = new_breaks + + if not gap_coords and not splits_pos: new_header = "seq{0:08}".format(new_header_idx) new_header_idx += 1 agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+") diff --git a/ragtag_update_gff.py b/ragtag_update_gff.py old mode 100644 new mode 100755 index 0c5cccc..18ca364 --- a/ragtag_update_gff.py +++ b/ragtag_update_gff.py @@ -28,6 +28,7 @@ import sys import argparse from collections import defaultdict +import re from intervaltree import IntervalTree @@ -35,25 +36,32 @@ from ragtag_utilities.AGPFile import AGPFile -def sub_update(gff_file, agp_file): +def sub_update(gff_file, agp_file, is_split): # Make a dictionary associating each original sequence with an interval tree of component sequences trans = defaultdict(IntervalTree) agp = AGPFile(agp_file, mode="r") for agp_line in agp.iterate_lines(): # Check that the agp file looks correct for this task - if agp_line.orientation == "-": - raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.") - if not agp_line.comp_type == "W": + if not agp_line.comp_type == "W" and not is_split: raise ValueError("The placement BED file is not formatted correctly. All lines should be WGS contig (W).") - if agp_line.is_gap: + if agp_line.is_gap and not is_split: raise ValueError("There should be no gaps in the correction AGP file.") + # gaps don't have the orientation attribute so this check should come last + if not is_split: + if agp_line.orientation == "-": + raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.") - start, end = agp_line.obj_beg - 1, agp_line.obj_end - trans[agp_line.obj][start:end] = agp_line.comp + # Cant adjust gap comps because gaps aren't assembled into object + if agp_line.comp_type == "W": + start, end = agp_line.obj_beg - 1, agp_line.obj_end + trans[agp_line.obj][start:end] = agp_line.comp # Iterate through the gff intervals and update them according to trans with open(gff_file, "r") as f: + attr_id = re.compile('(?<=ID=).*') + attr_parent = re.compile('(?<=Parent=).*') + ovlp_ids = [] for line in f: line = line.rstrip() if line.startswith("#"): @@ -61,6 +69,18 @@ def sub_update(gff_file, agp_file): else: fields = line.split("\t") h, s, e = fields[0], int(fields[3]), int(fields[4]) + attributes = fields[8] + parent = None + feat_id = None + for attr in attributes.split(";"): + feat_id_matches = attr_id.findall(attr) + parent_matches = attr_parent.findall(attr) + if feat_id_matches: + feat_id = feat_id_matches + if parent_matches: + parent = parent_matches + + s -= 1 # Keep everything zero-indexed if h not in trans: @@ -68,20 +88,32 @@ def sub_update(gff_file, agp_file): ovlps = trans[h][s:e] if len(ovlps) > 1: - raise ValueError( - "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e) - ) + #raise ValueError( + # "%s:%d-%d in the gff file overlaps two sub-sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e) + #) + log("WARNING", "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Skipping %s and any child features. To retain feature, include '--gff' and re-run 'correct' or 'splitasm'" % (h, s, e, feat_id)) + if feat_id: + ovlp_ids.append(feat_id) + continue if len(ovlps) < 1: raise ValueError("The placement BED file is not formatted correctly.") - # Get the data from the overlapping interval and print the new line - o = list(ovlps)[0] - new_s = s - o.begin - new_e = e - o.begin - fields[0] = o.data - fields[3] = str(new_s + 1) # back to one-based indexing for gff format - fields[4] = str(new_e) - print("\t".join(fields)) + # Check if feat needs to be skipped because of ID or parent match, + # complex solution ensuring orphan feats aren't added e.g. parent gene overlaps but not its stop codon + if parent and parent in ovlp_ids: + log("WARNING", "Parent %s already excluded for overlapping two sub sequences. Skipping %s." % (parent, feat_id)) + # To access any potential level 3 feats, add parent to ID exclusion list + ovlp_ids.append(feat_id) + continue + else: + # Get the data from the overlapping interval and print the new line + o = list(ovlps)[0] + new_s = s - o.begin + new_e = e - o.begin + fields[0] = o.data + fields[3] = str(new_s + 1) # back to one-based indexing for gff format + fields[4] = str(new_e) + print("\t".join(fields)) def sup_update(gff_file, agp_file): @@ -132,10 +164,11 @@ def sup_update(gff_file, agp_file): def main(): - parser = argparse.ArgumentParser(description="Update gff intervals given a RagTag AGP file", usage="ragtag.py updategff [-c] ") + parser = argparse.ArgumentParser(description="Update gff intervals from given a RagTag AGP file", usage="ragtag.py updategff [-c] ") parser.add_argument("gff", nargs='?', default="", metavar="", type=str, help="gff file") parser.add_argument("agp", nargs='?', default="", metavar="", type=str, help="agp file") parser.add_argument("-c", action="store_true", default=False, help="update for misassembly correction (ragtag.correction.agp)") + parser.add_argument("-s", action="store_true", default=False, help="update for assembly splitting from RagTag") args = parser.parse_args() @@ -149,9 +182,10 @@ def main(): gff_file = os.path.abspath(args.gff) agp_file = os.path.abspath(args.agp) is_sub = args.c + is_split = args.s - if is_sub: - sub_update(gff_file, agp_file) + if is_sub or is_split: + sub_update(gff_file, agp_file, is_split) else: sup_update(gff_file, agp_file)