Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split assembly at BED coordinates #142

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
5 changes: 5 additions & 0 deletions ragtag.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def main():
delta2paf delta to PAF file conversion
paf2delta PAF to delta file conversion
updategff update gff intervals
manualsplits split an assembly at manually identified misjoins


options:
Expand Down Expand Up @@ -129,6 +130,10 @@ def main():
elif cmd == "paf2delta":
subcmd = ["ragtag_paf2delta.py"] + sys.argv[2:]
subprocess.call(subcmd)

elif cmd == "manualsplits":
subcmd = ["ragtag_manualsplits.py"] + sys.argv[2:]
subprocess.call(subcmd)

else:
print(description)
Expand Down
151 changes: 151 additions & 0 deletions ragtag_manualsplits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/usr/bin/env python

"""
MIT License

Copyright (c) 2021 Michael Alonge <[email protected]>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import os
import re
import sys
import argparse

import pysam

from ragtag_correct import make_gff_interval_tree

from ragtag_utilities.utilities import get_ragtag_version
from ragtag_utilities.utilities import log
from ragtag_utilities.AGPFile import AGPFile


def main():
parser = argparse.ArgumentParser(description='Split assembly at specified gaps', usage="ragtag.py manualsplits --bed <splits.bed> <asm.fa>")
parser.add_argument("asm", metavar="<asm.fa>", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
parser.add_argument("--bed", metavar="<splits.bed.gz>", type=str, default="", help="bgzipped and Tabix-indexed BED file of where to break sequences.\nIf translating from 1-based coordinates e.g. SAM/BAM/VCF:\nuse the coordinate to be split on as the End of each BED region, and Start should be End-1.")
parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.manualsplits.agp", help="output AGP file path [./ragtag.splitasm.agp]")
parser.add_argument("--gff", metavar="<features.gff>", type=str, default="", help="don't break sequences within gff intervals [null]")

# Parse the command line arguments
args = parser.parse_args()
if not args.asm:
parser.print_help()
print("\n** The assembly FASTA file is required **")
sys.exit()
if not args.bed:
parser.print_help()
print("\n** BED file of is required **")

asm_fn = args.asm
agp_fn = args.o
gff_file = args.gff
bed_file = args.bed
# Initialize Tabix-indexed BED file
if bed_file:
log("INFO", "Breaking across positions from BED file")
bed = pysam.TabixFile(bed_file)
bed_contigs = [row.contig for row in bed.fetch(parser = pysam.asBed())]
if gff_file:
gff_file = os.path.abspath(gff_file)
it = make_gff_interval_tree(gff_file)
log("INFO", "Avoiding breaks within GFF intervals")
# Initialize the AGP file
agp = AGPFile(agp_fn, mode="w")
agp.add_pragma()
agp.add_comment("# AGP created by RagTag {}".format(get_ragtag_version()))

# Process the FASTA file
new_header_idx = 0
fai = pysam.FastaFile(asm_fn)
for header in sorted(fai.references):
seq = fai.fetch(header).upper()
seq_len = fai.get_reference_length(header)
if bed_file and header in bed_contigs:
splits_pos = [(row.start, row.end) for row in bed.fetch(header, parser=pysam.asBed()) if row.end <= seq_len]
else:
splits_pos = []
# Remove coordinates overlapping gff features
if gff_file:
#non_gff_breaks = dict()
if splits_pos:
new_splits = []
for split in splits_pos:
if it[header][split[1]]:
log("INFO", "Avoiding breaking %s at %d. This point intersects a feature in the gff file."
% (header, split[1]))
else:
new_splits.append(split)
if new_splits:
splits_pos = new_splits

if not splits_pos:
new_header = "seq{0:08}".format(new_header_idx)
new_header_idx += 1
agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+")
else:
pid = 1
# The sequence doesn't start with a gap
new_header = "seq{0:08}".format(new_header_idx)
# if there's a split coordinate before the end, that needs to be the "new" end
# Make any/all splits necessary
if splits_pos:
for i in range(0, len(splits_pos)):
# Only first iteration starts at 1 (object; component always starts at 1)
if i == 0:
obj_start=1
obj_end=splits_pos[i][1]
cmp_end=obj_end
log("INFO", "Found first split in %s at %s" % (header, obj_end))
else:
obj_start=obj_end+1
obj_end=splits_pos[i][1]
cmp_end=obj_end - obj_start + 1
log("INFO", "Found another split in %s at %s" % (header, obj_end))
agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
new_header_idx += 1
new_header = "seq{0:08}".format(new_header_idx)
pid += 1
# Add in final component to object - from last break till end of the sequence
obj_start=obj_end+1
obj_end=seq_len
cmp_end=obj_end - obj_start + 1
agp.add_seq_line(header, str(obj_start), str(obj_end), str(pid), "W", new_header, "1", str(cmp_end), "+")
else:
agp.add_seq_line(header, "1", str(gap_coords[0][0]), str(pid), "W", new_header, "1", str(gap_coords[0][0]), "+")
new_header_idx += 1
pid += 1

agp.write()

# Iterate over the AGP file and print the sequences
agp = AGPFile(agp_fn, mode="r")
for line in agp.iterate_lines():
if not line.is_gap:
obj, comp, obj_beg, obj_end = line.obj, line.comp, line.obj_beg, line.obj_end
print(">" + comp)
print(fai.fetch(obj, obj_beg-1, obj_end))

fai.close()


if __name__ == "__main__":
main()
30 changes: 26 additions & 4 deletions ragtag_splitasm.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,26 @@
SOFTWARE.
"""

import os
import re
import sys
import argparse

import pysam

from ragtag_correct import make_gff_interval_tree

from ragtag_utilities.utilities import get_ragtag_version
from ragtag_utilities.utilities import log
from ragtag_utilities.AGPFile import AGPFile


def main():
parser = argparse.ArgumentParser(description='Split sequencs at gaps', usage="ragtag.py splitasm <asm.fa>")
parser = argparse.ArgumentParser(description='Split sequences at gaps', usage="ragtag.py splitasm <asm.fa>")
parser.add_argument("asm", metavar="<asm.fa>", default="", type=str, help="assembly fasta file (uncompressed or bgzipped)")
parser.add_argument("-n", metavar="INT", type=int, default=0, help="minimum gap size [0]")
parser.add_argument("-o", metavar="PATH", type=str, default="ragtag.splitasm.agp", help="output AGP file path [./ragtag.splitasm.agp]")
parser.add_argument("--gff", metavar="<features.gff>", type=str, default="", help="don't break sequences within gff intervals [null]")

# Parse the command line arguments
args = parser.parse_args()
Expand All @@ -50,7 +55,11 @@ def main():
asm_fn = args.asm
min_gap_size = args.n
agp_fn = args.o

gff_file = args.gff
if gff_file:
gff_file = os.path.abspath(gff_file)
it = make_gff_interval_tree(gff_file)
log("INFO", "Avoiding breaks within GFF intervals")
# Initialize the AGP file
agp = AGPFile(agp_fn, mode="w")
agp.add_pragma()
Expand All @@ -63,8 +72,21 @@ def main():
seq = fai.fetch(header).upper()
seq_len = fai.get_reference_length(header)
gap_coords = [(i.start(), i.end()) for i in re.finditer(r'N+', seq) if i.end() - i.start() > min_gap_size]

if not gap_coords:
# Remove coordinates overlapping gff features
if gff_file:
#non_gff_breaks = dict()
new_breaks = []
for gap in gap_coords:
if it[header][gap[0]] or it[header][gap[1]]:
log("INFO", "Avoiding breaking %s between %d-%d. This point intersects a feature in the gff file."
% (header, gap[0], gap[1]))
else:
new_breaks.append(gap)
if new_breaks:
#non_gff_breaks[header] = new_breaks
gap_coords = new_breaks

if not gap_coords and not splits_pos:
new_header = "seq{0:08}".format(new_header_idx)
new_header_idx += 1
agp.add_seq_line(header, "1", seq_len, "1", "W", new_header, "1", seq_len, "+")
Expand Down
76 changes: 55 additions & 21 deletions ragtag_update_gff.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -28,60 +28,92 @@
import sys
import argparse
from collections import defaultdict
import re

from intervaltree import IntervalTree

from ragtag_utilities.utilities import log, get_ragtag_version
from ragtag_utilities.AGPFile import AGPFile


def sub_update(gff_file, agp_file):
def sub_update(gff_file, agp_file, is_split):
# Make a dictionary associating each original sequence with an interval tree of component sequences
trans = defaultdict(IntervalTree)
agp = AGPFile(agp_file, mode="r")
for agp_line in agp.iterate_lines():

# Check that the agp file looks correct for this task
if agp_line.orientation == "-":
raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.")
if not agp_line.comp_type == "W":
if not agp_line.comp_type == "W" and not is_split:
raise ValueError("The placement BED file is not formatted correctly. All lines should be WGS contig (W).")
if agp_line.is_gap:
if agp_line.is_gap and not is_split:
raise ValueError("There should be no gaps in the correction AGP file.")
# gaps don't have the orientation attribute so this check should come last
if not is_split:
if agp_line.orientation == "-":
raise ValueError("The placement BED file is not formatted correctly. No sequences should be reverse complemented for misassembly correction.")

start, end = agp_line.obj_beg - 1, agp_line.obj_end
trans[agp_line.obj][start:end] = agp_line.comp
# Cant adjust gap comps because gaps aren't assembled into object
if agp_line.comp_type == "W":
start, end = agp_line.obj_beg - 1, agp_line.obj_end
trans[agp_line.obj][start:end] = agp_line.comp

# Iterate through the gff intervals and update them according to trans
with open(gff_file, "r") as f:
attr_id = re.compile('(?<=ID=).*')
attr_parent = re.compile('(?<=Parent=).*')
ovlp_ids = []
for line in f:
line = line.rstrip()
if line.startswith("#"):
print(line) # Print this comment line
else:
fields = line.split("\t")
h, s, e = fields[0], int(fields[3]), int(fields[4])
attributes = fields[8]
parent = None
feat_id = None
for attr in attributes.split(";"):
feat_id_matches = attr_id.findall(attr)
parent_matches = attr_parent.findall(attr)
if feat_id_matches:
feat_id = feat_id_matches
if parent_matches:
parent = parent_matches


s -= 1 # Keep everything zero-indexed

if h not in trans:
raise ValueError("Inconsistent input files.")

ovlps = trans[h][s:e]
if len(ovlps) > 1:
raise ValueError(
"%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
)
#raise ValueError(
# "%s:%d-%d in the gff file overlaps two sub-sequences in the placement file. Make sure to run 'ragtag.py correct' with '--gff'" % (h, s, e)
#)
log("WARNING", "%s:%d-%d in the gff file overlaps two sub sequences in the placement file. Skipping %s and any child features. To retain feature, include '--gff' and re-run 'correct' or 'splitasm'" % (h, s, e, feat_id))
if feat_id:
ovlp_ids.append(feat_id)
continue
if len(ovlps) < 1:
raise ValueError("The placement BED file is not formatted correctly.")

# Get the data from the overlapping interval and print the new line
o = list(ovlps)[0]
new_s = s - o.begin
new_e = e - o.begin
fields[0] = o.data
fields[3] = str(new_s + 1) # back to one-based indexing for gff format
fields[4] = str(new_e)
print("\t".join(fields))
# Check if feat needs to be skipped because of ID or parent match,
# complex solution ensuring orphan feats aren't added e.g. parent gene overlaps but not its stop codon
if parent and parent in ovlp_ids:
log("WARNING", "Parent %s already excluded for overlapping two sub sequences. Skipping %s." % (parent, feat_id))
# To access any potential level 3 feats, add parent to ID exclusion list
ovlp_ids.append(feat_id)
continue
else:
# Get the data from the overlapping interval and print the new line
o = list(ovlps)[0]
new_s = s - o.begin
new_e = e - o.begin
fields[0] = o.data
fields[3] = str(new_s + 1) # back to one-based indexing for gff format
fields[4] = str(new_e)
print("\t".join(fields))


def sup_update(gff_file, agp_file):
Expand Down Expand Up @@ -132,10 +164,11 @@ def sup_update(gff_file, agp_file):


def main():
parser = argparse.ArgumentParser(description="Update gff intervals given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
parser = argparse.ArgumentParser(description="Update gff intervals from given a RagTag AGP file", usage="ragtag.py updategff [-c] <genes.gff> <ragtag.agp>")
parser.add_argument("gff", nargs='?', default="", metavar="<genes.gff>", type=str, help="gff file")
parser.add_argument("agp", nargs='?', default="", metavar="<ragtag.*.agp>", type=str, help="agp file")
parser.add_argument("-c", action="store_true", default=False, help="update for misassembly correction (ragtag.correction.agp)")
parser.add_argument("-s", action="store_true", default=False, help="update for assembly splitting from RagTag")

args = parser.parse_args()

Expand All @@ -149,9 +182,10 @@ def main():
gff_file = os.path.abspath(args.gff)
agp_file = os.path.abspath(args.agp)
is_sub = args.c
is_split = args.s

if is_sub:
sub_update(gff_file, agp_file)
if is_sub or is_split:
sub_update(gff_file, agp_file, is_split)
else:
sup_update(gff_file, agp_file)

Expand Down