diff --git a/liftoff/align_features.py b/liftoff/align_features.py index d107424..598ac78 100644 --- a/liftoff/align_features.py +++ b/liftoff/align_features.py @@ -1,12 +1,14 @@ -from multiprocessing import Pool -import math from functools import partial -import numpy as np -from pyfaidx import Fasta, Faidx +import math +from multiprocessing import Pool +from os import path import subprocess + +import numpy as np +from pyfaidx import Faidx, Fasta import pysam + from liftoff import aligned_seg, liftoff_utils -from os import path def align_features_to_target(ref_chroms, target_chroms, args, feature_hierarchy, liftover_type, unmapped_features): @@ -54,14 +56,33 @@ def align_single_chroms(ref_chroms, target_chroms, threads, args, genome_size, l command = [minimap2_path, '-o', output_file, target_file, features_file] + args.mm2_options.split(" ") + [ "--split-prefix", split_prefix, '-t', threads_arg] subprocess.run(command) + modified_sam_header(output_file) else: minimap2_index = build_minimap2_index(target_file, args, threads_arg, minimap2_path) command = [minimap2_path, '-o', output_file, minimap2_index, features_file] + args.mm2_options.split(" ") + [ '-t', threads_arg] subprocess.run(command) + modified_sam_header(output_file) return output_file +def modified_sam_header(sam_file): + header_lines, alignment_lines = [], [] + with open(sam_file, 'r') as open_sam: + for eachline in open_sam.readlines(): + if eachline[0] == '@': + header_lines.append(eachline) + else: + alignment_lines.append(eachline) + modified_header_lines = [] + for header_line in header_lines: + if header_line not in modified_header_lines: + modified_header_lines.append(header_line) + with open(sam_file, 'w') as open_sam: + open_sam.writelines(modified_header_lines) + open_sam.writelines(alignment_lines) + + def get_features_file(ref_chroms, args, liftover_type, index): if ref_chroms[index] == args.reference and (liftover_type == "chrm_by_chrm" or liftover_type == "copies"): features_name = 'reference_all' diff --git a/liftoff/write_new_gff.py b/liftoff/write_new_gff.py index fdf9bf0..2c615a3 100644 --- a/liftoff/write_new_gff.py +++ b/liftoff/write_new_gff.py @@ -37,7 +37,7 @@ def add_to_copy_num_dict(parent, copy_num_dict): def add_attributes(parent, copy_num, args): parent.score = "." - parent.attributes["extra_copy_number"] = str(copy_num) + parent.attributes["extra_copy_number"] = [copy_num] parent.attributes["copy_num_ID"] = [parent.id + "_" + str(copy_num)] if float(parent.attributes["coverage"][0]) < args.a: parent.attributes["partial_mapping"] = ["True"] @@ -49,7 +49,7 @@ def build_parent_dict(child_features, parent_dict, final_parent): parent_child_dict = {} for child in child_features: if child.id not in parent_dict: - child.attributes["extra_copy_number"] = final_parent.attributes["extra_copy_number"][0] + child.attributes["extra_copy_number"] = final_parent.attributes["extra_copy_number"] if child.attributes["Parent"][0] in parent_child_dict: parent_child_dict[child.attributes["Parent"][0]].append(child) else: @@ -84,7 +84,7 @@ def make_gff_line(feature): if attr != "copy_id": value_str = "" for value in feature.attributes[attr]: - value_str += value + "," + value_str += str(value) + "," attributes_str += (attr + "=" + value_str[:-1] + ";") return feature.seqid + "\t" + feature.source + "\t" + feature.featuretype + "\t" + str(feature.start) + \ "\t" + str(feature.end) + "\t" + "." + "\t" + feature.strand + "\t" + "." + "\t" + attributes_str[:-1] @@ -97,7 +97,7 @@ def make_gtf_line(feature): if len(feature.attributes[attr]) >0: value_str = "" for value in feature.attributes[attr]: - value_str += value + "," + value_str += str(value) + "," attributes_str += (attr + " " + '"' + value_str[:-1] + '"' + "; ") return feature.seqid + "\t" + feature.source + "\t" + feature.featuretype + "\t" + str(feature.start) + \ - "\t" + str(feature.end) + "\t" + "." + "\t" + feature.strand + "\t" + "." + "\t" + attributes_str \ No newline at end of file + "\t" + str(feature.end) + "\t" + "." + "\t" + feature.strand + "\t" + "." + "\t" + attributes_str