From 1a3affdb448c873861f8146f31ad8446f9f813c1 Mon Sep 17 00:00:00 2001
From: Dmitry Antipov
Date: Fri, 16 Aug 2024 17:35:46 -0400
Subject: [PATCH] Fix pore-c pipeline

---
 src/Snakefiles/8-hicPipeline.sm               |  9 +-
 src/main.mk                                   |  4 +-
 .../{hicverkko.py => launch_phasing.py}       |  0
 .../{rdna_scaff.py => launch_scaffolding.py}  |  9 +-
 src/scripts/scaffolding/scaffold_graph.py     | 82 ++++++++-----------
 5 files changed, 46 insertions(+), 58 deletions(-)
 rename src/scripts/{hicverkko.py => launch_phasing.py} (100%)
 rename src/scripts/{rdna_scaff.py => launch_scaffolding.py} (85%)

diff --git a/src/Snakefiles/8-hicPipeline.sm b/src/Snakefiles/8-hicPipeline.sm
index 7630b2b8..e2b219ec 100644
--- a/src/Snakefiles/8-hicPipeline.sm
+++ b/src/Snakefiles/8-hicPipeline.sm
@@ -510,7 +510,7 @@ cd 8-hicPipeline
 cat > ./hic_phasing.sh <
 ./hic_scaff.sh <

diff --git a/src/main.mk b/src/main.mk
--- a/src/main.mk
+++ b/src/main.mk
     scripts/circularize_ctgs.py -> ../lib/verkko/scripts/circularize_ctgs.py \
     scripts/cluster.py -> ../lib/verkko/scripts/cluster.py \
     scripts/graph_functions.py -> ../lib/verkko/scripts/graph_functions.py \
-    scripts/hicverkko.py -> ../lib/verkko/scripts/hicverkko.py \
+    scripts/launch_phasing.py -> ../lib/verkko/scripts/launch_phasing.py \
     scripts/parse_sam_pairs.py -> ../lib/verkko/scripts/parse_sam_pairs.py \
     scripts/hic_prefilter.py -> ../lib/verkko/scripts/hic_prefilter.py \
-    scripts/rdna_scaff.py -> ../lib/verkko/scripts/rdna_scaff.py \
+    scripts/launch_scaffolding.py -> ../lib/verkko/scripts/launch_scaffolding.py \
     scripts/remove_nodes_add_telomere.py -> ../lib/verkko/scripts/remove_nodes_add_telomere.py \
     scripts/scaffolding/logger_wrap.py -> ../lib/verkko/scripts/scaffolding/logger_wrap.py \
     scripts/scaffolding/match_graph.py -> ../lib/verkko/scripts/scaffolding/match_graph.py \

diff --git a/src/scripts/hicverkko.py b/src/scripts/launch_phasing.py
similarity index 100%
rename from src/scripts/hicverkko.py
rename to src/scripts/launch_phasing.py

diff --git a/src/scripts/rdna_scaff.py b/src/scripts/launch_scaffolding.py
similarity index 85%
rename from src/scripts/rdna_scaff.py
rename to src/scripts/launch_scaffolding.py
index 43242005..82f44dd5 100755
--- a/src/scripts/rdna_scaff.py
+++ b/src/scripts/launch_scaffolding.py
@@ -18,7 +18,7 @@
 debug = False
 run_dir = sys.argv[1]
-
+porec = (sys.argv[2] == "True")
 #TODO: deprectated
 telomere_script = "/data/antipovd2/devel/scripts/verkko_output_analysis//getOnlyTelomere.sh"
 hap_assign_script = "/data/antipovd2/devel/scripts/verkko_output_analysis//get.sh"
@@ -50,7 +50,10 @@
 #path_mashmap = os.path.join(hicrun_dir, "paths2chm13100K.mashmap")
 rukki_path_storage = path_storage.PathStorage(G)
 rukki_path_storage.readFromFile(old_rukki_tsv_file)
-#sf.try_to_scaff(rukki_paths, telomere_locations_file, os.path.join(hicrun_dir, "hic_mapping.byread.output"), os.path.join(hicrun_dir, "unitigs.matches"), G, indirectG, uncompressed_nodes)
-sg = scaffold_graph.ScaffoldGraph(rukki_path_storage, telomere_locations_file, os.path.join(hicrun_dir, "hic_nodefiltered.bam"), os.path.join(hicrun_dir, "unitigs_nonhpc50.mashmap"), G, uncompressed_nodes, path_mashmap, logger)
+if porec:
+    read_aln = os.path.join(hicrun_dir, "hic_mapping.byread.output")
+else:
+    read_aln = os.path.join(hicrun_dir, "hic_nodefiltered.bam")
+sg = scaffold_graph.ScaffoldGraph(rukki_path_storage, telomere_locations_file, read_aln, os.path.join(hicrun_dir, "unitigs_nonhpc50.mashmap"), G, uncompressed_nodes, path_mashmap, porec, logger)
 res = sg.generateScaffolds()

diff --git a/src/scripts/scaffolding/scaffold_graph.py b/src/scripts/scaffolding/scaffold_graph.py
index f625b6cd..9e3eff34 100755
--- a/src/scripts/scaffolding/scaffold_graph.py
+++ b/src/scripts/scaffolding/scaffold_graph.py
@@ -112,7 +112,7 @@ class ScaffoldGraph:
     #to check whether paths have similar lengths, in cheating t2t connection
     SIMILAR_LEN_FRAC = 0.7

-    def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, matches_file, G, uncompressed_fasta, path_mashmap, logger):
+    def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, matches_file, G, uncompressed_fasta, path_mashmap, porec, logger):
         self.logger = logger_wrap.UpdatedAdapter(logger, self.__class__.__name__)
         self.rukki_paths = rukki_paths
         self.uncompressed_lens = gf.get_lengths(uncompressed_fasta)
@@ -126,17 +126,15 @@ def __init__(self, rukki_paths, telomere_locations_file, hic_alignment_file, mat
         self.logger.debug(self.tel_nodes)
         self.match_graph = match_graph.MatchGraph(matches_file, G, -239239239, ScaffoldGraph.MATCHGRAPH_LONG_NODE, ScaffoldGraph.MATCHGRAPH_MIN_ALIGNMENT, logger)
-
-
-        all_connections, unique_connections = self.get_connections_bam(hic_alignment_file, interesting_nodes, True)
-
+        if porec:
+            self.logger.info ("Loading pore-c alignments")
+            all_connections, unique_connections = self.get_connections_porec(hic_alignment_file, True)
+        else:
+            self.logger.info ("Loading Hi-C alignments")
+            all_connections, unique_connections = self.get_connections_bam(hic_alignment_file, True)
         self.output_basename = "scaff_rukki.paths"
-
-        #presaved_pathscores = self.loadPresavedScores("precomputed.pathscores")
-
-        #Used for scaffolding starts and debug,
         telomeric_ends = self.getTelomericEnds()
         self.hic_alignment_file = hic_alignment_file
@@ -210,15 +208,6 @@ def isHaploidPath(self, paths, nor_path_id):
             nor_node = or_node.strip("-+")
             for next in self.match_graph.getHomologousNodes(nor_node, True):
-                ''''
-                if len (nodes_to_path_ids[next]) > 1:
-                    #soemthing weird, ignoring
-                    continue
-                next_id = nodes_to_path_ids[next][0]
-                if not (next_id in homs):
-                    homs[next_id] = 0
-                homs[next_id] += self.match_graph.getEdgeAttribute(nor_node, next, 'homology_len')
-                '''
                 total_hom += self.match_graph.getEdgeAttribute(nor_node, next, 'homology_len')
         path_len = paths.getLength(nor_path_id)
         #TODO: should exclude homozygous nodes here
@@ -380,7 +369,6 @@ def isNextByRef(self, from_path_id, to_path_id):
                 if aligned[mid].query_len * ScaffoldGraph.INTERMEDIATE_PATH_FRACTION < inconsistency_len:
                     return False
         return True
-        return False

     def forbiddenPair(self, from_path_id, to_path_id):
@@ -834,44 +822,38 @@ def outputScaffolds(self, final_paths):
             tsv_file.write(final_paths.getPathTsv(path_id) + "\n")
             gaf_file.write(final_paths.getPathGaf(path_id) + "\n")
         return
-
-
    #returns: dict {(start_id, end_id):[[start_pos1, end_pos1]]}. Coords not compressed!
-    def get_connections(self, alignment_file, use_multimappers:bool):
+    #TODO: this works with hic_mapping.byread.output file that should be exterminated after phasing refactoring
+    def get_connections_porec(self, alignment_file, use_multimappers:bool):
         res = {}
+        unique_storage = {}
         #A01660:39:HNYM7DSX3:1:1101:1696:29982   utig4-73  utig4-1056      1       16949880        78591191
         ind = 0
         for line in open (alignment_file):
             ind += 1
-            if (ind % 1000000 == 0):
-                self.logger.debug (f"Processed {ind} alignments")
+            if (ind % 10000000 == 0):
+                self.logger.debug (f"Processed {ind} pore-c alignments")
             arr = line.split()
-            if not (use_multimappers) and (arr[1].find(",") != -1 or arr[2].find(",") != -1):
-                continue
-            first = arr[1].split(",")
-            second = arr[2].split(",")
-            first_coords = arr[4].split(",")
-            second_coords = arr[5].split(",")
-            weight = 1 / (len (first) * len(second))
-            #efficiently 1/2, 1/3, 1/4, 2/2 are allowed for now.
-            if weight < 0.24:
-                continue
-
-            for i in range (0, len(first)):
-                node_f = first[i]
-                for j in range (0, len(second)):
-                    node_s = second[j]
-                    if not (node_f, node_s) in res:
-                        res[(node_f, node_s)] = []
-                    next = [int(first_coords[i]), int(second_coords[j]), weight]
-                    res[(node_f, node_s)].append(next)
-
-                    if not (node_s, node_f) in res:
-                        res[(node_s, node_f)] = []
-                    res[(node_s, node_f)].append([int(second_coords[j]), int(first_coords[i]), weight])
-        return res
-
-    def get_connections_bam(self, bam_filename, interesting_names, use_multimappers:bool):
+            #columns 1 and 2 are node names (see the format comment above), columns 4 and 5 their coordinates
+            first = arr[1]
+            second = arr[2]
+            first_coords = int(arr[4])
+            second_coords = int(arr[5])
+            #canonical ordering so that (a, b) and (b, a) accumulate in the same bucket
+            if first > second:
+                first, second = second, first
+                first_coords, second_coords = second_coords, first_coords
+            weight = self.INT_NORMALIZATION
+            names_pair = (first, second)
+            coords_pair = (first_coords, second_coords)
+            if not names_pair in res:
+                res[names_pair] = []
+            if not names_pair in unique_storage:
+                unique_storage[names_pair] = []
+            res[names_pair].append((coords_pair[0], coords_pair[1], weight))
+            unique_storage[names_pair].append((coords_pair[0], coords_pair[1], weight))
+        self.logger.debug (f"Finished processing {ind} pore-c alignments")
+        return res, unique_storage
+
+    def get_connections_bam(self, bam_filename, use_multimappers:bool):
         res = {}
         unique_storage = {}
         #A01660:39:HNYM7DSX3:1:1101:1696:29982   utig4-73  utig4-1056      1       16949880        78591191
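
For reference, a minimal standalone sketch (not part of the patch) of the pairing logic that get_connections_porec applies to one line of the byread.output format quoted in its comment. The helper name add_porec_connection, the defaultdict, and the weight value are illustrative only; the patch takes the weight from the ScaffoldGraph constant INT_NORMALIZATION.

from collections import defaultdict

INT_NORMALIZATION = 1  # placeholder weight, not the class constant

def add_porec_connection(connections, line):
    # readname  node1  node2  count  coord1  coord2
    arr = line.split()
    first, second = arr[1], arr[2]
    first_coord, second_coord = int(arr[4]), int(arr[5])
    # canonical ordering: (a, b) and (b, a) land under the same key,
    # mirroring the swap in get_connections_porec
    if first > second:
        first, second = second, first
        first_coord, second_coord = second_coord, first_coord
    connections[(first, second)].append((first_coord, second_coord, INT_NORMALIZATION))

connections = defaultdict(list)
add_porec_connection(connections,
    "A01660:39:HNYM7DSX3:1:1101:1696:29982 utig4-73 utig4-1056 1 16949880 78591191")
print(dict(connections))
# {('utig4-1056', 'utig4-73'): [(78591191, 16949880, 1)]}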