Skip to content

Commit

Permalink
correctness fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Dmitry-Antipov committed Aug 8, 2024
1 parent a372038 commit 03eb016
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 8 deletions.
5 changes: 3 additions & 2 deletions src/scripts/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,8 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une
#TODO: move constants to some more relevant place
MIN_LEN = 200000 # best result so far with 200000

MAX_GRAPH_DIST = 10000000 # hi-c links over 10M are believed to be useless
#likely do not need
#MAX_GRAPH_DIST = 100000000 # hi-c links over 10M are believed to be useless

KLIN_STARTS = 1000 # number of different starts of kernighan lin
KLIN_ITER = 10000 # number of iterations inside kernighan lin
Expand Down Expand Up @@ -398,7 +399,7 @@ def run_clustering (graph_gfa, mashmap_sim, hic_byread, output_dir, no_rdna, une
#TODO: likely this is not needed anymore since we already added links between homologous edges.
for e0like in similar_edges[0]:
for e1like in similar_edges[1]:
if e0like in dists and e1like in dists[e0like] and dists[e0like][e1like] < MAX_GRAPH_DIST + G.nodes[e1like]['length']:
if e0like in dists and e1like in dists[e0like]: #and dists[e0like][e1like] < MAX_GRAPH_DIST + G.nodes[e1like]['length']:
C.add_edge(e[0], e[1], weight=hicGraph[e[0]][e[1]]['weight'])
break
logging_f.write(f'Currently {C.number_of_nodes()} in current component\n')
Expand Down
11 changes: 10 additions & 1 deletion src/scripts/hic_prefilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
both_middle_total = 0
one_middle_total = 0
basic_filtered = 0

sam_string_filtering = 0
XA_self_mapped = 0
output_bam = pysam.AlignmentFile('-', 'wb', template=bamfile)
for cur_read in bamfile:
Expand All @@ -36,6 +36,15 @@
if cur_read.is_unmapped or cur_read.reference_name == cur_read.next_reference_name or (cur_read.has_tag("XA") and cur_read.get_tag("XA").find(cur_read.next_reference_name+",") != -1) or (not cur_read.has_tag("XA") and cur_read.mapping_quality == 0):
basic_filtered += 1
continue
al_len = 0
for cigar_operation, length in cur_read.cigartuples:
if cigar_operation == 0:
al_len += length
if al_len * 2 < cur_read.query_length:
#sys.stderr.write (f"{cur_read.query_name} {cur_read.query_length} {al_len} {cur_read.cigarstring}")
sam_string_filtering += 1
continue

cur_name = cur_read.query_name
if cur_name == prev_name:
#TODO: poreC is not compatible with this now
Expand Down
28 changes: 23 additions & 5 deletions src/scripts/scaffolding/scaffold_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def nodeToPathDist(self, node, path, check_homologous):
return closest/2

#not to be used directly, with IDs we presave dists in pathsStorage
def pathDist(self, path_from:list[str], path_to:list[str], check_homologous:bool):
def pathDist(self, path_from:list, path_to:list, check_homologous:bool):
closest = ScaffoldGraph.TOO_FAR
add_dist = 0
for node in reversed(path_from):
Expand Down Expand Up @@ -415,11 +415,17 @@ def forbiddenPair(self, from_path_id, to_path_id):
nor_to_path_id = to_path_id.strip('-+')
#Heterogametic chromosomes get more links since there is no homologous one to absorb multiple alignments, so no connection of diploid and long enough ahploids
if nor_from_path_id in self.haploids and self.rukki_paths.getLength(nor_from_path_id) > ScaffoldGraph.LONG_HAPLOID_CUTOFF and not (nor_to_path_id in self.haploids):
self.logger.warning(f"Banning link from long haploid {from_path_id} to diploid {to_path_id}")
return True
if self.orPathIdDist(from_path_id, to_path_id, self.rukki_paths, True) > ScaffoldGraph.NEAR_PATH_END:
self.logger.warning(f"Banning distant link from long haploid {from_path_id} to diploid {to_path_id}")
return True
else:
self.logger.warning(f"Allowing link from long haploid {from_path_id} to diploid {to_path_id}, close in graph")
if nor_to_path_id in self.haploids and self.rukki_paths.getLength(nor_to_path_id) > ScaffoldGraph.LONG_HAPLOID_CUTOFF and not (nor_from_path_id in self.haploids):
self.logger.warning(f"Banning link from diploid {from_path_id} to long haploid {to_path_id}")
return True
if self.orPathIdDist(from_path_id, to_path_id, self.rukki_paths, True) > ScaffoldGraph.NEAR_PATH_END:
self.logger.warning(f"Banning distant link from diploid {from_path_id} to long haploid {to_path_id}")
return True
else:
self.logger.warning(f"Allowing link from diploid {from_path_id} to long haploid {to_path_id}, close in graph")
#relatively short fragments with telomere are special case, we may fail to detect orientation there but belive in telomere.
if self.rukki_paths.getLength(nor_to_path_id) <= ScaffoldGraph.SHORT_TEL_CUTOFF and self.scaffold_graph.nodes[to_path_id]['telomere'][0]:
self.logger.error(f"Banning link from {from_path_id} into short telomere, should not happen {to_path_id}")
Expand Down Expand Up @@ -526,13 +532,25 @@ def generateScaffolds(self):
if next_path_id == "NONE":
self.logger.info ("Failed to find regular extension for {next_path_id}, trying unique")
next_path_id = self.findNextPath(cur_path_id, nor_used_path_ids, "unique_weight")

if next_path_id == "DONE":
self.logger.info ("All done, stopped at telomere")
break
elif next_path_id == "NONE":
self.logger.info ("Failed to find extension, stopping")
break
else:
hom_before = False
for prev_path_id in cur_scaffold:
nor_new_path_id = gf.nor_path_id(next_path_id)
nor_prev_path_id = gf.nor_path_id(prev_path_id)
if self.match_graph.isHomologousPath([self.rukki_paths.getPathById(nor_new_path_id), self.rukki_paths.getPathById(nor_prev_path_id)], [self.rukki_paths.getLength(nor_new_path_id), self.rukki_paths.getLength(nor_prev_path_id)]):
#TODO: really not normal if we see that best extension is homologous to some path in scaffold, deserves investigation
self.logger.warning(f"Trying to extend, had homologous path in same scaffold before! {nor_new_path_id} {nor_prev_path_id}")
hom_before = True
if hom_before:
self.logger.info (f"Homologous paths before in scaffold, not extending {cur_path_id} with {next_path_id}")
break
self.logger.info (f"Extending {cur_path_id} with {next_path_id}")

#possibly not do so with paths of length one? They can be more successful in other direction
Expand Down

0 comments on commit 03eb016

Please sign in to comment.