Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add conjugate links during GFA I/O #1423

Merged
merged 4 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/common/assembly_graph/core/graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ class DeBruijnGraph: public omnigraph::ObservableGraph<DeBruijnDataMaster> {
/// Range spanning link_begin() .. link_end() (non-const overload).
auto links() {
    return adt::make_range(link_begin(), link_end());
}
/// Range spanning link_begin() .. link_end() (const overload).
auto links() const {
    return adt::make_range(link_begin(), link_end());
}

/// Number of entries currently held in link_storage_.
size_t link_size() const {
    return link_storage_.size();
}

using base::AddVertex;
using base::AddEdge;

Expand Down
5 changes: 2 additions & 3 deletions src/common/io/graph/gfa_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ static void HandleLink(Links &links,
e2 = g.conjugate(e2);

links.emplace_back(e1, e2, record.overlap);
if (e1 != g.conjugate(e2))
links.emplace_back(g.conjugate(e2), g.conjugate(e1), record.overlap);
}

static void HandlePath(std::vector<GFAReader::GFAPath> &paths,
Expand Down Expand Up @@ -175,10 +177,7 @@ static std::pair<unsigned, bool> ProcessLinks(DeBruijnGraph &g, const Links &lin
if (simple) {
g.set_overlap(v1, ovl);
g.set_overlap(v2, ovl);
g.set_overlap(g.conjugate(v1), ovl);
g.set_overlap(g.conjugate(v2), ovl);
} else {
// FIXME: Correctly add conjugate links
LinkId link_idx = g.add_link(e1, e2, ovl);
g.add_link(v1, link_idx);
if (v1 != v2) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>path1
ACATGCTCGATATAACGGTTGCTCACCGGAGAGACGACTTTATAGTTCTTGTAATGGGACTGGCGGGAGGAGAGTTACTACGTGTGGTATATCGGGGTTGGATCAGAGTAATCTGAAACAAAGAAGGTAAGATGTTAGACGGGCGGCATCTATATATATAGGGATACAGTTAGAAGAATAATTGGGATGTATGGTACCTCGTCTGTTGCGGCGTCGAAGTAGCGGTTCCATTTATCGCTGCCGTGCGAAATTAAACTGCGTGGATGCTCTCTTTCCGAGCTCCAACTTCTGTCTTGGCCAGGCGGCATCACTGCGGATCCCCCTTTTTGTGAGGGCCAGTTCCGACCCGTTCACTACAAGACTAGACTGTTCACGTGGCAACCTAGATCCCCAGGCGTACGAGGTCAACTCCTCTCGGAACACCGAAGATGCCGTGGTTGCCACTTAGAAGGCTCTATGACTTTGCACCTATTTTTTGAGAGAGTTATGTCTTGCATCGA
>path2
GGTACCTTGGAGGGACAATTCGGTTACGGATTCATTCATCTTAAGGCAAGACGTTTCCGCGTGTCGCTGTGGTGGCTCTCCCTCGCGTAGTTTCTATACGGATCAGAGTAATCTGAAACAAAGAAGGTAAGATGTTAGACGGGCGGCATCTATATATATAGGGATACAGTTAGAAGAATAATTGGGATGTATGGTACCTCCGGGACTGTTGGTCTTCCCCAAGGGTTCACACGCCCACTTGCGACTGGAGCACGAAATTGTCCAGAATGGCTGCGCTTATGAGACTGGACCCTATCCTCGCGGATTTACGTAATGTTCTACGTACCTATTCCGGCGCCACCAGGTCATGTCCGGCTTGCGCACTTAATTGCGCTATGATGTCCGGAGGCAGCCGACCCAG
11 changes: 11 additions & 0 deletions src/test/debruijn/graph_fragments/v_overlaps/conjugate_bone.gfa
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
H VN:Z:1.0
S edge1 ACATGCTCGATATAACGGTTGCTCACCGGAGAGACGACTTTATAGTTCTTGTAATGGGACTGGCGGGAGGAGAGTTACTACGTGTGGTATATCGGGGTTGGATCAGAGTAATCTGAAACAAAGAAGGTAAGATGTTAGACGGGCGGCATCTATATATATAGGGATACAGTTAGAAGAATAATTGGGATGTATGGTACCTC
S edge2 GGTACCTTGGAGGGACAATTCGGTTACGGATTCATTCATCTTAAGGCAAGACGTTTCCGCGTGTCGCTGTGGTGGCTCTCCCTCGCGTAGTTTCTATACGGATCAGAGTAATCTGAAACAAAGAAGGTAAGATGTTAGACGGGCGGCATCTATATATATAGGGATACAGTTAGAAGAATAATTGGGATGTATGGTACCTC
S edge3 GATCAGAGTAATCTGAAACAAAGAAGGTAAGATGTTAGACGGGCGGCATCTATATATATAGGGATACAGTTAGAAGAATAATTGGGATGTATGGTACCTCGTCTGTTGCGGCGTCGAAGTAGCGGTTCCATTTATCGCTGCCGTGCGAAATTAAACTGCGTGGATGCTCTCTTTCCGAGCTCCAACTTCTGTCTTGGCCAGGCGGCATCACTGCGGATCCCCCTTTTTGTGAGGGCCAGTTCCGACCCGTTCACTACAAGACTAGACTGTTCACGTGGCAACCTAGATCCCCAGGCGTACGAGGTCAACTCCTCTCGGAACACCGAAGATGCCGTGGTTGCCACTTAGAAGGCTCTATGACTTTGCACCTATTTTTTGAGAGAGTTATGTCTTGCATCGA
S edge4 GATCAGAGTAATCTGAAACAAAGAAGGTAAGATGTTAGACGGGCGGCATCTATATATATAGGGATACAGTTAGAAGAATAATTGGGATGTATGGTACCTCCGGGACTGTTGGTCTTCCCCAAGGGTTCACACGCCCACTTGCGACTGGAGCACGAAATTGTCCAGAATGGCTGCGCTTATGAGACTGGACCCTATCCTCGCGGATTTACGTAATGTTCTACGTACCTATTCCGGCGCCACCAGGTCATGTCCGGCTTGCGCACTTAATTGCGCTATGATGTCCGGAGGCAGCCGACCCAG
L edge1 + edge3 + 100M
L edge4 - edge1 - 100M
L edge3 - edge2 - 100M
L edge4 - edge2 - 100M
P path1 edge1+,edge3+ 100M
P path2 edge2+,edge4+ 100M
21 changes: 21 additions & 0 deletions src/test/debruijn/graph_fragments/v_overlaps/conjugate_bone.graph
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
bone
4
edge1 200
edge2 200
edge3 400
edge4 300
1
vertex1 100
4
edge1 vertex1
edge2 vertex1
vertex1 edge3
vertex1 edge4
4
edge1 edge3 100
edge1 edge4 100
edge2 edge3 100
edge2 edge4 100
2
path1 edge1 edge3
path2 edge2 edge4
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1
split edge1 edge3
2
path1 2 edge1 edge3
path2 2 edge2 edge4
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>path1
GCGGATAGACTTGTGGCAAAAGATGTAAGCTTCATTGTCTGCTAAAGCAGTTCTCAGAACGCCGAAACAGTACCAAAGCCAATGTGCTTCGGTATACACGGGCCTGAGCGGGCCGCACACTCGACAACGGTCGAACTAGTTATCGTATGAGGTCACTCGAGGATAAGCGTAATACGAGTGTCTCTTACGACCACCATGGGTGAGGGGGCCCCGTCGTGGACAAATTTTCTTGGTTTACCCGGAACTAGAAATTAAGTCACGCTTATGTGATAGCTATTGATCTGTAGGATCTTACCGTGTCTTATAGAACCACTCTACATGGGTATTGGGCAGGGTTGAGTTCAACTGTCCTCGGCGGACAGGTATTCTGACGCGGTAAAAGTCCTATCCTCCTCTATGTTGATATGGGACAGGCACGCAGCGAGATATACTGATTAGAGGTATTACACTGACATCAGGAAACATCACAGAATAGGGCCCAGGAGAATACGAGAAAGACAGAAGCAGAATTCAGTTTACAAACCCCGACAACCTGCTTCCGAAAGCGTTCTGCCGGAAAGCTTAGTTCGCGATCAAAGTAAGGCTGTGTGAAGCAAATTCATAAGTGGCCTTTCATGCGCCGTTTGAAGTGTCACAATGAGTCGGGACAGCTCGCACGAAGCTGAATCAATTATGCTAAACCGTGAGACGATATCCTAAGAAGGAATGCAAGGCCGAATGGAATCCTCCGTACTATTCATGTCTTCCCCAGACTGATCGTGACAACCTGGAGGTACTTTTTTGCATCAGTAGAGTTTTGC
>path2
AACCCGGAGAGAACCCAACTTGAAGAGAGAGCCAGGATGTCGGTCAAACAGTTTTCCCTGTTTTCACCCGCGACGAAACGAATCCAGCTCGTAAAATGAGTCAATGGGGAACCACGAAAATGTTCTGACAAAATACGGAACCGCTTGCTACAGGGAGGGAACTAAACGATAAGCATATTTGAACGACAGCTACCAGTAAATGAGGGGGCCCCGTCGTGGACAAATTTTCTTGGTTTACCCGGAACTAGAAATTAAGTCACGCTTATGTGATAGCTATTGATCTGTAGGATCTTACCGTGTCTTATAGAACCACTCTACATGGGTATTGGGCAGGGTTGAGTTCAACTGTCCTCGGCGGACAGGTATTCTGACGCGGTAAAAGTCCTATCCTCCTCTATGTGGCACCGACGACACCTTAGGCACCTGGAATAGAGTAGCATTTGGTGGCCATGCAATCCGCTTGTTCTACACCCCGTACGTCATTGCACATTTCCCTGGAGTATTATTTTATCGTCGCCGGATGGATCCTGCGGTGTTCGTTGCCAAGCGGAGAGCATCGACGGGTGCTGAGGTTCCCTGAGGCTTACAATGTGTCGGTGCTCACCCCGTGAATTATAGAAAACCTGTTTGGTGGGACACATCTTACGGGGATGTATCATAAACAATTCTGCAGATGAAGCACTGCCTAATCACCATCCGCTGATATGGGACAGGCACGCAGCGAGATATACTGATTAGAGGTATTACACTGACATCAGGAAACATCACAGAATAGGGCCCAGGAGAATACGAGAAAGACA
15 changes: 15 additions & 0 deletions src/test/debruijn/graph_fragments/v_overlaps/conjugate_triple.gfa
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
H VN:Z:1.0
S edge1_in GCGGATAGACTTGTGGCAAAAGATGTAAGCTTCATTGTCTGCTAAAGCAGTTCTCAGAACGCCGAAACAGTACCAAAGCCAATGTGCTTCGGTATACACGGGCCTGAGCGGGCCGCACACTCGACAACGGTCGAACTAGTTATCGTATGAGGTCACTCGAGGATAAGCGTAATACGAGTGTCTCTTACGACCACCATGGGTGAGGGGGCCCCGTCGTGGACAAATTTTCTTGGTTTACCCGGAACTAGAAATTAAGTCACGCTTATGTGATAGCTATTGATCTGTAGGATCTTACCGTGTCTTATAGAACCACTCTACATGGGTATTGGGCAGGGTTGAGTTCAACTGTCCTCGGCGGACAGGTATTCTGACGCGGTAAAAGTCCTATCCTCCTCTATGT
S edge2_in AACCCGGAGAGAACCCAACTTGAAGAGAGAGCCAGGATGTCGGTCAAACAGTTTTCCCTGTTTTCACCCGCGACGAAACGAATCCAGCTCGTAAAATGAGTCAATGGGGAACCACGAAAATGTTCTGACAAAATACGGAACCGCTTGCTACAGGGAGGGAACTAAACGATAAGCATATTTGAACGACAGCTACCAGTAAATGAGGGGGCCCCGTCGTGGACAAATTTTCTTGGTTTACCCGGAACTAGAAATTAAGTCACGCTTATGTGATAGCTATTGATCTGTAGGATCTTACCGTGTCTTATAGAACCACTCTACATGGGTATTGGGCAGGGTTGAGTTCAACTGTCCTCGGCGGACAGGTATTCTGACGCGGTAAAAGTCCTATCCTCCTCTATGT
S edge1_mid TGAGGGGGCCCCGTCGTGGACAAATTTTCTTGGTTTACCCGGAACTAGAAATTAAGTCACGCTTATGTGATAGCTATTGATCTGTAGGATCTTACCGTGTCTTATAGAACCACTCTACATGGGTATTGGGCAGGGTTGAGTTCAACTGTCCTCGGCGGACAGGTATTCTGACGCGGTAAAAGTCCTATCCTCCTCTATGTTGATATGGGACAGGCACGCAGCGAGATATACTGATTAGAGGTATTACACTGACATCAGGAAACATCACAGAATAGGGCCCAGGAGAATACGAGAAAGACA
S edge2_mid TGAGGGGGCCCCGTCGTGGACAAATTTTCTTGGTTTACCCGGAACTAGAAATTAAGTCACGCTTATGTGATAGCTATTGATCTGTAGGATCTTACCGTGTCTTATAGAACCACTCTACATGGGTATTGGGCAGGGTTGAGTTCAACTGTCCTCGGCGGACAGGTATTCTGACGCGGTAAAAGTCCTATCCTCCTCTATGTGGCACCGACGACACCTTAGGCACCTGGAATAGAGTAGCATTTGGTGGCCATGCAATCCGCTTGTTCTACACCCCGTACGTCATTGCACATTTCCCTGGAGTATTATTTTATCGTCGCCGGATGGATCCTGCGGTGTTCGTTGCCAAGCGGAGAGCATCGACGGGTGCTGAGGTTCCCTGAGGCTTACAATGTGTCGGTGCTCACCCCGTGAATTATAGAAAACCTGTTTGGTGGGACACATCTTACGGGGATGTATCATAAACAATTCTGCAGATGAAGCACTGCCTAATCACCATCCGCTGATATGGGACAGGCACGCAGCGAGATATACTGATTAGAGGTATTACACTGACATCAGGAAACATCACAGAATAGGGCCCAGGAGAATACGAGAAAGACA
S edge3_in ATTCCTCTGTACAGGACGCCTGCCCTTCCGTGTTAGCAATTTAGTTCGGATTTCGCTTTGGGAGAGCAACCTTCAGCTCAGGCATCATACGAAAATATTTAAAACGTTCCCCAAAGCACTACTTAGTACCTTTTACTGGTAAAAGGGGGCGGAAGTAACCGCCCTTTGCTTAGGCGTGCCAGATAGTCGCGTTAGACAGATATAACCACATGAGCTATACGAAGACAACGTTCTCAGTTTGGGTCGTTATATCCACCGGCGGCCTAAACGGGCCTATATTAATGGCGGGCGAGGGCGAGTCCTTTCGTCCGTTGGCTGCCTGGAGTGGAAGGGTACCACTGGGTCCTTGGGCCTCTCTTCGCTGGCCTATTAGGGTGTTGGCTGACCGGTCACAGAACTGTGATATGGGACAGGCACGCAGCGAGATATACTGATTAGAGGTATTACACTGACATCAGGAAACATCACAGAATAGGGCCCAGGAGAATACGAGAAAGACA
S edge1_out TGATATGGGACAGGCACGCAGCGAGATATACTGATTAGAGGTATTACACTGACATCAGGAAACATCACAGAATAGGGCCCAGGAGAATACGAGAAAGACAGAAGCAGAATTCAGTTTACAAACCCCGACAACCTGCTTCCGAAAGCGTTCTGCCGGAAAGCTTAGTTCGCGATCAAAGTAAGGCTGTGTGAAGCAAATTCATAAGTGGCCTTTCATGCGCCGTTTGAAGTGTCACAATGAGTCGGGACAGCTCGCACGAAGCTGAATCAATTATGCTAAACCGTGAGACGATATCCTAAGAAGGAATGCAAGGCCGAATGGAATCCTCCGTACTATTCATGTCTTCCCCAGACTGATCGTGACAACCTGGAGGTACTTTTTTGCATCAGTAGAGTTTTGC
S edge2_out TGATATGGGACAGGCACGCAGCGAGATATACTGATTAGAGGTATTACACTGACATCAGGAAACATCACAGAATAGGGCCCAGGAGAATACGAGAAAGACAGTATTTGTACGAGCGGTGTGAAGAAACAACGTGCACGTTCCTGTGCTTAAGGAGACGTGTCCATTTTGGGCTTGTCTTATCAAGTATATATTTTTATCTCATAGCTCATTTAGACGGAAGTCTAGTGCACATCGCTGCAGCCCTTCGAAACCTCGTCCAGTATCCCGCGCAGATGACTGGCACTAAGGTAAGCGTGTTGGTCTGTGGCAAAACGTAACCAGTTGGCGAAGCTTGGTATCCTACTGCGCCAATTTTCTCGGCAGTCTTACCCTTTCCAACCTCGCATGATCTCGGTCGTGTTTTTGCTCAATCTGGCTCGTACTGGTATTAAGCCGCATCCCAAGATTCTCGCGCGACTGAATTGACGGGGCGCAAGCGTAAACGGGAAAGAACATCTCCA
S edge3_out TGATATGGGACAGGCACGCAGCGAGATATACTGATTAGAGGTATTACACTGACATCAGGAAACATCACAGAATAGGGCCCAGGAGAATACGAGAAAGACAAGGATGCCTCATAGCTTACTGCACACTAAGAATCCATTGAACCTGAAAACTCGCAATTCCGACGCGACGTTGGGCGATGTCTAGCGCATGCTACCAATGGTACAATTCAATCAGACTTCCGGGTATAACGGTTATCAGCGTATGAACAATAGTTTTTCGTTTGATTGGTGACGTGCTAATACCCCAGGAGCCAGTGAGAGC
L edge1_mid - edge1_in - 200M
L edge2_in + edge2_mid + 200M
L edge1_out - edge1_mid - 100M
L edge3_in + edge3_out + 100M
P path1 edge1_in+,edge1_mid+,edge1_out+ 200M,100M
P path2 edge2_in+,edge2_mid+ 200M
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
triple_repeat
8
edge1_in 400
edge2_in 400
edge1_mid 300
edge2_mid 600
edge3_in 500
edge1_out 400
edge2_out 500
edge3_out 301
2
vertex1 200
vertex2 100
10
edge1_in vertex1
edge2_in vertex1
vertex1 edge1_mid
vertex1 edge2_mid
edge1_mid vertex2
edge2_mid vertex2
edge3_in vertex2
vertex2 edge1_out
vertex2 edge2_out
vertex2 edge3_out
4
edge1_in edge1_mid 200
edge2_in edge2_mid 200
edge1_mid edge1_out 100
edge3_in edge3_out 100
2
path1 edge1_in edge1_mid edge1_out
path2 edge2_in edge2_mid
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
2
split edge1_in edge1_mid
split edge1_mid edge1_out
2
path1 3 edge1_in edge1_mid edge1_out
path2 2 edge2_in edge2_mid
37 changes: 26 additions & 11 deletions src/test/debruijn/v_overlaps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,29 +82,42 @@ std::unordered_map<VertexId, std::string> CheckStructure(std::ifstream &graph_st
// Validates the links section of a test graph description against `graph`.
//
// Reads a link count from `graph_stream`, then that many
// "<first edge name> <second edge name> <overlap>" records. Edge names are
// resolved to EdgeIds through `id_mapper`. For each record the end vertex of
// the first edge is expected to equal the start vertex of the second edge.
// When that vertex is complex, the vertex's links are searched for the
// (first, second) pair and the conjugate vertex's links are searched for the
// (conjugate(second), conjugate(first)) pair.
//
// NOTE(review): this listing appears to interleave the removed and added lines
// of a diff without +/- markers (e.g. `vertex` is declared twice inside the
// loop body and the braces do not balance) — confirm against the real file
// before relying on the exact statement sequence shown here.
void CheckLinks(std::ifstream &graph_stream, const Graph &graph, const IdMapper &id_mapper) {
int num_links;
graph_stream >> num_links;
size_t checked_links = 0;
// Set once at least one record passes through a complex vertex.
bool is_complex = false;
for (int i = 0; i < num_links; ++i) {
std::string first_edge_name, second_edge_name;
// The overlap is read to consume the record; it is not otherwise used in the lines shown.
int overlap;
graph_stream >> first_edge_name >> second_edge_name >> overlap;
EdgeId first_edge = id_mapper[first_edge_name];
EdgeId second_edge = id_mapper[second_edge_name];
VertexId vertex = graph.EdgeEnd(first_edge);
EXPECT_EQ(vertex, graph.EdgeStart(second_edge));
EdgeId in_edge = id_mapper[first_edge_name];
EdgeId out_edge = id_mapper[second_edge_name];
EdgeId in_conjugate = graph.conjugate(in_edge);
EdgeId out_conjugate = graph.conjugate(out_edge);
// The linked edges must meet at a single vertex.
VertexId vertex = graph.EdgeEnd(in_edge);
EXPECT_EQ(vertex, graph.EdgeStart(out_edge));
// The conjugate of the outgoing edge must end at the conjugate vertex.
EXPECT_EQ(graph.conjugate(vertex), graph.EdgeEnd(out_conjugate));
if (graph.is_complex(vertex)) {
bool link_found = false;
bool conj_link_found = false;
// Look for the forward link among the links stored at this vertex.
for (const auto &link: graph.links(vertex)) {
if (graph.link(link).link.first == first_edge and graph.link(link).link.second == second_edge) {
auto link_in_edge = graph.link(link).link.first;
auto link_out_edge = graph.link(link).link.second;
if (link_in_edge == in_edge and link_out_edge == out_edge) {
link_found = true;
}
}
EXPECT_TRUE(link_found);
++checked_links;
} else {
checked_links++;
// Look for the mirrored link (conjugate(out) -> conjugate(in)) at the conjugate vertex.
for (const auto &link: graph.links(graph.conjugate(vertex))) {
auto link_in_edge = graph.link(link).link.first;
auto link_out_edge = graph.link(link).link.second;
if (link_in_edge == out_conjugate and link_out_edge == in_conjugate) {
conj_link_found = true;
}
}
EXPECT_TRUE(link_found && conj_link_found);
is_complex = true;
}
}
EXPECT_EQ(num_links, checked_links);
// Presumably every listed link is stored together with its conjugate, hence
// twice the listed count — verify against the GFA reader.
if (is_complex) {
EXPECT_EQ(num_links * 2, graph.link_size());
}
}

void PerformSplits(debruijn_graph::Graph &graph, std::ifstream &ops_stream, const IdMapper &id_mapper) {
Expand Down Expand Up @@ -216,5 +229,7 @@ void CheckGraphWithPaths(const std::filesystem::path &graph_basename) {

// Runs CheckGraphWithPaths over each variable-overlap graph fragment in turn.
TEST(VariableOverlaps, BasicOperations) {
    // Graph basenames, checked in the order listed.
    const char *const graph_basenames[] = {
        "src/test/debruijn/graph_fragments/v_overlaps/bone",
        "src/test/debruijn/graph_fragments/v_overlaps/conjugate_bone",
        "src/test/debruijn/graph_fragments/v_overlaps/conjugate_triple",
        "src/test/debruijn/graph_fragments/v_overlaps/triple_repeat",
    };
    for (const char *fragment : graph_basenames) {
        CheckGraphWithPaths(fragment);
    }
}