Skip to content

Commit

Permalink
Implement Chop (#188)
Browse files Browse the repository at this point in the history
  • Loading branch information
susan-garry authored Sep 12, 2024
1 parent eace30f commit ccf1911
Show file tree
Hide file tree
Showing 11 changed files with 305 additions and 17 deletions.
30 changes: 28 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ jobs:

- name: Pull odgi container
run: |
docker pull quay.io/biocontainers/odgi:0.8.3--py310h6cc9453_0
docker tag quay.io/biocontainers/odgi:0.8.3--py310h6cc9453_0 odgi
docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1
docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi
- name: Install odgi alias
run: |
mkdir -p $HOME/.local/bin
Expand Down Expand Up @@ -76,6 +76,32 @@ jobs:
- uses: actions/checkout@v4
- uses: actions-rust-lang/setup-rust-toolchain@v1

# Install slow-odgi.
- uses: actions/cache@v4
id: cache-uv
with:
path: ~/.cache/uv
key: ${{ runner.os }}-python-${{ matrix.python-version }}-uv
- name: Create and activate virtualenv with uv
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
uv venv
echo "VIRTUAL_ENV=.venv" >> $GITHUB_ENV
echo "$PWD/.venv/bin" >> $GITHUB_PATH
- name: Install Python tools
run: uv pip install -r requirements.txt

# Install odgi
- name: Pull odgi container
run: |
docker pull quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1
docker tag quay.io/biocontainers/odgi:0.8.6--py310hdf79db3_1 odgi
- name: Install odgi alias
run: |
mkdir -p $HOME/.local/bin
cp .github/odgi.sh $HOME/.local/bin/odgi
chmod a+x $HOME/.local/bin/odgi
# Install Turnt.
- uses: actions/setup-python@v5
with:
Expand Down
17 changes: 16 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,29 @@ endif
tests/%.gfa:
curl -Lo ./$@ $(GFA_URL)/$*.gfa

tests/%.og: tests/%.gfa
odgi build -g $< -o $@

.PHONY: fetch
fetch: $(TEST_FILES:%=tests/%.gfa)

fetch-og: $(TEST_FILES:%=tests/%.og)

.PHONY: test-slow-odgi
test-slow-odgi: fetch
make -C slow_odgi test

.PHONY: test-flatgfa
test-flatgfa:
test-flatgfa: fetch
cd flatgfa ; cargo build

turnt -e flatgfa_mem -e flatgfa_file -e flatgfa_file_inplace tests/*.gfa

-turnt --save -v -e chop_oracle_fgfa tests/*.gfa
turnt -v -e flatgfa_chop tests/*.gfa

-turnt --save -v -e odgi_depth tests/*.gfa
turnt -v -e flatgfa_depth tests/*.gfa

clean:
-rm tests/*.flatgfa tests/*.inplace.flatgfa tests/*.chop tests/*.depth tests/*.gfa tests/*.og
3 changes: 1 addition & 2 deletions bench/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ def hyperfine(cmds):
"hyperfine",
"--export-json",
tmp.name,
"--shell=none",
"--warmup=1",
"--min-runs=3",
"--max-runs=16",
Expand Down Expand Up @@ -223,7 +222,7 @@ def run_bench(graph_set, mode, tools, out_csv):
for tool in tools:
assert tool in runner.config["modes"][mode]["cmd"], f"unknown tool {tool}"
else:
# Use all supported tools by deefault.
# Use all supported tools by default.
tools = list(runner.config["modes"][mode]["cmd"].keys())

# Fetch all the graphs and convert them to both odgi and FlatGFA.
Expand Down
9 changes: 7 additions & 2 deletions bench/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,14 @@ convert = false
cmd.flatgfa = '{fgfa} -I {files[gfa]}'
cmd.slow_odgi = '{slow_odgi} norm {files[gfa]}'
cmd.odgi = '{odgi} view -g -i {files[gfa]}'
cmd.gfatools = "{gfatools} view {files[gfa]}"
cmd.gfatools = '{gfatools} view {files[gfa]}'

[modes.depth]
cmd.flatgfa = '{fgfa} -i {files[flatgfa]} depth'
cmd.odgi = '{odgi} depth -i {files[og]} -d'
cmd.slow_odgi = '{slow_odgi} depth {files[gfa]}'
cmd.slow_odgi = '{slow_odgi} depth {files[gfa]}'

[modes.chop]
cmd.flatgfa = '{fgfa} -i {files[flatgfa]} chop -c 3'
cmd.odgi = '{odgi} chop -i {files[og]} -c 3 -o -'
cmd.slow_odgi = '{slow_odgi} chop {files[gfa]} -n 3'
163 changes: 160 additions & 3 deletions flatgfa/src/cmds.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::flatgfa::{self, Handle, Segment};
use crate::pool::{self, Id, Store};
use crate::flatgfa::{self, Handle, Link, Orientation, Path, Segment};
use crate::pool::{self, Id, Span, Store};
use crate::{GFAStore, HeapFamily};
use argh::FromArgs;
use std::collections::{HashMap, HashSet};

Expand Down Expand Up @@ -201,7 +202,7 @@ impl<'a> SubgraphBuilder<'a> {

/// Add a single subpath from the given path to the subgraph.
fn include_subpath(&mut self, path: &flatgfa::Path, start: &SubpathStart, end_pos: usize) {
let steps = pool::Span::new(start.step, self.store.steps.next_id());
let steps = pool::Span::new(start.step, self.store.steps.next_id()); // why the next id?
let name = format!("{}:{}-{}", self.old.get_path_name(path), start.pos, end_pos);
self.store
.add_path(name.as_bytes(), steps, std::iter::empty());
Expand Down Expand Up @@ -245,6 +246,7 @@ impl<'a> SubgraphBuilder<'a> {

/// Translate a handle from the source graph to this subgraph.
fn tr_handle(&self, old_handle: flatgfa::Handle) -> flatgfa::Handle {
// TODO: is this just generating the handle or should we add it to the new graph?
flatgfa::Handle::new(self.seg_map[&old_handle.segment()], old_handle.orient())
}

Expand Down Expand Up @@ -318,3 +320,158 @@ pub fn depth(gfa: &flatgfa::FlatGFA) {
);
}
}

/// chop the segments in a graph into sizes of N or smaller
#[derive(FromArgs, PartialEq, Debug)]
#[argh(subcommand, name = "chop")]
pub struct Chop {
/// maximimum segment size.
// Use c in keeping with odgi convention
#[argh(option, short = 'c')]
c: usize,

/// compute new links
#[argh(switch, short = 'l')]
l: bool,
}

/// Chop a graph into segments of size no larger than c
/// By default, compact node ids
/// CIGAR strings, links, and optional Segment data are invalidated by chop
/// Generates a new graph, rather than modifying the old one in place
pub fn chop<'a>(
gfa: &'a flatgfa::FlatGFA<'a>,
args: Chop,
) -> Result<flatgfa::HeapGFAStore, &'static str> {
let mut flat = flatgfa::HeapGFAStore::default();

// when segment S is chopped into segments S1 through S2 (exclusive),
// seg_map[S.name] = Span(Id(S1.name), Id(S2.name)). If S is not chopped: S=S1, S2.name = S1.name+1
let mut seg_map: Vec<Span<Segment>> = Vec::new();
// The smallest id (>0) which does not already belong to a segment in `flat`
let mut max_node_id = 1;

fn link_forward(flat: &mut GFAStore<'static, HeapFamily>, span: &Span<Segment>) {
// Link segments spanned by `span` from head to tail
let overlap = Span::new_empty();
flat.add_links((span.start.index()..span.end.index() - 1).map(|idx| Link {
from: Handle::new(Id::new(idx), Orientation::Forward),
to: Handle::new(Id::new(idx + 1), Orientation::Forward),
overlap,
}));
}

// Add new, chopped segments
for seg in gfa.segs.all().iter() {
let len = seg.len();
if len <= args.c {
// Leave the segment as is
let id = flat.segs.add(Segment {
name: max_node_id,
seq: seg.seq,
optional: Span::new_empty(), // TODO: Optional data may stay valid when seg not chopped?
});
max_node_id += 1;
seg_map.push(Span::new(id, flat.segs.next_id()));
} else {
let seq_end = seg.seq.end;
let mut offset = seg.seq.start.index();
let segs_start = flat.segs.next_id();
// Could also generate end_id by setting it equal to the start_id and
// updating it for each segment that is added - only benefits us if we
// don't unroll the last iteration of this loop
while offset < seq_end.index() - args.c {
// Generate a new segment of length c
flat.segs.add(Segment {
name: max_node_id,
seq: Span::new(Id::new(offset), Id::new(offset + args.c)),
optional: Span::new_empty(),
});
offset += args.c;
max_node_id += 1;
}
// Generate the last segment
flat.segs.add(Segment {
name: max_node_id,
seq: Span::new(Id::new(offset), seq_end),
optional: Span::new_empty(),
});
max_node_id += 1;
let new_seg_span = Span::new(segs_start, flat.segs.next_id());
seg_map.push(new_seg_span);
if args.l {
link_forward(&mut flat, &new_seg_span);
}
}
}

// For each path, add updated handles. Then add the updated path
for path in gfa.paths.all().iter() {
let path_start = flat.steps.next_id();
let mut path_end = flat.steps.next_id();
// Generate the new handles
// Tentative to-do: see if it is faster to read Id from segs than to re-generate it?
for step in gfa.get_path_steps(path) {
let range = {
let span = seg_map[step.segment().index()];
std::ops::Range::from(span)
};
match step.orient() {
Orientation::Forward => {
// In this builder, Id.index() == seg.name - 1 for all seg
path_end = flat
.add_steps(range.map(|idx| Handle::new(Id::new(idx), Orientation::Forward)))
.end;
}
Orientation::Backward => {
path_end = flat
.add_steps(
range
.rev()
.map(|idx| Handle::new(Id::new(idx), Orientation::Backward)),
)
.end;
}
}
}

// Add the updated path
flat.paths.add(Path {
name: path.name,
steps: Span::new(path_start, path_end),
overlaps: Span::new_empty(),
});
}

// If the 'l' flag is specified, compute the links in the new graph
if args.l {
// For each link in the old graph, from handle A -> B:
// Add a link from
// (A.forward ? (A.end, forward) : (A.begin, backwards))
// -> (B.forward ? (B.begin, forward) : (B.end ? backwards))

for link in gfa.links.all().iter() {
let new_from = {
let old_from = link.from;
let chopped_segs = seg_map[old_from.segment().index()];
let seg_id = match old_from.orient() {
Orientation::Forward => chopped_segs.end - 1,
Orientation::Backward => chopped_segs.start,
};
Handle::new(seg_id, old_from.orient())
};
let new_to = {
let old_to = link.to;
let chopped_segs = seg_map[old_to.segment().index()];
let seg_id = match old_to.orient() {
Orientation::Forward => chopped_segs.start,
Orientation::Backward => chopped_segs.end - 1,
};
Handle::new(seg_id, old_to.orient())
};
flat.add_link(new_from, new_to, vec![]);
}
}

Ok(flat)
}
15 changes: 15 additions & 0 deletions flatgfa/src/flatgfa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ pub struct Path {
pub overlaps: Span<Span<AlignOp>>,
}

impl Path {
pub fn step_count(&self) -> usize {
self.steps.end.index() - self.steps.start.index()
}
}

/// An allowed edge between two oriented segments.
#[derive(Debug, FromBytes, FromZeroes, AsBytes, Clone, Copy)]
#[repr(packed)]
Expand Down Expand Up @@ -265,6 +271,10 @@ impl<'a> FlatGFA<'a> {
self.name_data[path.name].as_ref()
}

pub fn get_path_steps(&self, path: &Path) -> impl Iterator<Item = &Handle> {
self.steps[path.steps].iter()
}

/// Get a handle's associated segment.
pub fn get_handle_seg(&self, handle: Handle) -> &Segment {
&self.segs[handle.segment()]
Expand Down Expand Up @@ -353,6 +363,11 @@ impl<'a, P: StoreFamily<'a>> GFAStore<'a, P> {
self.steps.add(step)
}

/// Add a sequence of links.
pub fn add_links(&mut self, links: impl Iterator<Item = Link>) -> Span<Link> {
self.links.add_iter(links)
}

/// Add a link between two (oriented) segments.
pub fn add_link(&mut self, from: Handle, to: Handle, overlap: Vec<AlignOp>) -> Id<Link> {
self.links.add(Link {
Expand Down
25 changes: 24 additions & 1 deletion flatgfa/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use argh::FromArgs;
use flatgfa::flatgfa::FlatGFA;
use flatgfa::parse::Parser;
use flatgfa::{cmds, file, parse};
use flatgfa::pool::Store;
use flatgfa::{cmds, file, parse}; // TODO: hopefully remove at some point, this breaks a lot of principles

#[derive(FromArgs)]
/// Convert between GFA text and FlatGFA binary formats.
Expand Down Expand Up @@ -39,6 +40,7 @@ enum Command {
Position(cmds::Position),
Extract(cmds::Extract),
Depth(cmds::Depth),
Chop(cmds::Chop),
}

fn main() -> Result<(), &'static str> {
Expand Down Expand Up @@ -104,6 +106,27 @@ fn main() -> Result<(), &'static str> {
Some(Command::Depth(_)) => {
cmds::depth(&gfa);
}
Some(Command::Chop(sub_args)) => {
let store = cmds::chop(&gfa, sub_args)?;
// TODO: Ideally, find a way to encapsulate the logic of chop in `cmd.rs`, instead of
// defining here which values from out input `gfa` are needed by our final `flat` gfa.
// Here we are reference values in two different Stores to create this Flatgfa, and
// have not yet found a good rust-safe way to do this
let flat = flatgfa::FlatGFA {
header: gfa.header,
seq_data: gfa.seq_data,
name_data: gfa.name_data,
segs: store.segs.as_ref(),
paths: store.paths.as_ref(),
links: store.links.as_ref(),
steps: store.steps.as_ref(),
overlaps: store.overlaps.as_ref(),
alignment: store.alignment.as_ref(),
optional_data: store.optional_data.as_ref(),
line_order: store.line_order.as_ref(),
};
dump(&flat, &args.output);
}
None => {
// Just emit the GFA or FlatGFA file.
dump(&gfa, &args.output);
Expand Down
2 changes: 1 addition & 1 deletion flatgfa/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ impl<'a, P: flatgfa::StoreFamily<'a>> Parser<'a, P> {

/// Parse a GFA text file from an I/O stream.
pub fn parse_stream<R: BufRead>(mut self, stream: R) -> flatgfa::GFAStore<'a, P> {
// We can parse sements immediately, but we need to defer links and paths until we have all
// We can parse segments immediately, but we need to defer links and paths until we have all
// the segment names that they might refer to.
let mut deferred_links = Vec::new();
let mut deferred_paths = Vec::new();
Expand Down
Loading

0 comments on commit ccf1911

Please sign in to comment.