Skip to content

Commit

Permalink
implement select with downsample and better error handling with anyhow
Browse files Browse the repository at this point in the history
  • Loading branch information
luizirber committed Jan 30, 2021
1 parent 86f568e commit e9603f3
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 24 deletions.
7 changes: 7 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1.0.38"
clap = "2.33.3"
env_logger = "0.8.2"
log = "0.4.14"
Expand Down
1 change: 1 addition & 0 deletions data/bat2-LU__AAACCTGAGCCACGCT-s100.sig

Large diffs are not rendered by default.

62 changes: 38 additions & 24 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ use std::io::{BufRead, BufReader, BufWriter};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicUsize, Ordering};

use anyhow::{anyhow, Context, Result};
use clap::arg_enum;
use log::info;
use rayon::prelude::*;
use sourmash::signature::Signature;
use sourmash::signature::{Signature, SigsTrait};
use sourmash::sketch::minhash::{
max_hash_for_scaled, HashFunctions, KmerMinHash, KmerMinHashBTree,
};
Expand Down Expand Up @@ -66,7 +67,7 @@ fn subtract<P: AsRef<Path>>(
scaled: usize,
hash_function: HashFunctions,
output: Option<P>,
) -> Result<(), Box<dyn std::error::Error>> {
) -> Result<()> {
info!("Loading queries");

let max_hash = max_hash_for_scaled(scaled as u64);
Expand Down Expand Up @@ -121,42 +122,55 @@ fn subtract<P: AsRef<Path>>(

let processed_sigs = AtomicUsize::new(0);

search_sigs.par_iter().for_each(|filename| {
search_sigs.par_iter().try_for_each(|filename| {
let i = processed_sigs.fetch_add(1, Ordering::SeqCst);
if i % 1000 == 0 {
info!("Processed {} sigs", i);
}

let mut search_mh: Option<KmerMinHashBTree> = None;
let mut search_sig = Signature::from_path(&filename)
.unwrap_or_else(|_| panic!("Error processing {:?}", filename))
.swap_remove(0);
if let Some(sketch) = search_sig.select_sketch(&template) {
if let Sketch::MinHash(mh) = sketch {
search_mh = Some(mh.clone().into());
}
}
let mut search_mh = search_mh.unwrap_or_else(|| {
panic!(
"Unable to load a sketch from {:?} matching the provided template: {:?}",
filename, &template
)
});
let mut search_sig = Signature::from_path(&filename)?.swap_remove(0);

let mut search_mh = select_and_downsample(&search_sig, &template)
.with_context(|| format!("Unable to load a sketch from {:?}", &filename,))?;
// remove the hashes
search_mh.remove_many(&hashes_to_remove).unwrap();
search_mh.remove_many(&hashes_to_remove)?;

// save to output dir
let mut path = outdir.clone();
path.push(filename.file_name().unwrap());
path.push(
filename
.file_name()
.ok_or_else(|| anyhow!("Error converting filename"))?,
);

let mut out = BufWriter::new(File::create(path).unwrap());
let mut out = BufWriter::new(File::create(path)?);
search_sig.reset_sketches();
search_sig.push(Sketch::LargeMinHash(search_mh));
serde_json::to_writer(&mut out, &[search_sig]).unwrap();
});
serde_json::to_writer(&mut out, &[search_sig])?;

Ok(())
Ok(())
})
}

fn select_and_downsample(search_sig: &Signature, template: &Sketch) -> Result<KmerMinHashBTree> {
let mut search_mh: Option<KmerMinHashBTree> = None;
if let Sketch::MinHash(template) = template {
for sketch in search_sig.sketches() {
if let Sketch::MinHash(mh) = sketch {
if mh.check_compatible(&template).is_ok() {
search_mh = Some(mh.into());
} else if mh.ksize() == template.ksize()
&& mh.hash_function() == template.hash_function()
&& mh.seed() == template.seed()
&& mh.scaled() < template.scaled()
{
search_mh = Some(mh.downsample_max_hash(template.max_hash())?.into());
}
}
}
}

search_mh.ok_or_else(|| anyhow!("No sketch matching the provided template: {:?}", &template))
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
Expand Down
29 changes: 29 additions & 0 deletions tests/subtract_cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,32 @@ fn subtract_protein_from_dayhoff() -> Result<(), Box<dyn std::error::Error>> {

Ok(())
}

#[test]
fn dayhoff_downsample() -> Result<(), Box<dyn std::error::Error>> {
let mut cmd = Command::cargo_bin("subtract")?;

let tmp_dir = TempDir::new()?;

let mut file = NamedTempFile::new()?;
writeln!(file, "data/bat2-LU__AAACCTGAGCCACGCT.sig")?;

cmd.arg("data/bat2-LU__AAACCTGAGCCACGCT-s100.sig")
.arg(file.path())
.args(&["-k", "42"])
.args(&["-s", "100"])
.args(&["-e", "dayhoff"])
.args(&["-o", tmp_dir.path().to_str().unwrap()])
.assert()
.success();

let path = tmp_dir
.path()
.join("42")
.join("bat2-LU__AAACCTGAGCCACGCT.sig");
assert!(path.exists());
let mh = &Signature::from_path(path)?.swap_remove(0).sketches()[0];
assert_eq!(mh.size(), 0);

Ok(())
}

0 comments on commit e9603f3

Please sign in to comment.