From ce6957159518deb3480bdaf56067ac9b323e474a Mon Sep 17 00:00:00 2001 From: Vince Buffalo Date: Tue, 20 Feb 2024 12:05:55 -0800 Subject: [PATCH] GenomicRangeRecordUnwrappable trait added, some renaming, better CLI ergo. - Changed --seqlens to --genome. - Added short arguments. - Added skip_missing to skip over ranges not in the genome file. - skip_missing required new filtering, so introduced the `GenomicRangeRecordUnwrappable` trait. This implements the `try_unwrap_data()` method (which was removed from `BedlikeIterator`. The previous method in `BedlikeIterator` returned `impl Iterator` which since it isn't a concrete type, caused lots of ergonomic issues. - Introduced new `UnwrappedRanges` type for the above. - Removed `into_variants()`. - Implemented `retain_seqnames()` and `exclude_seqnames()` for `UnwrappedRanges`. --- benches/bedtools_comparison.rs | 4 +- src/commands.rs | 64 ++++++++++--- src/error.rs | 2 +- src/io/parsers.rs | 162 ++++++++++++++++++--------------- src/lib.rs | 5 +- src/main/mod.rs | 80 +++++++++------- src/traits.rs | 26 +++++- tests/bedtools_validation.rs | 9 +- 8 files changed, 221 insertions(+), 131 deletions(-) diff --git a/benches/bedtools_comparison.rs b/benches/bedtools_comparison.rs index 9b3bd30..0a3cdf2 100644 --- a/benches/bedtools_comparison.rs +++ b/benches/bedtools_comparison.rs @@ -41,7 +41,7 @@ fn bench_range_adjustment(c: &mut Criterion) { b.iter(|| { let granges_output = Command::new(granges_binary_path()) .arg("adjust") - .arg("--seqlens") + .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--both") .arg("10") @@ -86,7 +86,7 @@ fn bench_filter_adjustment(c: &mut Criterion) { b.iter(|| { let granges_output = Command::new(granges_binary_path()) .arg("filter") - .arg("--seqlens") + .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--left") .arg(&random_bedfile_left) diff --git a/src/commands.rs b/src/commands.rs index 5ab3948..bc9b2f1 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -77,7 +77,7 @@ pub fn granges_adjust( // we know that the records *do* have data. Unwrapping the Option // values means that writing to TSV doesn't have to deal with this (which // always creates headaches). - let gr = GRanges::from_iter(iter.try_unwrap_data()?, &genome)?; + let gr = GRanges::from_iter(iter.try_unwrap_data(), &genome)?; gr.adjust_ranges(-both, both).to_tsv(output)? } GenomicRangesParser::Unsupported => { @@ -94,9 +94,11 @@ pub fn granges_filter( left_path: &PathBuf, right_path: &PathBuf, output: Option<&PathBuf>, + skip_missing: bool, sort: bool, ) -> Result, GRangesError> { let genome = read_seqlens(seqlens)?; + let seqnames: Vec = genome.keys().cloned().collect(); let left_iter = GenomicRangesFile::parsing_iterator(left_path)?; let right_iter = GenomicRangesFile::parsing_iterator(right_path)?; @@ -106,19 +108,38 @@ pub fn granges_filter( match (left_iter, right_iter) { (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bed3(right)) => { - let left_gr = GRangesEmpty::from_iter(left, &genome)?; - let right_gr = GRangesEmpty::from_iter(right, &genome)?; + let left_gr; + let right_gr; + + if skip_missing { + left_gr = GRangesEmpty::from_iter(left.retain_seqnames(&seqnames), &genome)?; + right_gr = GRangesEmpty::from_iter(right.retain_seqnames(&seqnames), &genome)?; + } else { + left_gr = GRangesEmpty::from_iter(left, &genome)?; + right_gr = GRangesEmpty::from_iter(right, &genome)?; + } let right_gr = right_gr.into_coitrees()?; - let mut intersection = left_gr.filter_overlaps(&right_gr)?; + let intersection = left_gr.filter_overlaps(&right_gr)?; intersection.to_tsv(output)?; Ok(CommandOutput::new((), report)) } (GenomicRangesParser::Bed3(left), GenomicRangesParser::Bedlike(right)) => { - let left_gr = GRangesEmpty::from_iter(left, &genome)?; - let right_gr = GRanges::from_iter(right.try_unwrap_data()?, &genome)?; + let left_gr; + let right_gr; + + if skip_missing { + left_gr = GRangesEmpty::from_iter(left.retain_seqnames(&seqnames), &genome)?; + right_gr = GRanges::from_iter( + right.try_unwrap_data().retain_seqnames(&seqnames), + &genome, + )?; + } else { + left_gr = GRangesEmpty::from_iter(left, &genome)?; + right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?; + } let right_gr = right_gr.into_coitrees()?; @@ -128,8 +149,17 @@ pub fn granges_filter( Ok(CommandOutput::new((), report)) } (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bed3(right)) => { - let left_gr = GRanges::from_iter(left.try_unwrap_data()?, &genome)?; - let right_gr = GRangesEmpty::from_iter(right, &genome)?; + let left_gr; + let right_gr; + + if skip_missing { + left_gr = + GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?; + right_gr = GRangesEmpty::from_iter(right.retain_seqnames(&seqnames), &genome)?; + } else { + left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?; + right_gr = GRangesEmpty::from_iter(right, &genome)?; + } let right_gr = right_gr.into_coitrees()?; @@ -139,8 +169,20 @@ pub fn granges_filter( Ok(CommandOutput::new((), report)) } (GenomicRangesParser::Bedlike(left), GenomicRangesParser::Bedlike(right)) => { - let left_gr = GRanges::from_iter(left.try_unwrap_data()?, &genome)?; - let right_gr = GRanges::from_iter(right.try_unwrap_data()?, &genome)?; + let left_gr; + let right_gr; + + if skip_missing { + left_gr = + GRanges::from_iter(left.try_unwrap_data().retain_seqnames(&seqnames), &genome)?; + right_gr = GRanges::from_iter( + right.try_unwrap_data().retain_seqnames(&seqnames), + &genome, + )?; + } else { + left_gr = GRanges::from_iter(left.try_unwrap_data(), &genome)?; + right_gr = GRanges::from_iter(right.try_unwrap_data(), &genome)?; + } let right_gr = right_gr.into_coitrees()?; @@ -163,7 +205,7 @@ pub fn granges_random_bed( // get the genome info let genome = read_seqlens(seqlens)?; - let mut gr = random_granges(&genome, num.try_into().unwrap())?; + let mut gr = random_granges(&genome, num)?; if sort { gr = gr.sort(); diff --git a/src/error.rs b/src/error.rs index e59f4cb..09382ab 100644 --- a/src/error.rs +++ b/src/error.rs @@ -34,7 +34,7 @@ pub enum GRangesError { InvalidGenomicRange(Position, Position), #[error("Range [{0}, {1}] is invalid for sequence of length {2}")] InvalidGenomicRangeForSequence(Position, Position, Position), - #[error("Sequence name '{0}' is not the ranges container")] + #[error("Sequence name '{0}' is not in the ranges container")] MissingSequence(String), #[error("Error encountered in genomap::GenomeMap")] GenomeMapError(#[from] GenomeMapError), diff --git a/src/io/parsers.rs b/src/io/parsers.rs index 29a38b4..bbad5bb 100644 --- a/src/io/parsers.rs +++ b/src/io/parsers.rs @@ -77,7 +77,7 @@ use std::path::{Path, PathBuf}; use crate::error::GRangesError; use crate::io::file::InputFile; use crate::ranges::{GenomicRangeEmptyRecord, GenomicRangeRecord}; -use crate::traits::GeneralRangeRecordIterator; +use crate::traits::{GeneralRangeRecordIterator, GenomicRangeRecordUnwrappable}; use crate::Position; /// Get the *base* extension to help infer filetype, which ignores compression-related @@ -365,37 +365,10 @@ impl BedlikeIterator { self.number_columns() == 3 } - /// Try to unwrap each [`GenomicRangeRecord>`] iterator item into a - /// [`GenomicRangeRecord`] iterator item. - /// - /// This will raise errors if: - /// 1. The detected number of columns is < 4 (i.e. there appears to be no data to unwrap). - /// 2. During iteration, a `None` data element is encountered. - pub fn try_unwrap_data( - self, - ) -> Result, GRangesError>>, GRangesError> - { - if self.number_columns() < 4 { - return Err(GRangesError::TooFewColumns)?; - } - Ok(self.iter.map(|result| { - result.and_then(|record| { - if let Some(data) = record.data { - Ok(GenomicRangeRecord::new( - record.seqname, - record.start, - record.end, - data, - )) - } else { - Err(GRangesError::TryUnwrapDataError) - } - }) - })) - } - /// Drop the data in each [`GenomicRangeRecord>`] iterator, converting it to a range-only /// [`GenomicRangeRecord<()>`] iterator item. + // TODO: candidate for trait? the impl Iterator isn't a concrete type and can be cumbersome + // in terms of ergonomics. See UnwrappedRanges for an example. pub fn drop_data(self) -> impl Iterator, GRangesError>> { self.iter.map(|result| { result @@ -410,37 +383,6 @@ impl BedlikeIterator { .unwrap_or_else(Err) // pass through parsing errors }) } - - /// Try to convert the iterator into one of two variants: one with data and one without. - pub fn into_variant(self) -> Result { - let number_columns = self.number_columns(); - - if number_columns == 3 { - let without_data_iterator = self.drop_data().map(|result| { - result.map(|record| GenomicRangeRecord { - seqname: record.seqname, - start: record.start, - end: record.end, - data: (), - }) - }); - Ok(GenomicRangesIteratorVariant::Empty(Box::new( - without_data_iterator, - ))) - } else { - let with_data_iterator = self.try_unwrap_data()?.map(|result| { - result.map(|record| GenomicRangeRecord { - seqname: record.seqname, - start: record.start, - end: record.end, - data: record.data, - }) - }); - Ok(GenomicRangesIteratorVariant::WithData(Box::new( - with_data_iterator, - ))) - } - } } impl Iterator for BedlikeIterator { @@ -465,7 +407,7 @@ impl Iterator for BedlikeIterator { /// /// let iter = Bed3Iterator::new("tests_data/example.bed") /// .expect("error reading file") -/// .exclude_seqnames(vec!["chr1".to_string()]); +/// .exclude_seqnames(&vec!["chr1".to_string()]); /// /// let seqlens = seqlens! { "chr1" => 22, "chr2" => 10, "chr3" => 10, "chr4" => 15 }; /// let gr = GRangesEmpty::from_iter(iter, &seqlens) @@ -479,6 +421,7 @@ impl Iterator for BedlikeIterator { /// assert_eq!(iter.next().unwrap().end, 15); /// assert_eq!(iter.next(), None); /// ``` +#[derive(Debug)] pub struct FilteredRanges where I: Iterator>, @@ -494,11 +437,11 @@ where { pub fn new( inner: I, - retain_seqnames: Option>, - exclude_seqnames: Option>, + retain_seqnames: Option<&Vec>, + exclude_seqnames: Option<&Vec>, ) -> Self { - let retain_seqnames = retain_seqnames.map(HashSet::from_iter); - let exclude_seqnames = exclude_seqnames.map(HashSet::from_iter); + let retain_seqnames = retain_seqnames.cloned().map(HashSet::from_iter); + let exclude_seqnames = exclude_seqnames.cloned().map(HashSet::from_iter); Self { inner, retain_seqnames, @@ -575,18 +518,93 @@ where } } -impl GeneralRangeRecordIterator for Bed3Iterator { +impl GeneralRangeRecordIterator>> for BedlikeIterator { fn retain_seqnames( self, - seqnames: Vec, - ) -> FilteredRanges { - FilteredRanges::new(self, Some(seqnames), None) + seqnames: &[String], + ) -> FilteredRanges>> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) } fn exclude_seqnames( self, - seqnames: Vec, + seqnames: &[String], + ) -> FilteredRanges>> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator for Bed3Iterator { + fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames( + self, + seqnames: &[String], ) -> FilteredRanges { - FilteredRanges::new(self, None, Some(seqnames)) + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +impl GeneralRangeRecordIterator> for UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + fn retain_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, Some(&seqnames.to_vec()), None) + } + fn exclude_seqnames( + self, + seqnames: &[String], + ) -> FilteredRanges> { + FilteredRanges::new(self, None, Some(&seqnames.to_vec())) + } +} + +#[derive(Debug)] +pub struct UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + inner: I, +} + +impl UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + pub fn new(inner: I) -> Self { + Self { inner } + } +} + +impl Iterator for UnwrappedRanges +where + I: Iterator>, GRangesError>>, +{ + type Item = Result, GRangesError>; + + fn next(&mut self) -> Option { + if let Some(result) = self.inner.next() { + return Some(result.and_then(|record| match record.data { + Some(data) => Ok(GenomicRangeRecord::new( + record.seqname, + record.start, + record.end, + data, + )), + None => Err(GRangesError::TryUnwrapDataError), + })); + } + None + } +} + +impl GenomicRangeRecordUnwrappable for BedlikeIterator { + fn try_unwrap_data(self) -> UnwrappedRanges { + UnwrappedRanges::new(self) } } diff --git a/src/lib.rs b/src/lib.rs index d290463..d3d5644 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -32,8 +32,9 @@ pub mod prelude { pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed}; pub use crate::traits::{ - AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenomicRangesTsvSerialize, - IndexedDataContainer, IntoIterableRangesContainer, IterableRangeContainer, TsvSerialize, + AsGRangesRef, GeneralRangeRecordIterator, GenericRange, GenomicRangeRecordUnwrappable, + GenomicRangesTsvSerialize, IndexedDataContainer, IntoIterableRangesContainer, + IterableRangeContainer, TsvSerialize, }; pub use crate::seqlens; diff --git a/src/main/mod.rs b/src/main/mod.rs index 151f45e..891faec 100644 --- a/src/main/mod.rs +++ b/src/main/mod.rs @@ -10,15 +10,19 @@ use granges::{ #[cfg(feature = "dev-commands")] use granges::commands::granges_random_bed; -const INFO: &str = "\ +const INFO: &str = r#" granges: genomic range operations built off of the GRanges library usage: granges [--help] Subcommands: - adjust: adjust each genomic range, e.g. to add a kilobase to each end. - -"; + adjust: Adjust each genomic range, e.g. to add a kilobase to each end. + + filter: Filter the left ranges based on whether they have at least one + overlap with a right range. This is equivalent to a filtering + "semi-join" in SQL terminology. + +"#; #[derive(Parser)] #[clap(name = "granges")] @@ -34,64 +38,69 @@ struct Cli { #[derive(Subcommand)] enum Commands { Adjust { - /// a TSV genome file of chromosome names and their lengths - #[arg(long, required = true)] - seqlens: PathBuf, + /// A TSV genome file of chromosome names and their lengths + #[arg(short, long, required = true)] + genome: PathBuf, - /// an input BED-like TSV file + /// An input BED-like TSV file #[arg(required = true)] bedfile: PathBuf, - /// number of basepairs to expand the range start and end positions by - #[arg(long)] + /// Number of basepairs to expand the range start and end positions by + #[arg(short, long)] both: PositionOffset, - /// an optional output file (standard output will be used if not specified) - #[arg(long)] + /// An optional output file (standard output will be used if not specified) + #[arg(short, long)] output: Option, - /// sort the ranges after adjusting their start and end positions - #[arg(long)] + /// Sort the ranges after adjusting their start and end positions + #[arg(short, long)] sort: bool, }, Filter { - /// a TSV genome file of chromosome names and their lengths - #[arg(long, required = true)] - seqlens: PathBuf, + /// A TSV genome file of chromosome names and their lengths + #[arg(short, long, required = true)] + genome: PathBuf, - /// the "left" BED-like TSV file - #[arg(long, required = true)] + /// The "left" BED-like TSV file + #[arg(short, long, required = true)] left: PathBuf, - /// the "right" BED-like TSV file - #[arg(long, required = true)] + /// The "right" BED-like TSV file + #[arg(short, long, required = true)] right: PathBuf, - /// an optional output file (standard output will be used if not specified) - #[arg(long)] + /// An optional output file (standard output will be used if not specified) + #[arg(short, long)] output: Option, - /// sort the ranges after adjusting their start and end positions - #[arg(long)] + /// Sort the ranges after adjusting their start and end positions + #[arg(short, long)] sort: bool, + + /// Skip ranges from sequences (e.g. chromosomes) not present in the genome file. + /// By default, ranges with sequence names not in the genome file will raise an error. + #[arg(short = 'f', long)] + skip_missing: bool, }, #[cfg(feature = "dev-commands")] RandomBed { /// a TSV genome file of chromosome names and their lengths #[arg(required = true)] - seqlens: PathBuf, + genome: PathBuf, /// number of random ranges to generate - #[arg(long, required = true)] + #[arg(short, long, required = true)] num: usize, /// an optional output file (standard output will be used if not specified) - #[arg(long)] + #[arg(short, long)] output: Option, /// sort the ranges - #[arg(long)] + #[arg(short, long)] sort: bool, }, } @@ -101,25 +110,26 @@ fn run() -> Result<(), GRangesError> { let result = match &cli.command { Some(Commands::Adjust { bedfile, - seqlens, + genome, both, output, sort, - }) => granges_adjust(bedfile, seqlens, *both, output.as_ref(), *sort), + }) => granges_adjust(bedfile, genome, *both, output.as_ref(), *sort), Some(Commands::Filter { - seqlens, + genome, left, right, output, sort, - }) => granges_filter(seqlens, left, right, output.as_ref(), *sort), + skip_missing, + }) => granges_filter(genome, left, right, output.as_ref(), *skip_missing, *sort), #[cfg(feature = "dev-commands")] Some(Commands::RandomBed { - seqlens, + genome, num, output, sort, - }) => granges_random_bed(seqlens, *num, output.as_ref(), *sort), + }) => granges_random_bed(genome, *num, output.as_ref(), *sort), None => { println!("{}\n", INFO); std::process::exit(1); diff --git a/src/traits.rs b/src/traits.rs index e46dafc..3e7325e 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -3,7 +3,13 @@ use std::path::PathBuf; -use crate::{error::GRangesError, granges::GRanges, io::parsers::FilteredRanges, Position}; +use crate::{ + error::GRangesError, + granges::GRanges, + io::parsers::{FilteredRanges, UnwrappedRanges}, + ranges::GenomicRangeRecord, + Position, +}; /// Traits for [`GRanges`] types that can be modified. //pub trait GenomicRangesModifiableRanges { @@ -86,8 +92,22 @@ pub trait DataContainer {} pub trait GeneralRangeRecordIterator: Iterator> + Sized { - fn retain_seqnames(self, seqnames: Vec) -> FilteredRanges; - fn exclude_seqnames(self, seqnames: Vec) -> FilteredRanges; + fn retain_seqnames(self, seqnames: &[String]) -> FilteredRanges; + fn exclude_seqnames(self, seqnames: &[String]) -> FilteredRanges; +} + +/// [`GenomicRangeRecordUnwrappable`] defines functionality for unwrapping +/// values from some sort of iterator over [`Result`] (to handle e.g. parsing errors) +/// containing [`GenomicRangeRecord>`], turning them into +/// [`GenomicRangeRecord`] (i.e. "unwrapping" them). +pub trait GenomicRangeRecordUnwrappable: + Iterator>, GRangesError>> + Sized +{ + /// Try unwrapping a [`GenomicRangeRecord>`] into a + /// [`GenomicRangeRecord`], raising a [`GRangesError::TryUnwrapDataError`] if + /// if a `None` is encountered. This is used in lazy parsing when we expect additional + /// data to be parsed, but there isn't any. + fn try_unwrap_data(self) -> UnwrappedRanges; } /// The [`IterableRangeContainer`] trait defines common functionality for iterating over diff --git a/tests/bedtools_validation.rs b/tests/bedtools_validation.rs index bc28518..93cc157 100644 --- a/tests/bedtools_validation.rs +++ b/tests/bedtools_validation.rs @@ -5,8 +5,7 @@ use granges::{ prelude::GenomicRangesFile, test_utilities::{granges_binary_path, random_bedfile, temp_bedfile}, }; -use std::{path::Path, process::Command}; -use tempfile::{Builder, NamedTempFile}; +use std::process::Command; #[test] fn test_random_bed3file_filetype_detect() { @@ -54,7 +53,7 @@ fn test_against_bedtools_slop() { let granges_output = Command::new(granges_binary_path()) .arg("adjust") - .arg("--seqlens") + .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--both") .arg(width.to_string()) @@ -74,7 +73,7 @@ fn test_against_bedtools_slop() { /// Test bedtools intersect -a -b -wa -u /// against -/// granges filter --seqlens --left --right +/// granges filter --genome --left --right #[test] fn test_against_bedtools_intersect_wa() { let num_ranges = 1_000_000; @@ -109,7 +108,7 @@ fn test_against_bedtools_intersect_wa() { let granges_output = Command::new(granges_binary_path()) .arg("filter") - .arg("--seqlens") + .arg("--genome") .arg("tests_data/hg38_seqlens.tsv") .arg("--left") .arg(&random_bedfile_left)