Skip to content

Commit

Permalink
Sequences::region_apply_into_granges(), `GRanges::map_into_array*()…
Browse files Browse the repository at this point in the history
…` added, etc.

 - New `Sequences::region_apply_into_granges()`, which applies a function to
   regions of a sequence, and puts the results in a new `GRanges`.
 - New `Sequences` test cases corresponding to `granges_test_case_01()`,
   and lots of new `Sequence`-related tests.
 - Added `map_into_array1()` and `map_into_array2()` for the
   --feature=ndarray, with tests.
 - `take_ranges()` and `take_both()` added.
 - New `GRanges::midpoints()` method, with tests.
 - New `GRanges::data_indices()` method, to build a `GenomeMap` out of
   the data indices.
 - New `GRanges::data_by_seqnames()` which builds a `GenomeMap` of
   `Vec<U>` of the data, by sequence name. `GRanges::data_refs_by_seqnames()`
   also added, for references.
 - `GRanges::into_vecranegs()` added, with `From` method
   for `COITreesEmpty` to `VecRangesEmpty` added since it's needed.
 - Cleaned lifetime out of `IndexedDataContainer`, brought down
   to assoc. type level. Cleans up things a lot!
 - Remove section on readme on slow benchmarks -- this is now
   GH issue #4.
  • Loading branch information
vsbuffalo committed Feb 28, 2024
1 parent f144ab4 commit 114613d
Show file tree
Hide file tree
Showing 22 changed files with 625 additions and 170 deletions.
20 changes: 0 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,23 +110,3 @@ windows 418.87 s 65.13 s 84.4515
map_median 112.35 s 82.81 s 26.2995
```

Note however, there is a arithmetic scaling issue. Larger range datasets
(1,000,000) lead to map operations that are inefficient. This is almost surely
due to `data/operations.rs`. Here is an example benchmark:

```
command bedtools time granges time granges speedup (%)
------------ --------------- -------------- ---------------------
map_multiple 26.89 min 688.60 s 57.3189
map_max 21.34 min 21.54 min -0.94889 *
adjust 21.35 min 499.54 s 60.9959
filter 27.01 min 20.38 min 24.5583
map_min 935.69 s 17.48 min -12.1008 *
flank 30.26 min 943.36 s 48.0353
map_mean 20.13 min 931.21 s 22.913
map_sum 17.99 min 18.44 min -2.5225 *
windows 507.84 s 84.79 s 83.304
map_median 16.90 min 17.81 min -5.36811 *
```

So median, sum, min, and max are slower.
2 changes: 1 addition & 1 deletion benches/bedtools_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ fn bench_map(c: &mut Criterion) {
let mut group = c.benchmark_group(format!("map_{}", operation).to_string());

// configure the sample size for the group
group.sample_size(10);
group.sample_size(100);
group.bench_function("bedtools_map", |b| {
b.iter(|| {
let bedtools_output_file = File::create(&bedtools_path).unwrap();
Expand Down
4 changes: 1 addition & 3 deletions examples/mean_score.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,7 @@ fn try_main() -> Result<(), granges::error::GRangesError> {
.map_data(|bed5_cols| bed5_cols.score)?;

// Compute overlaps and combine scores into mean.
let results_gr = left_gr
.left_overlaps(&right_gr)?
.map_joins(mean_score)?;
let results_gr = left_gr.left_overlaps(&right_gr)?.map_joins(mean_score)?;

results_gr.to_tsv(None::<String>, &BED_TSV)?;
Ok(())
Expand Down
8 changes: 5 additions & 3 deletions src/data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,14 @@
use crate::traits::{DataContainer, IntoDatumType};

#[cfg(feature = "ndarray")]
pub mod ndarray;
pub mod operations;
pub mod vec;

impl<U> DataContainer for Vec<U> {}
impl DataContainer for () {}

/// These are core supported data types stored in an `enum`, to
/// unify the types that come out of standard operations like
/// `select()`.
Expand Down Expand Up @@ -67,6 +72,3 @@ impl<T: IntoDatumType> From<T> for DatumType {
item.into_data_type()
}
}

impl<U> DataContainer for Vec<U> {}
impl DataContainer for () {}
114 changes: 103 additions & 11 deletions src/data/ndarray.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,88 @@
//! Data container implementations for [`ndarray::Array1`] and [`ndarray::Array2`].
use crate::error::GRangesError;
use crate::granges::GRanges;
use crate::traits::{DataContainer, IndexedDataContainer, RangeContainer};
use ndarray::{Array1, Array2, ArrayView1};
use crate::traits::IndexedDataContainer;

impl<'a, U> IndexedDataContainer<'a> for Array1<U>
impl<T> DataContainer for Array1<T> {}
impl<T> DataContainer for Array2<T> {}

impl<C, T> GRanges<C, T>
where
C: RangeContainer,
{
/// Take the container data into an iterator, apply a function, a put the results in an
/// [`Array1`].
///
/// [`Array1`]: ndarray::Array1
pub fn map_into_array1<F, U>(mut self, func: F) -> Result<GRanges<C, Array1<U>>, GRangesError>
where
F: FnMut(<T as IntoIterator>::Item) -> U,
T: IntoIterator,
{
let total_len = self.len();
let (ranges, data) = self.take_both()?;

let transformed_data = data.into_iter().map(func).collect::<Array1<U>>();

assert_eq!(
transformed_data.len(),
total_len,
"Transformed data has different length than GRanges!"
);

Ok(GRanges {
ranges,
data: Some(transformed_data),
})
}

/// Take the container data into an iterator, apply a function, a put the results in an
/// [`Array2`].
///
/// [`Array2`]: ndarray::Array2
pub fn map_into_array2<F, U>(
mut self,
ncols: usize,
func: F,
) -> Result<GRanges<C, Array2<U>>, GRangesError>
where
F: FnMut(<T as IntoIterator>::Item) -> Vec<U>,
T: IntoIterator,
{
let total_len = self.len();
let (ranges, data) = self.take_both()?;

let flat_vec: Vec<U> = data.into_iter().flat_map(func).collect();
let transformed_data = Array2::from_shape_vec((total_len, ncols), flat_vec)?;

assert_eq!(
transformed_data.shape()[0],
total_len,
"Transformed data has different length than GRanges!"
);

Ok(GRanges {
ranges,
data: Some(transformed_data),
})
}
}

impl<U> IndexedDataContainer for Array1<U>
where
U: Copy + Default + 'a,
U: Copy + Default + 'static,
{
type Item = U;
type Item<'a> = U;
type OwnedItem = U;
type Output = Array1<U>;

fn get_value(&'a self, index: usize) -> Self::Item {
fn get_value(&self, index: usize) -> Self::Item<'_> {
self[index]
}

fn get_owned(&'a self, index: usize) -> <Self as IndexedDataContainer>::OwnedItem {
fn get_owned(&self, index: usize) -> <Self as IndexedDataContainer>::OwnedItem {
// already owned
self[index]
}
Expand All @@ -33,19 +100,19 @@ where
}
}

impl<'a, U> IndexedDataContainer<'a> for Array2<U>
impl<U> IndexedDataContainer for Array2<U>
where
U: Copy + Default + 'a,
U: Copy + Default + 'static,
{
type Item = ArrayView1<'a, U>;
type Item<'a> = ArrayView1<'a, U>;
type OwnedItem = Array1<U>;
type Output = Array2<U>;

fn get_value(&'a self, index: usize) -> Self::Item {
fn get_value(&self, index: usize) -> Self::Item<'_> {
self.row(index)
}

fn get_owned(&'a self, index: usize) -> <Self as IndexedDataContainer>::OwnedItem {
fn get_owned(&self, index: usize) -> <Self as IndexedDataContainer>::OwnedItem {
self.row(index).to_owned()
}

Expand All @@ -71,3 +138,28 @@ where
.expect("Shape and collected data size mismatch")
}
}

#[cfg(test)]
mod tests {
use crate::test_utilities::granges_test_case_01;

#[test]
fn test_map_into_array1() {
let gr = granges_test_case_01();
let new_gr = gr.map_into_array1(|x| x + 1.0).unwrap();
// note: 5.0, because 5 ranges each 1 added to
assert!((new_gr.data().unwrap().sum() - (24.1 + 5.0)).abs() < 1e-4);
}

#[test]
fn test_map_into_array2() {
let gr = granges_test_case_01();
let new_gr = gr.map_into_array2(2, |x| vec![x + 1.0, 1.0]).unwrap();
// note: 5.0, because 5 ranges each 1 added to
let array2 = new_gr.data().unwrap();
let first_col_sum = array2.sum_axis(ndarray::Axis(0))[0];
let second_col_sum = array2.sum_axis(ndarray::Axis(0))[1];
assert!((first_col_sum - (24.1 + 5.0)).abs() < 1e-4);
assert!((second_col_sum - (5.0)).abs() < 1e-4);
}
}
8 changes: 8 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ pub enum GRangesError {
#[error("The GRanges object is invalid because it lacks an associated data container. Ensure data is loaded or associated with the GRanges object before attempting operations that require data.")]
NoDataContainer,

#[error("The supplied GRanges object and data container cannot be united into a new GRanges since they have differing lengths.")]
IncompatableGRangesAndData,

// Sequences related errors
#[error("The sequence name '{0}' was not found in the dataset. Verify the sequence names and try again.")]
MissingSequenceName(String),
Expand All @@ -159,4 +162,9 @@ pub enum GRangesError {

#[error("No operation was specified. See granges map --help.")]
NoOperationSpecified,

// ndarray related errors
#[cfg(feature = "ndarray")]
#[error("Invalid shape encountered by ndarray: {0}")]
InvalidNdarrayShape(#[from] ndarray::ShapeError),
}
Loading

0 comments on commit 114613d

Please sign in to comment.