Skip to content

Commit

Permalink
New csv + serde parsing, with cleaner type inference; new `TsvRecordI…
Browse files Browse the repository at this point in the history
…terator`.

 - Rename of `map_over_joins()` to `map_joins()`.
 - New IO benchmark of our (old) `Bed5Iterator` vs csv + serde.
 - Reorganized `io::parsers` into different modules.
 - `deserialize_bed_missing()` and `deserialize_option_generic()` for
   missing value to `None`.
 - All new `Bed3Iterator`, `Bed5Iterator`, and `BedlikeIterator`
 - New `TsvRecordIterator` built on csv + serde.
  • Loading branch information
vsbuffalo committed Feb 27, 2024
1 parent 181046b commit 81226a9
Show file tree
Hide file tree
Showing 18 changed files with 576 additions and 553 deletions.
7 changes: 6 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ bytes = "1.5.0"
ndarray-npy = { version = "0.8.1", optional = true }
num-traits = "0.2.18"
lazy_static = "1.4.0"
medians = "3.0.6"
csv = "1.3.0"
serde = { version = "1.0.197", features = ["derive"] }

[features]
# cli = [ "clap" ] # TODO make feature
Expand All @@ -54,3 +55,7 @@ criterion = { version = "0.5.1", features = ["html_reports"] }
name = "bedtools_comparison"
harness = false

[[bench]]
name = "io"
harness = false

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ let results_gr = left_gr
// Find overlaps
.left_overlaps(&right_gr)?
// Summarize overlap data
.map_over_joins(mean_score)?;
.map_joins(mean_score)?;
```

where `mean_score()` is:
Expand Down
66 changes: 66 additions & 0 deletions benches/io.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
use criterion::{criterion_group, criterion_main, Criterion};
use csv::{self, ReaderBuilder};
use granges::io::parsers::Bed5Addition;
use granges::ranges::GenomicRangeRecord;
use granges::test_utilities::random_bed5file;
use granges::{prelude::*, Position};

#[derive(Debug, serde::Deserialize, PartialEq)]
struct Bed5Row {
seqname: String,
start: Position,
end: Position,
name: String,
score: f64,
}

const BED_LENGTH: usize = 1_000_000;

fn bench_io_shootout(c: &mut Criterion) {
// create the benchmark group
let mut group = c.benchmark_group("adjust");

// create the test data
let input_bedfile = random_bed5file(BED_LENGTH);

let genome = read_seqlens("tests_data/hg38_seqlens.tsv").unwrap();

// configure the sample size for the group
group.sample_size(10);

// Bed5Iterator
group.bench_function("bed5iterator", |b| {
b.iter(|| {
let iter = Bed5Iterator::new(input_bedfile.path()).unwrap();
let gr = GRanges::from_iter(iter, &genome).unwrap();
gr.len()
});
});

// CSV
group.bench_function("csv", |b| {
b.iter(|| {
let mut rdr = ReaderBuilder::new()
.delimiter(b'\t')
.has_headers(false)
.from_path(input_bedfile.path())
.unwrap();

let iter = rdr.deserialize();
let mut gr: GRanges<VecRangesIndexed, Vec<Bed5Addition>> = GRanges::new_vec(&genome);

for result in iter {
let row: GenomicRangeRecord<Bed5Addition> = result.unwrap();
let data = Bed5Addition {
name: row.data.name,
score: row.data.score,
};
gr.push_range(&row.seqname, row.start, row.end, data)
.unwrap();
}
});
});
}

criterion_group!(benches, bench_io_shootout,);
criterion_main!(benches);
21 changes: 11 additions & 10 deletions examples/mean_score.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
use granges::{prelude::*, join::CombinedJoinDataLeftEmpty};
use granges::{join::CombinedJoinDataLeftEmpty, prelude::*};

// Our overlap data processing function.
pub fn mean_score(join_data: CombinedJoinDataLeftEmpty<Option<f64>>) -> f64 {
// Get the "right data" -- the BED5 scores.
let overlap_scores: Vec<f64> = join_data.right_data.into_iter()
let overlap_scores: Vec<f64> = join_data
.right_data
.into_iter()
// filter out missing values ('.' in BED)
.filter_map(|x| x).collect();
.filter_map(|x| x)
.collect();

// calculate the mean
let score_sum: f64 = overlap_scores.iter().sum();
score_sum / (overlap_scores.len() as f64)
}


fn try_main() -> Result<(), granges::error::GRangesError> {
// Mock sequence lengths (e.g. "genome" file)
let genome = seqlens!("chr1" => 100, "chr2" => 100);
Expand All @@ -30,18 +32,17 @@ fn try_main() -> Result<(), granges::error::GRangesError> {
// Convert to interval trees.
.into_coitrees()?
// Extract out just the score from the additional BED5 columns.
.map_data(|bed5_cols| {
bed5_cols.score
})?;
.map_data(|bed5_cols| bed5_cols.score)?;

// Compute overlaps and combine scores into mean.
let results_gr = left_gr
.left_overlaps(&right_gr)?
.map_over_joins(mean_score)?;
.map_joins(mean_score)?;

results_gr.to_tsv(None::<String>, &BED_TSV)?;
Ok(())
}

fn main() { try_main().unwrap(); }

fn main() {
try_main().unwrap();
}
2 changes: 1 addition & 1 deletion src/commands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ pub fn granges_map(
let left_join_gr = left_gr.left_overlaps(&right_gr)?;

// Process all the overlaps.
let result_gr = left_join_gr.map_over_joins(|join_data| {
let result_gr = left_join_gr.map_joins(|join_data| {
// Get the "right data" -- the BED5 scores
let mut overlap_scores: Vec<f64> = join_data
.right_data
Expand Down
7 changes: 3 additions & 4 deletions src/data/operations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ pub fn median<F: Float + Sum>(numbers: &mut [F]) -> Option<F> {
}
let mid = numbers.len() / 2;
if numbers.len() % 2 == 0 {
numbers.select_nth_unstable_by(mid-1, |a, b| a.partial_cmp(b).unwrap());
let lower = numbers[mid-1];
numbers.select_nth_unstable_by(mid - 1, |a, b| a.partial_cmp(b).unwrap());
let lower = numbers[mid - 1];
numbers.select_nth_unstable_by(mid, |a, b| a.partial_cmp(b).unwrap());
let upper = numbers[mid];
Some((lower + upper) / F::from(2.0).unwrap())
Expand Down Expand Up @@ -93,7 +93,7 @@ impl Operation {

#[cfg(test)]
mod tests {
use super::*;
use super::*;

#[test]
fn test_median_empty() {
Expand All @@ -118,5 +118,4 @@ mod tests {
let mut numbers = vec![-3.0, -1.0, -2.0];
assert_eq!(median(&mut numbers), Some(-2.0));
}

}
3 changes: 3 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ pub enum GRangesError {
#[error("File reading error: {0}. Please check if the file exists and you have permission to read it.")]
IOError(#[from] std::io::Error),

#[error("File parsing error: {0}")]
TsvParsingError(#[from] csv::Error),

// File parsing related errors
#[error("Could not determine the file type based on its extension. Ensure the file has a standard genomic data extension (.bed, .gff, etc.).")]
CouldNotDetectRangesFiletype,
Expand Down
12 changes: 6 additions & 6 deletions src/granges.rs
Original file line number Diff line number Diff line change
Expand Up @@ -806,7 +806,7 @@ where
/// See [`CombinedJoinData`] and its convenience methods, which are designed
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.
pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -847,7 +847,7 @@ impl GRanges<VecRangesIndexed, JoinDataBothEmpty> {
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.
pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -885,7 +885,7 @@ where
/// See [`CombinedJoinDataRightEmpty`] and its convenience methods, which are designed
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.
pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -924,7 +924,7 @@ where
/// to help downstream statistical calculations that could use the number of overlapping
/// basepairs, overlapping fraction, etc.
pub fn map_over_joins<F, V>(
pub fn map_joins<F, V>(
mut self,
func: F,
) -> Result<GRanges<VecRangesIndexed, Vec<V>>, GRangesError>
Expand Down Expand Up @@ -1470,7 +1470,7 @@ mod tests {
}

#[test]
fn test_map_over_joins() {
fn test_map_oins() {
let sl = seqlens!("chr1" => 50);
let windows: GRangesEmpty<VecRangesEmpty> =
GRangesEmpty::from_windows(&sl, 10, None, true).unwrap();
Expand All @@ -1487,7 +1487,7 @@ mod tests {
let joined_results = windows
.left_overlaps(&right_gr)
.unwrap()
.map_over_joins(|join_data| {
.map_joins(|join_data| {
let overlap_scores = join_data.right_data;
overlap_scores.iter().sum::<f64>()
})
Expand Down
4 changes: 2 additions & 2 deletions src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ pub mod tsv;

pub use file::{InputStream, OutputStream};
pub use parsers::{
Bed3Iterator, Bed5Iterator, BedlikeIterator, GenomicRangesFile, GenomicRangesParser,
TsvRecordIterator,
bed::Bed3Iterator, bed::Bed5Iterator, bed::BedlikeIterator, tsv::TsvRecordIterator,
GenomicRangesFile, GenomicRangesParser,
};
pub use tsv::BED_TSV;
Loading

0 comments on commit 81226a9

Please sign in to comment.