diff --git a/rten-tensor/src/copy.rs b/rten-tensor/src/copy.rs
index 88321119..f8477f1d 100644
--- a/rten-tensor/src/copy.rs
+++ b/rten-tensor/src/copy.rs
@@ -171,14 +171,13 @@ fn copy_blocked<T: Clone>(src: Matrix<T>, mut dest: MatrixMut<MaybeUninit<T>>) {
 ///
 /// Returns `dest` as an initialized slice.
 pub fn copy_into_slice<'a, T: Clone>(
-    src: TensorView<T>,
+    mut src: TensorView<T>,
     dest: &'a mut [MaybeUninit<T>],
 ) -> &'a [T] {
     assert!(dest.len() == src.len());
 
     // Merge axes to increase the chance that we can use the fast path and
    // also maximize the iteration count of the innermost loops.
-    let mut src = src.clone();
     src.merge_axes();
 
     if src.ndim() > 4 {
diff --git a/rten-tensor/src/tensor.rs b/rten-tensor/src/tensor.rs
index e5c98ac6..6971944e 100644
--- a/rten-tensor/src/tensor.rs
+++ b/rten-tensor/src/tensor.rs
@@ -117,6 +117,14 @@ pub trait AsView: Layout {
         self.view().broadcast(shape)
     }
 
+    /// Copy elements from this tensor into `dest` in logical order.
+    ///
+    /// Returns the initialized slice. Panics if the length of `dest` does
+    /// not match the number of elements in `self`.
+    fn copy_into_slice<'a>(&self, dest: &'a mut [MaybeUninit<Self::Elem>]) -> &'a [Self::Elem]
+    where
+        Self::Elem: Copy;
+
     /// Return the layout of this tensor as a slice, if it is contiguous.
     fn data(&self) -> Option<&[Self::Elem]>;
 
@@ -1676,6 +1684,22 @@ impl<T, S: Storage<Elem = T>, L: MutLayout + Clone> AsView for TensorBase<S, L> {
         self.view().iter()
     }
 
+    fn copy_into_slice<'a>(&self, dest: &'a mut [MaybeUninit<T>]) -> &'a [T]
+    where
+        T: Copy,
+    {
+        if let Some(data) = self.data() {
+            // Safety: `[T]` and `[MaybeUninit<T>]` have same layout.
+            let src_uninit = unsafe { std::mem::transmute::<&[T], &[MaybeUninit<T>]>(data) };
+            dest.copy_from_slice(src_uninit);
+            // Safety: `copy_from_slice` initializes the whole slice or panics
+            // if there is a length mismatch.
+            unsafe { std::mem::transmute::<&[MaybeUninit<T>], &[T]>(dest) }
+        } else {
+            copy_into_slice(self.as_dyn(), dest)
+        }
+    }
+
     fn data(&self) -> Option<&[Self::Elem]> {
         self.view().data()
     }
@@ -2540,6 +2564,21 @@ mod tests {
         assert_eq!(dest.to_vec(), &[1., 2., 3., 4.]);
     }
 
+    #[test]
+    fn test_copy_into_slice() {
+        let src = NdTensor::from([[1, 2], [3, 4], [5, 6]]);
+        let mut buf = Vec::with_capacity(src.len());
+        let buf_uninit = &mut buf.spare_capacity_mut()[..src.len()];
+
+        // Contiguous case.
+        let elts = src.copy_into_slice(buf_uninit);
+        assert_eq!(elts, &[1, 2, 3, 4, 5, 6]);
+
+        // Non-contiguous case.
+        let transposed_elts = src.transposed().copy_into_slice(buf_uninit);
+        assert_eq!(transposed_elts, &[1, 3, 5, 2, 4, 6]);
+    }
+
     #[test]
     fn test_data() {
         let data = &[1., 2., 3., 4., 5., 6.];
diff --git a/src/ops/reduce.rs b/src/ops/reduce.rs
index ab99fdd3..9206aabf 100644
--- a/src/ops/reduce.rs
+++ b/src/ops/reduce.rs
@@ -3,7 +3,7 @@ use std::cmp::Ordering;
 
 use rten_tensor;
 use rten_tensor::prelude::*;
-use rten_tensor::{DynIndices, NdTensor, NdTensorView, SliceItem, Tensor, TensorView};
+use rten_tensor::{NdTensor, NdTensorView, Tensor, TensorView};
 use rten_vecmath::{vec_sum, vec_sum_square};
 
 use crate::number::{Identities, IsNaN};
@@ -11,8 +11,8 @@ use crate::ops::layout::squeeze_in_place;
 use crate::ops::{
     resolve_axes, resolve_axis, Input, InputList, IntoOpResult, OpError, Operator, OutputList,
 };
-use crate::slice_reductions::{iter_sum, slice_sum};
-use crate::tensor_pool::TensorPool;
+use crate::slice_reductions::slice_sum;
+use crate::tensor_pool::{AutoReturn, TensorPool};
 
 /// Compute the indices of the max elements along an axis, according to a
 /// comparison function `compare`.
@@ -262,28 +262,23 @@ impl Operator for NonZero {
     }
 }
 
-/// Trait for reducing a subset of elements from a tensor to a single value.
-///
-/// This is a trait rather than a closure to support being invoked with
-/// dynamically chosen iterator types.
-trait Reducer<T> {
-    fn reduce<I: ExactSizeIterator<Item = T>>(&self, iter: I) -> T;
-
+/// Kernel that handles reducing a single slice of the input.
+trait ReduceKernel<T> {
     /// Reduce a contiguous slice of values to a single value.
-    fn reduce_slice(&self, slice: &[T]) -> T
-    where
-        T: Copy,
-    {
-        self.reduce(slice.iter().copied())
-    }
+    fn reduce_slice(&self, slice: &[T]) -> T;
 }
 
-fn reduce<T: Copy + Default, R: Reducer<T>>(
+/// Outer loop of reduction operations.
+///
+/// This iterates over slices of the input that are reduced independently and
+/// invokes the kernel on that slice. If the input is not contiguous, the slice
+/// is packed before calling the kernel.
+fn reduce<T: Copy + Default>(
     pool: &TensorPool,
     input: TensorView<T>,
     axes: Option<&[i32]>,
     keep_dims: bool,
-    reducer: R,
+    kernel: &dyn ReduceKernel<T>,
 ) -> Result<Tensor<T>, OpError> {
     let mut resolved_axes = match axes {
         Some(axes) if !axes.is_empty() => resolve_axes(input.ndim(), axes.iter())?,
@@ -291,8 +286,19 @@ fn reduce<T: Copy + Default, R: Reducer<T>>(
     };
     resolved_axes.sort();
 
+    // Allocate temporary buffer where slices of the input to be reduced are
+    // packed first if non-contiguous.
+    let mut tmp_buf = if !input.is_contiguous() {
+        let reduced_slice_len = resolved_axes.iter().map(|&dim| input.size(dim)).product();
+        pool.alloc(reduced_slice_len)
+    } else {
+        Vec::new()
+    }
+    .auto_return(pool);
+
     if input.ndim() == 0 {
-        return Ok(Tensor::from_scalar(reducer.reduce(input.iter().copied())));
+        let item = input.item().unwrap();
+        return Ok(Tensor::from_scalar(kernel.reduce_slice(&[*item])));
     }
 
     // nb. Some reduce operations cannot produce a meaningful result with
@@ -335,41 +341,40 @@ fn reduce<T: Copy + Default, R: Reducer<T>>(
             reduced_data.extend(
                 input_data
                     .chunks(slice_len)
-                    .map(|chunk| reducer.reduce_slice(chunk)),
+                    .map(|chunk| kernel.reduce_slice(chunk)),
             );
         }
         _ => {
             if resolved_axes.len() == 1 {
                 // Fast path for reducing a single axis.
                 let resolved_axis = resolved_axes[0];
-                reduced_data.extend(
-                    input
-                        .lanes(resolved_axis)
-                        .map(|lane| reducer.reduce(lane.copied())),
-                );
+                reduced_data.extend(input.lanes(resolved_axis).map(|lane| {
+                    if let Some(lane_slice) = lane.as_slice() {
+                        kernel.reduce_slice(lane_slice)
+                    } else {
+                        tmp_buf.clear();
+                        tmp_buf.extend(lane.copied());
+                        kernel.reduce_slice(&tmp_buf)
+                    }
+                }));
             } else {
-                // Slow case when we have to step through each index
-                let outer_range: Vec<_> = (0..input.ndim())
-                    .map(|dim| {
-                        if resolved_axes.contains(&dim) {
-                            1
-                        } else {
-                            input.size(dim)
-                        }
-                    })
-                    .collect();
-                let mut inner_range = Vec::with_capacity(input.ndim());
-                for index in DynIndices::from_shape(&outer_range) {
-                    inner_range.clear();
-                    inner_range.extend(index.iter().enumerate().map(|(dim, &idx)| {
-                        if resolved_axes.contains(&dim) {
-                            SliceItem::range(0, Some(input.size(dim) as isize), 1)
-                        } else {
-                            SliceItem::Index(idx as isize)
-                        }
-                    }));
-                    let slice = input.slice(inner_range.as_slice());
-                    let reduced = reducer.reduce(slice.iter().copied());
+                // Permute input so the N reduced dims are last, then iterate
+                // over slices of the inner N dims.
+                let mut perm: Vec<usize> = (0..input.ndim()).collect();
+                perm.sort_by_key(|&dim| (resolved_axes.contains(&dim), dim));
+                let permuted = input.permuted(&perm);
+
+                for slice in permuted.inner_iter_dyn(resolved_axes.len()) {
+                    // The reduced dimensions may be contiguous even if the
+                    // tensor is not.
+                    let reduced = if let Some(data) = slice.data() {
+                        kernel.reduce_slice(data)
+                    } else {
+                        tmp_buf.clear();
+                        let tmp_uninit = &mut tmp_buf.spare_capacity_mut()[..slice.len()];
+                        let tmp = slice.copy_into_slice(tmp_uninit);
+                        kernel.reduce_slice(tmp)
+                    };
                     reduced_data.push(reduced);
                 }
             }
@@ -393,19 +398,14 @@ pub fn reduce_mean(
     axes: Option<&[i32]>,
     keep_dims: bool,
 ) -> Result<Tensor, OpError> {
-    struct MeanReducer {}
-    impl Reducer<f32> for MeanReducer {
-        fn reduce<I: ExactSizeIterator<Item = f32>>(&self, iter: I) -> f32 {
-            let len = iter.len();
-            iter_sum(iter) / len as f32
-        }
-
+    struct MeanKernel {}
+    impl ReduceKernel<f32> for MeanKernel {
         fn reduce_slice(&self, slice: &[f32]) -> f32 {
             vec_sum(slice) / slice.len() as f32
         }
     }
 
-    reduce(pool, input, axes, keep_dims, MeanReducer {})
+    reduce(pool, input, axes, keep_dims, &MeanKernel {})
 }
 
 /// Reduces axes of a tensor using an inverse Root Mean Squared (RMS)
@@ -423,24 +423,18 @@ pub fn reduce_inverse_rms(
     keep_dims: bool,
     epsilon: f32,
 ) -> Result<Tensor, OpError> {
-    struct InverseRmsReducer {
+    struct InverseRmsKernel {
         epsilon: f32,
     }
 
-    impl Reducer<f32> for InverseRmsReducer {
-        fn reduce<I: ExactSizeIterator<Item = f32>>(&self, iter: I) -> f32 {
-            let len = iter.len();
-            let mean_square = iter_sum(iter.map(|x| x * x)) / len as f32;
-            1. / (mean_square + self.epsilon).sqrt()
-        }
-
+    impl ReduceKernel<f32> for InverseRmsKernel {
         fn reduce_slice(&self, slice: &[f32]) -> f32 {
             let mean_square = vec_sum_square(slice) / slice.len() as f32;
             1. / (mean_square + self.epsilon).sqrt()
         }
     }
 
-    reduce(pool, input, axes, keep_dims, InverseRmsReducer { epsilon })
+    reduce(pool, input, axes, keep_dims, &InverseRmsKernel { epsilon })
 }
 
 #[derive(Debug)]
@@ -473,19 +467,14 @@ pub fn reduce_l2(
     axes: Option<&[i32]>,
     keep_dims: bool,
 ) -> Result<Tensor, OpError> {
-    struct L2Reducer {}
-    impl Reducer<f32> for L2Reducer {
-        fn reduce<I: ExactSizeIterator<Item = f32>>(&self, iter: I) -> f32 {
-            let sum_of_squares: f32 = iter.map(|val| val * val).sum();
-            sum_of_squares.sqrt()
-        }
-
+    struct L2ReduceKernel {}
+    impl ReduceKernel<f32> for L2ReduceKernel {
         fn reduce_slice(&self, slice: &[f32]) -> f32 {
             vec_sum_square(slice).sqrt()
         }
     }
 
-    reduce(pool, input, axes, keep_dims, L2Reducer {})
+    reduce(pool, input, axes, keep_dims, &L2ReduceKernel {})
 }
 
 #[derive(Debug)]
@@ -550,17 +539,17 @@ fn reduce_min_max<T: Copy + Default + PartialOrd + IsNaN>(
     struct MinMaxReducer {
         max: bool,
     }
-    impl<T: Copy + PartialOrd + IsNaN> Reducer<T> for MinMaxReducer {
-        fn reduce<I: ExactSizeIterator<Item = T>>(&self, iter: I) -> T {
+    impl<T: Copy + PartialOrd + IsNaN> ReduceKernel<T> for MinMaxReducer {
+        fn reduce_slice(&self, slice: &[T]) -> T {
             let reduced = if self.max {
-                iter.max_by(|a, b| cmp_nan_greater(*a, *b))
+                slice.iter().copied().max_by(|a, b| cmp_nan_greater(*a, *b))
             } else {
-                iter.min_by(|a, b| cmp_nan_less(*a, *b))
+                slice.iter().copied().min_by(|a, b| cmp_nan_less(*a, *b))
            };
             reduced.expect("attempted to get min/max of empty axis")
         }
     }
-    reduce(pool, input, axes, keep_dims, MinMaxReducer { max })
+    reduce(pool, input, axes, keep_dims, &MinMaxReducer { max })
 }
 
 /// Extract axes from input 1 in `inputs` or `attr`.
@@ -638,13 +627,13 @@ pub fn reduce_prod<T: Copy + Default + std::iter::Product>(
     axes: Option<&[i32]>,
     keep_dims: bool,
 ) -> Result<Tensor<T>, OpError> {
-    struct ProdReducer {}
-    impl<T: Copy + Default + std::iter::Product> Reducer<T> for ProdReducer {
-        fn reduce<I: ExactSizeIterator<Item = T>>(&self, iter: I) -> T {
-            iter.product()
+    struct ProdKernel {}
+    impl<T: Copy + Default + std::iter::Product> ReduceKernel<T> for ProdKernel {
+        fn reduce_slice(&self, slice: &[T]) -> T {
+            slice.iter().copied().product()
         }
     }
-    reduce(pool, input, axes, keep_dims, ProdReducer {})
+    reduce(pool, input, axes, keep_dims, &ProdKernel {})
 }
 
 #[derive(Debug)]
@@ -671,17 +660,13 @@ pub fn reduce_sum<T: Copy + Default + std::ops::Add<Output = T>>(
     axes: Option<&[i32]>,
     keep_dims: bool,
 ) -> Result<Tensor<T>, OpError> {
-    struct SumReducer {}
-    impl<T: Copy + Default + std::ops::Add<Output = T>> Reducer<T> for SumReducer {
-        fn reduce<I: ExactSizeIterator<Item = T>>(&self, iter: I) -> T {
-            iter_sum(iter)
-        }
-
+    struct SumKernel {}
+    impl<T: Copy + Default + std::ops::Add<Output = T>> ReduceKernel<T> for SumKernel {
         fn reduce_slice(&self, slice: &[T]) -> T {
             slice_sum(slice)
         }
     }
-    reduce(pool, input, axes, keep_dims, SumReducer {})
+    reduce(pool, input, axes, keep_dims, &SumKernel {})
 }
 
 #[derive(Debug)]
@@ -708,13 +693,13 @@ pub fn reduce_sum_square<T: Copy + Default + std::ops::Mul<Output = T> + std::iter::Sum>(
     axes: Option<&[i32]>,
     keep_dims: bool,
 ) -> Result<Tensor<T>, OpError> {
-    struct SumSquareReducer {}
-    impl<T: Copy + std::iter::Sum + std::ops::Mul<Output = T>> Reducer<T> for SumSquareReducer {
-        fn reduce<I: ExactSizeIterator<Item = T>>(&self, iter: I) -> T {
-            iter.map(|x| x * x).sum()
+    struct SumSquareKernel {}
+    impl<T: Copy + std::iter::Sum + std::ops::Mul<Output = T>> ReduceKernel<T> for SumSquareKernel {
+        fn reduce_slice(&self, slice: &[T]) -> T {
+            slice.iter().copied().map(|x| x * x).sum()
         }
     }
-    reduce(pool, input, axes, keep_dims, SumSquareReducer {})
+    reduce(pool, input, axes, keep_dims, &SumSquareKernel {})
 }
 
 #[derive(Debug)]
@@ -851,7 +836,7 @@ mod tests {
 
     use rten_tensor::prelude::*;
     use rten_tensor::test_util::{eq_with_nans, expect_equal};
-    use rten_tensor::{NdTensor, Tensor};
+    use rten_tensor::{NdTensor, SliceRange, Tensor};
 
     use crate::ops::tests::{new_pool, run_op};
     use crate::ops::{
@@ -1069,6 +1054,8 @@ mod tests {
         Ok(())
     }
 
+    // Tests for ReduceMean specifically that also cover common functionality
+    // across the different reductions.
     #[test]
     fn test_reduce_mean() -> Result<(), Box<dyn Error>> {
         let pool = new_pool();
@@ -1128,6 +1115,44 @@
             .unwrap();
         assert_eq!(result.to_vec(), &[5.0]);
 
+        // Reduce non-contiguous lane
+        let tensor = Tensor::from([0., 1., 2., 3., 4., 5., 6.]);
+        let slice = tensor.slice(SliceRange::new(0, None, 2));
+        let expected_mean = slice.iter().sum::<f32>() / slice.len() as f32;
+        let result = reduce_mean(&pool, slice.view(), Some(&[0]), false /* keep_dims */).unwrap();
+        assert_eq!(result.to_vec(), &[expected_mean]);
+
+        // Reduce contiguous lanes in non-contiguous tensor
+        let tensor = Tensor::from([[0., 1.], [2., 3.], [4., 5.]]);
+        let slice = tensor.slice(SliceRange::new(0, None, 2));
+        let result = reduce_mean(&pool, slice.view(), Some(&[1]), false /* keep_dims */).unwrap();
+        assert_eq!(result.to_vec(), &[0.5, 4.5]);
+
+        // Reduce multiple non-contiguous dimensions
+        let tensor = Tensor::from([[0., 1.], [2., 3.], [4., 5.]]);
+        let slice = tensor.slice((SliceRange::new(0, None, 2), SliceRange::new(0, None, 2)));
+        let expected_mean = slice.iter().sum::<f32>() / slice.len() as f32;
+        let result = reduce_mean(
+            &pool,
+            slice.view(),
+            Some(&[0, 1]),
+            false, /* keep_dims */
+        )
+        .unwrap();
+        assert_eq!(result.to_vec(), &[expected_mean]);
+
+        // Reduce multiple contiguous dimensions in non-contiguous tensor
+        let tensor = Tensor::from([[[0.], [1.]], [[2.], [3.]], [[4.], [5.]]]);
+        let slice = tensor.slice(SliceRange::new(0, None, 2));
+        let result = reduce_mean(
+            &pool,
+            slice.view(),
+            Some(&[1, 2]),
+            false, /* keep_dims */
+        )
+        .unwrap();
+        assert_eq!(result.to_vec(), &[0.5, 4.5]);
+
         Ok(())
     }
diff --git a/src/slice_reductions.rs b/src/slice_reductions.rs
index 57b9bb96..e514dc24 100644
--- a/src/slice_reductions.rs
+++ b/src/slice_reductions.rs
@@ -64,36 +64,6 @@ pub fn slice_map_sum<T: Copy + Default + std::ops::Add<Output = T>, M: Fn(T) -> T>(
         .fold(T::default(), |acc, x| acc + x)
 }
 
-/// Return the sum of an iterator of numbers.
-pub fn iter_sum<T: Copy + Default + std::ops::Add<Output = T>, I: ExactSizeIterator<Item = T>>(
-    mut iter: I,
-) -> T {
-    let zero = T::default();
-    let len = iter.len();
-    let mut sum = zero;
-    let mut n = len;
-
-    while n > 4 {
-        n -= 4;
-
-        let a = iter.next().unwrap_or(zero);
-        let b = iter.next().unwrap_or(zero);
-        let c = iter.next().unwrap_or(zero);
-        let d = iter.next().unwrap_or(zero);
-
-        let ab = a + b;
-        let cd = c + d;
-        let abcd = ab + cd;
-        sum = sum + abcd;
-    }
-
-    for x in iter {
-        sum = sum + x;
-    }
-
-    sum
-}
-
 #[cfg(test)]
 mod tests {
     use rten_tensor::rng::XorShiftRng;
diff --git a/src/tensor_pool.rs b/src/tensor_pool.rs
index 55da77c1..30c318a4 100644
--- a/src/tensor_pool.rs
+++ b/src/tensor_pool.rs
@@ -216,6 +216,18 @@ pub trait ExtractBuffer {
     fn extract_buffer(self) -> Option<Vec<Self::Elem>>;
 }
 
+impl<T> ExtractBuffer for Vec<T> {
+    type Elem = T;
+
+    fn extract_buffer(self) -> Option<Vec<T>> {
+        if self.capacity() > 0 {
+            Some(self)
+        } else {
+            None
+        }
+    }
+}
+
 impl<T, L: MutLayout> ExtractBuffer for TensorBase<Vec<T>, L> {
     type Elem = T;
 
@@ -420,23 +432,39 @@ mod tests {
         let pool = TensorPool::new();
         assert_eq!(pool.len(), 0);
 
-        {
-            // Owned tensor. This will auto-return to the pool.
-            let tensor = NdTensor::<f32, 2>::zeros_in(&pool, [2, 2]).auto_return(&pool);
-            assert_eq!(tensor.shape(), [2, 2]);
-
-            // Conditional copy which doesn't copy. This will not return to the pool.
-            tensor.to_contiguous_in(&pool).auto_return(&pool);
+        // Owned tensor. This will auto-return to the pool.
+        let tensor = NdTensor::<f32, 2>::zeros_in(&pool, [2, 2]).auto_return(&pool);
+        assert_eq!(tensor.shape(), [2, 2]);
+        assert_eq!(pool.alloc_count(), 1);
+        assert_eq!(pool.len(), 0);
 
-            // Conditional copy which does copy. This will return to the pool.
-            tensor
-                .transposed()
-                .to_contiguous_in(&pool)
-                .auto_return(&pool);
-        }
+        // Conditional copy which doesn't copy. This will not return to the pool.
+        let copy = tensor.to_contiguous_in(&pool).auto_return(&pool);
+        std::mem::drop(copy);
+        assert_eq!(pool.alloc_count(), 1);
+        assert_eq!(pool.len(), 0);
 
+        // Conditional copy which does copy. This will return to the pool.
+        let copy = tensor
+            .transposed()
+            .to_contiguous_in(&pool)
+            .auto_return(&pool);
+        std::mem::drop(copy);
         assert_eq!(pool.alloc_count(), 2);
+        assert_eq!(pool.len(), 1);
+
+        std::mem::drop(tensor);
         assert_eq!(pool.len(), 2);
+
+        // Non-empty vector. This will return to the pool.
+        let non_empty = Vec::<f32>::with_capacity(16).auto_return(&pool);
+        std::mem::drop(non_empty);
+        assert_eq!(pool.len(), 3);
+
+        // Empty vector. This will not return to the pool.
+        let empty = Vec::<f32>::new().auto_return(&pool);
+        std::mem::drop(empty);
+        assert_eq!(pool.len(), 3);
     }
 
     #[test]
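For reference, a minimal sketch (not part of the patch) of how a kernel plugs into the new reduction plumbing. The `ReduceKernel` trait and `AsView::copy_into_slice` are assumed to have the signatures shown in the diff above; `MaxAbsKernel` and `reduce_lane` are hypothetical names used only for illustration of the contiguous/non-contiguous split that `reduce` performs.

use rten_tensor::prelude::*;
use rten_tensor::TensorView;

/// Hypothetical kernel that reduces a slice to its maximum absolute value.
struct MaxAbsKernel;

impl ReduceKernel<f32> for MaxAbsKernel {
    fn reduce_slice(&self, slice: &[f32]) -> f32 {
        slice.iter().fold(0.0, |acc, x| acc.max(x.abs()))
    }
}

/// Hypothetical helper mirroring the inner loop of `reduce`: call the kernel
/// directly on contiguous data, otherwise pack the view into `tmp_buf` first
/// via `copy_into_slice` and reduce the packed copy.
fn reduce_lane(lane: TensorView<f32>, tmp_buf: &mut Vec<f32>, kernel: &dyn ReduceKernel<f32>) -> f32 {
    if let Some(data) = lane.data() {
        // Contiguous: reduce the underlying slice directly.
        kernel.reduce_slice(data)
    } else {
        // Non-contiguous: copy elements in logical order into spare capacity.
        tmp_buf.clear();
        tmp_buf.reserve(lane.len());
        let tmp_uninit = &mut tmp_buf.spare_capacity_mut()[..lane.len()];
        let packed = lane.copy_into_slice(tmp_uninit);
        kernel.reduce_slice(packed)
    }
}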