From 11b3250206fec47b86c1377c0a38828b81fe20fb Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Mon, 16 Dec 2024 19:54:57 +0100 Subject: [PATCH 01/10] Re-organize `auto_return` tests so we check effects after each case --- src/tensor_pool.rs | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/tensor_pool.rs b/src/tensor_pool.rs index 55da77c1..46d0723d 100644 --- a/src/tensor_pool.rs +++ b/src/tensor_pool.rs @@ -420,22 +420,28 @@ mod tests { let pool = TensorPool::new(); assert_eq!(pool.len(), 0); - { - // Owned tensor. This will auto-return to the pool. - let tensor = NdTensor::::zeros_in(&pool, [2, 2]).auto_return(&pool); - assert_eq!(tensor.shape(), [2, 2]); - - // Conditional copy which doesn't copy. This will not return to the pool. - tensor.to_contiguous_in(&pool).auto_return(&pool); + // Owned tensor. This will auto-return to the pool. + let tensor = NdTensor::::zeros_in(&pool, [2, 2]).auto_return(&pool); + assert_eq!(tensor.shape(), [2, 2]); + assert_eq!(pool.alloc_count(), 1); + assert_eq!(pool.len(), 0); - // Conditional copy which does copy. This will return to the pool. - tensor - .transposed() - .to_contiguous_in(&pool) - .auto_return(&pool); - } + // Conditional copy which doesn't copy. This will not return to the pool. + let copy = tensor.to_contiguous_in(&pool).auto_return(&pool); + std::mem::drop(copy); + assert_eq!(pool.alloc_count(), 1); + assert_eq!(pool.len(), 0); + // Conditional copy which does copy. This will return to the pool. + let copy = tensor + .transposed() + .to_contiguous_in(&pool) + .auto_return(&pool); + std::mem::drop(copy); assert_eq!(pool.alloc_count(), 2); + assert_eq!(pool.len(), 1); + + std::mem::drop(tensor); assert_eq!(pool.len(), 2); } From 9a2a022b91aa3c4da9352e612868b98350ff71ce Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Mon, 16 Dec 2024 12:00:51 +0100 Subject: [PATCH 02/10] Impl ExtractBuffer for Vec This is useful for allocating temporary buffers from a pool that are not part of a tensor. --- src/tensor_pool.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/tensor_pool.rs b/src/tensor_pool.rs index 46d0723d..30c318a4 100644 --- a/src/tensor_pool.rs +++ b/src/tensor_pool.rs @@ -216,6 +216,18 @@ pub trait ExtractBuffer { fn extract_buffer(self) -> Option>; } +impl ExtractBuffer for Vec { + type Elem = T; + + fn extract_buffer(self) -> Option> { + if self.capacity() > 0 { + Some(self) + } else { + None + } + } +} + impl ExtractBuffer for TensorBase, L> { type Elem = T; @@ -443,6 +455,16 @@ mod tests { std::mem::drop(tensor); assert_eq!(pool.len(), 2); + + // Non-empty vector. This will return to the pool. + let non_empty = Vec::::with_capacity(16).auto_return(&pool); + std::mem::drop(non_empty); + assert_eq!(pool.len(), 3); + + // Empty vector. This will not return to the pool. + let empty = Vec::::new().auto_return(&pool); + std::mem::drop(empty); + assert_eq!(pool.len(), 3); } #[test] From 908bdb58a05cb3154b8122e6517bbd33da8e3b93 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Mon, 16 Dec 2024 12:02:05 +0100 Subject: [PATCH 03/10] Pack reduced slices if non-contiguous before reducing Change the core logic of reduction operations to always operate on contiguous slices. If a tensor slice to be reduced is not contiguous, copy to a temporary buffer first. This has several benefits: - It avoids the need to write two versions of each reducer: a fast, possibly vectorized path for contiguous inputs and a fallback path using iterators. 
- It makes the performance of the fallback case more predictable. - The amount of code generated for reductions can be reduced by using dynamic dispatch instead of generics to call the slice reduction method. This reduces LLVM IR as reported by `cargo llvm-lines -p rten` by 3.7% from 717K to 690K lines. --- src/ops/reduce.rs | 102 ++++++++++++++++++---------------------- src/slice_reductions.rs | 30 ------------ 2 files changed, 45 insertions(+), 87 deletions(-) diff --git a/src/ops/reduce.rs b/src/ops/reduce.rs index ab99fdd3..dbe136b4 100644 --- a/src/ops/reduce.rs +++ b/src/ops/reduce.rs @@ -11,8 +11,8 @@ use crate::ops::layout::squeeze_in_place; use crate::ops::{ resolve_axes, resolve_axis, Input, InputList, IntoOpResult, OpError, Operator, OutputList, }; -use crate::slice_reductions::{iter_sum, slice_sum}; -use crate::tensor_pool::TensorPool; +use crate::slice_reductions::slice_sum; +use crate::tensor_pool::{AutoReturn, TensorPool}; /// Compute the indices of the max elements along an axis, according to a /// comparison function `compare`. @@ -263,27 +263,17 @@ impl Operator for NonZero { } /// Trait for reducing a subset of elements from a tensor to a single value. -/// -/// This is a trait rather than a closure to support being invoked with -/// dynamically chosen iterator types. trait Reducer { - fn reduce>(&self, iter: I) -> T; - /// Reduce a contiguous slice of values to a single value. - fn reduce_slice(&self, slice: &[T]) -> T - where - T: Copy, - { - self.reduce(slice.iter().copied()) - } + fn reduce_slice(&self, slice: &[T]) -> T; } -fn reduce>( +fn reduce( pool: &TensorPool, input: TensorView, axes: Option<&[i32]>, keep_dims: bool, - reducer: R, + reducer: &dyn Reducer, ) -> Result, OpError> { let mut resolved_axes = match axes { Some(axes) if !axes.is_empty() => resolve_axes(input.ndim(), axes.iter())?, @@ -291,8 +281,19 @@ fn reduce>( }; resolved_axes.sort(); + // Allocate temporary buffer where slices of the input to be reduced are + // packed first if non-contiguous. + let mut tmp_buf = if !input.is_contiguous() { + let reduced_slice_len = resolved_axes.iter().map(|&dim| input.size(dim)).product(); + pool.alloc(reduced_slice_len) + } else { + Vec::new() + } + .auto_return(pool); + if input.ndim() == 0 { - return Ok(Tensor::from_scalar(reducer.reduce(input.iter().copied()))); + let item = input.item().unwrap(); + return Ok(Tensor::from_scalar(reducer.reduce_slice(&[*item]))); } // nb. Some reduce operations cannot produce a meaningful result with @@ -342,11 +343,15 @@ fn reduce>( if resolved_axes.len() == 1 { // Fast path for reducing a single axis. 
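// Each output element is the reduction of one lane of the input along the
// reduced axis. A lane that is contiguous in memory is handed to
// `reduce_slice` directly; otherwise it is first copied into `tmp_buf`.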
let resolved_axis = resolved_axes[0]; - reduced_data.extend( - input - .lanes(resolved_axis) - .map(|lane| reducer.reduce(lane.copied())), - ); + reduced_data.extend(input.lanes(resolved_axis).map(|lane| { + if let Some(lane_slice) = lane.as_slice() { + reducer.reduce_slice(lane_slice) + } else { + tmp_buf.clear(); + tmp_buf.extend(lane.copied()); + reducer.reduce_slice(&tmp_buf) + } + })); } else { // Slow case when we have to step through each index let outer_range: Vec<_> = (0..input.ndim()) @@ -369,7 +374,10 @@ fn reduce>( } })); let slice = input.slice(inner_range.as_slice()); - let reduced = reducer.reduce(slice.iter().copied()); + + tmp_buf.clear(); + tmp_buf.extend(slice.iter().copied()); + let reduced = reducer.reduce_slice(&tmp_buf); reduced_data.push(reduced); } } @@ -395,17 +403,12 @@ pub fn reduce_mean( ) -> Result { struct MeanReducer {} impl Reducer for MeanReducer { - fn reduce>(&self, iter: I) -> f32 { - let len = iter.len(); - iter_sum(iter) / len as f32 - } - fn reduce_slice(&self, slice: &[f32]) -> f32 { vec_sum(slice) / slice.len() as f32 } } - reduce(pool, input, axes, keep_dims, MeanReducer {}) + reduce(pool, input, axes, keep_dims, &MeanReducer {}) } /// Reduces axes of a tensor using an inverse Root Mean Squared (RMS) @@ -428,19 +431,13 @@ pub fn reduce_inverse_rms( } impl Reducer for InverseRmsReducer { - fn reduce>(&self, iter: I) -> f32 { - let len = iter.len(); - let mean_square = iter_sum(iter.map(|x| x * x)) / len as f32; - 1. / (mean_square + self.epsilon).sqrt() - } - fn reduce_slice(&self, slice: &[f32]) -> f32 { let mean_square = vec_sum_square(slice) / slice.len() as f32; 1. / (mean_square + self.epsilon).sqrt() } } - reduce(pool, input, axes, keep_dims, InverseRmsReducer { epsilon }) + reduce(pool, input, axes, keep_dims, &InverseRmsReducer { epsilon }) } #[derive(Debug)] @@ -475,17 +472,12 @@ pub fn reduce_l2( ) -> Result { struct L2Reducer {} impl Reducer for L2Reducer { - fn reduce>(&self, iter: I) -> f32 { - let sum_of_squares: f32 = iter.map(|val| val * val).sum(); - sum_of_squares.sqrt() - } - fn reduce_slice(&self, slice: &[f32]) -> f32 { vec_sum_square(slice).sqrt() } } - reduce(pool, input, axes, keep_dims, L2Reducer {}) + reduce(pool, input, axes, keep_dims, &L2Reducer {}) } #[derive(Debug)] @@ -551,16 +543,16 @@ fn reduce_min_max( max: bool, } impl Reducer for MinMaxReducer { - fn reduce>(&self, iter: I) -> T { + fn reduce_slice(&self, slice: &[T]) -> T { let reduced = if self.max { - iter.max_by(|a, b| cmp_nan_greater(*a, *b)) + slice.iter().copied().max_by(|a, b| cmp_nan_greater(*a, *b)) } else { - iter.min_by(|a, b| cmp_nan_less(*a, *b)) + slice.iter().copied().min_by(|a, b| cmp_nan_less(*a, *b)) }; reduced.expect("attempted to get min/max of empty axis") } } - reduce(pool, input, axes, keep_dims, MinMaxReducer { max }) + reduce(pool, input, axes, keep_dims, &MinMaxReducer { max }) } /// Extract axes from input 1 in `inputs` or `attr`. 
@@ -639,12 +631,12 @@ pub fn reduce_prod( keep_dims: bool, ) -> Result, OpError> { struct ProdReducer {} - impl Reducer for ProdReducer { - fn reduce>(&self, iter: I) -> T { - iter.product() + impl Reducer for ProdReducer { + fn reduce_slice(&self, slice: &[T]) -> T { + slice.iter().copied().product() } } - reduce(pool, input, axes, keep_dims, ProdReducer {}) + reduce(pool, input, axes, keep_dims, &ProdReducer {}) } #[derive(Debug)] @@ -673,15 +665,11 @@ pub fn reduce_sum>( ) -> Result, OpError> { struct SumReducer {} impl> Reducer for SumReducer { - fn reduce>(&self, iter: I) -> T { - iter_sum(iter) - } - fn reduce_slice(&self, slice: &[T]) -> T { slice_sum(slice) } } - reduce(pool, input, axes, keep_dims, SumReducer {}) + reduce(pool, input, axes, keep_dims, &SumReducer {}) } #[derive(Debug)] @@ -710,11 +698,11 @@ pub fn reduce_sum_square + std::iter::Sum ) -> Result, OpError> { struct SumSquareReducer {} impl> Reducer for SumSquareReducer { - fn reduce>(&self, iter: I) -> T { - iter.map(|x| x * x).sum() + fn reduce_slice(&self, slice: &[T]) -> T { + slice.iter().copied().map(|x| x * x).sum() } } - reduce(pool, input, axes, keep_dims, SumSquareReducer {}) + reduce(pool, input, axes, keep_dims, &SumSquareReducer {}) } #[derive(Debug)] diff --git a/src/slice_reductions.rs b/src/slice_reductions.rs index 57b9bb96..e514dc24 100644 --- a/src/slice_reductions.rs +++ b/src/slice_reductions.rs @@ -64,36 +64,6 @@ pub fn slice_map_sum, M: Fn(T) -> .fold(T::default(), |acc, x| acc + x) } -/// Return the sum of an iterator of numbers. -pub fn iter_sum, I: ExactSizeIterator>( - mut iter: I, -) -> T { - let zero = T::default(); - let len = iter.len(); - let mut sum = zero; - let mut n = len; - - while n > 4 { - n -= 4; - - let a = iter.next().unwrap_or(zero); - let b = iter.next().unwrap_or(zero); - let c = iter.next().unwrap_or(zero); - let d = iter.next().unwrap_or(zero); - - let ab = a + b; - let cd = c + d; - let abcd = ab + cd; - sum = sum + abcd; - } - - for x in iter { - sum = sum + x; - } - - sum -} - #[cfg(test)] mod tests { use rten_tensor::rng::XorShiftRng; From 5de32edf599498523f7038605f21dce2fdd503c8 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Mon, 16 Dec 2024 12:33:36 +0100 Subject: [PATCH 04/10] Simplify fallback for reducing 2+ axes in a non-contiguous tensor Shuffle the axes so the N reduced dimensions are last, then iterate over slices of the inner N dims. The implementation of `TensorBase::inner_iter_dyn` is essentially the same as the previous code, but this could be improved by taking advantage of the fact that only the start offset in the data changes on each iteration. 
--- src/ops/reduce.rs | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/src/ops/reduce.rs b/src/ops/reduce.rs index dbe136b4..c5897e3e 100644 --- a/src/ops/reduce.rs +++ b/src/ops/reduce.rs @@ -3,7 +3,7 @@ use std::cmp::Ordering; use rten_tensor; use rten_tensor::prelude::*; -use rten_tensor::{DynIndices, NdTensor, NdTensorView, SliceItem, Tensor, TensorView}; +use rten_tensor::{NdTensor, NdTensorView, Tensor, TensorView}; use rten_vecmath::{vec_sum, vec_sum_square}; use crate::number::{Identities, IsNaN}; @@ -353,28 +353,13 @@ fn reduce( } })); } else { - // Slow case when we have to step through each index - let outer_range: Vec<_> = (0..input.ndim()) - .map(|dim| { - if resolved_axes.contains(&dim) { - 1 - } else { - input.size(dim) - } - }) - .collect(); - let mut inner_range = Vec::with_capacity(input.ndim()); - for index in DynIndices::from_shape(&outer_range) { - inner_range.clear(); - inner_range.extend(index.iter().enumerate().map(|(dim, &idx)| { - if resolved_axes.contains(&dim) { - SliceItem::range(0, Some(input.size(dim) as isize), 1) - } else { - SliceItem::Index(idx as isize) - } - })); - let slice = input.slice(inner_range.as_slice()); + // Permute input so the N reduced dims are last, then iterate + // over slices of the inner N dims. + let mut perm: Vec = (0..input.ndim()).collect(); + perm.sort_by_key(|&dim| (resolved_axes.contains(&dim), dim)); + let permuted = input.permuted(&perm); + for slice in permuted.inner_iter_dyn(resolved_axes.len()) { tmp_buf.clear(); tmp_buf.extend(slice.iter().copied()); let reduced = reducer.reduce_slice(&tmp_buf); From 8b710e76f7b2371aeb4dd2a5286a8183f72c87f0 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Mon, 16 Dec 2024 13:14:28 +0100 Subject: [PATCH 05/10] Avoid packing input if reduced dimensions are contiguous When reducing a non-contiguous tensor, the reduced dimensions may be contiguous even if the tensor as a whole is not. In that case we can avoid packing costs. --- src/ops/reduce.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/ops/reduce.rs b/src/ops/reduce.rs index c5897e3e..a8d910ee 100644 --- a/src/ops/reduce.rs +++ b/src/ops/reduce.rs @@ -360,9 +360,15 @@ fn reduce( let permuted = input.permuted(&perm); for slice in permuted.inner_iter_dyn(resolved_axes.len()) { - tmp_buf.clear(); - tmp_buf.extend(slice.iter().copied()); - let reduced = reducer.reduce_slice(&tmp_buf); + // The reduced dimensions may be contiguous even if the + // tensor is not. + let reduced = if let Some(data) = slice.data() { + reducer.reduce_slice(data) + } else { + tmp_buf.clear(); + tmp_buf.extend(slice.iter().copied()); + reducer.reduce_slice(&tmp_buf) + }; reduced_data.push(reduced); } } From 532241fb02b6f1c6b4d5e07f4a6bda64117ed156 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Mon, 16 Dec 2024 18:05:06 +0100 Subject: [PATCH 06/10] Add `TensorBase::copy_into_slice` This is useful for cases where we want to get the elements in contiguous order but using a pre-allocated buffer. --- rten-tensor/src/tensor.rs | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/rten-tensor/src/tensor.rs b/rten-tensor/src/tensor.rs index e5c98ac6..6971944e 100644 --- a/rten-tensor/src/tensor.rs +++ b/rten-tensor/src/tensor.rs @@ -117,6 +117,14 @@ pub trait AsView: Layout { self.view().broadcast(shape) } + /// Copy elements from this tensor into `dest` in logical order. + /// + /// Returns the initialized slice. 
Panics if the length of `dest` does + /// not match the number of elements in `self`. + fn copy_into_slice<'a>(&self, dest: &'a mut [MaybeUninit]) -> &'a [Self::Elem] + where + Self::Elem: Copy; + /// Return the layout of this tensor as a slice, if it is contiguous. fn data(&self) -> Option<&[Self::Elem]>; @@ -1676,6 +1684,22 @@ impl, L: MutLayout + Clone> AsView for TensorBase self.view().iter() } + fn copy_into_slice<'a>(&self, dest: &'a mut [MaybeUninit]) -> &'a [T] + where + T: Copy, + { + if let Some(data) = self.data() { + // Safety: `[T]` and `[MaybeUninit]` have same layout. + let src_uninit = unsafe { std::mem::transmute::<&[T], &[MaybeUninit]>(data) }; + dest.copy_from_slice(src_uninit); + // Safety: `copy_from_slice` initializes the whole slice or panics + // if there is a length mismatch. + unsafe { std::mem::transmute::<&[MaybeUninit], &[T]>(dest) } + } else { + copy_into_slice(self.as_dyn(), dest) + } + } + fn data(&self) -> Option<&[Self::Elem]> { self.view().data() } @@ -2540,6 +2564,21 @@ mod tests { assert_eq!(dest.to_vec(), &[1., 2., 3., 4.]); } + #[test] + fn test_copy_into_slice() { + let src = NdTensor::from([[1, 2], [3, 4], [5, 6]]); + let mut buf = Vec::with_capacity(src.len()); + let buf_uninit = &mut buf.spare_capacity_mut()[..src.len()]; + + // Contiguous case. + let elts = src.copy_into_slice(buf_uninit); + assert_eq!(elts, &[1, 2, 3, 4, 5, 6]); + + // Non-contiguous case. + let transposed_elts = src.transposed().copy_into_slice(buf_uninit); + assert_eq!(transposed_elts, &[1, 3, 5, 2, 4, 6]); + } + #[test] fn test_data() { let data = &[1., 2., 3., 4., 5., 6.]; From dc5dff70249dc35c59ef2c2c1b9a7f5eb010f8f6 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Mon, 16 Dec 2024 18:17:55 +0100 Subject: [PATCH 07/10] Save a tensor view copy in `copy_into_slice` --- rten-tensor/src/copy.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rten-tensor/src/copy.rs b/rten-tensor/src/copy.rs index 88321119..f8477f1d 100644 --- a/rten-tensor/src/copy.rs +++ b/rten-tensor/src/copy.rs @@ -171,14 +171,13 @@ fn copy_blocked(src: Matrix, mut dest: MatrixMut>) { /// /// Returns `dest` as an initialized slice. pub fn copy_into_slice<'a, T: Clone>( - src: TensorView, + mut src: TensorView, dest: &'a mut [MaybeUninit], ) -> &'a [T] { assert!(dest.len() == src.len()); // Merge axes to increase the chance that we can use the fast path and // also maximize the iteration count of the innermost loops. - let mut src = src.clone(); src.merge_axes(); if src.ndim() > 4 { From 8d518fa03a67c9f81e08b4553613a3f469be67f5 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Mon, 16 Dec 2024 18:23:41 +0100 Subject: [PATCH 08/10] Use `TensoBase::copy_into_slice` to pack non-contiguous slices This will use blocked or nested-loop copy methods which are more efficient than tensor iterators when the reduced slices are sufficiently large. 
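The packing step in `reduce` then reads roughly as in the sketch below (condensed from the diff that follows; the surrounding contiguous fast path is omitted):

    // Pack the non-contiguous slice into the pooled scratch buffer.
    // `spare_capacity_mut` exposes the uninitialized capacity of `tmp_buf`,
    // and `copy_into_slice` fills it in logical order and returns the
    // initialized prefix, which is then reduced as a contiguous slice.
    tmp_buf.clear();
    let tmp_uninit = &mut tmp_buf.spare_capacity_mut()[..slice.len()];
    let tmp = slice.copy_into_slice(tmp_uninit);
    reducer.reduce_slice(tmp)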
--- src/ops/reduce.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ops/reduce.rs b/src/ops/reduce.rs index a8d910ee..039eb5fb 100644 --- a/src/ops/reduce.rs +++ b/src/ops/reduce.rs @@ -366,8 +366,9 @@ fn reduce( reducer.reduce_slice(data) } else { tmp_buf.clear(); - tmp_buf.extend(slice.iter().copied()); - reducer.reduce_slice(&tmp_buf) + let tmp_uninit = &mut tmp_buf.spare_capacity_mut()[..slice.len()]; + let tmp = slice.copy_into_slice(tmp_uninit); + reducer.reduce_slice(tmp) }; reduced_data.push(reduced); } From 8a249ad02e703d80481bd58286afbcf2bdb2ea76 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Mon, 16 Dec 2024 20:18:32 +0100 Subject: [PATCH 09/10] Add test cases for reducing non-contiguous views --- src/ops/reduce.rs | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/src/ops/reduce.rs b/src/ops/reduce.rs index 039eb5fb..f163c19a 100644 --- a/src/ops/reduce.rs +++ b/src/ops/reduce.rs @@ -831,7 +831,7 @@ mod tests { use rten_tensor::prelude::*; use rten_tensor::test_util::{eq_with_nans, expect_equal}; - use rten_tensor::{NdTensor, Tensor}; + use rten_tensor::{NdTensor, SliceRange, Tensor}; use crate::ops::tests::{new_pool, run_op}; use crate::ops::{ @@ -1049,6 +1049,8 @@ mod tests { Ok(()) } + // Tests for ReduceMean specifically that also cover common functionality + // across the different reductions. #[test] fn test_reduce_mean() -> Result<(), Box> { let pool = new_pool(); @@ -1108,6 +1110,44 @@ mod tests { .unwrap(); assert_eq!(result.to_vec(), &[5.0]); + // Reduce non-contiguous lane + let tensor = Tensor::from([0., 1., 2., 3., 4., 5., 6.]); + let slice = tensor.slice(SliceRange::new(0, None, 2)); + let expected_mean = slice.iter().sum::() / slice.len() as f32; + let result = reduce_mean(&pool, slice.view(), Some(&[0]), false /* keep_dims */).unwrap(); + assert_eq!(result.to_vec(), &[expected_mean]); + + // Reduce contiguous lanes in non-contiguous tensor + let tensor = Tensor::from([[0., 1.], [2., 3.], [4., 5.]]); + let slice = tensor.slice(SliceRange::new(0, None, 2)); + let result = reduce_mean(&pool, slice.view(), Some(&[1]), false /* keep_dims */).unwrap(); + assert_eq!(result.to_vec(), &[0.5, 4.5]); + + // Reduce multiple non-contiguous dimensions + let tensor = Tensor::from([[0., 1.], [2., 3.], [4., 5.]]); + let slice = tensor.slice((SliceRange::new(0, None, 2), SliceRange::new(0, None, 2))); + let expected_mean = slice.iter().sum::() / slice.len() as f32; + let result = reduce_mean( + &pool, + slice.view(), + Some(&[0, 1]), + false, /* keep_dims */ + ) + .unwrap(); + assert_eq!(result.to_vec(), &[expected_mean]); + + // Reduce multiple contiguous dimensions in non-contiguous tensor + let tensor = Tensor::from([[[0.], [1.]], [[2.], [3.]], [[4.], [5.]]]); + let slice = tensor.slice(SliceRange::new(0, None, 2)); + let result = reduce_mean( + &pool, + slice.view(), + Some(&[1, 2]), + false, /* keep_dims */ + ) + .unwrap(); + assert_eq!(result.to_vec(), &[0.5, 4.5]); + Ok(()) } From 3168392e5053d10a5b60ddd0afac3b7299d551e7 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Mon, 16 Dec 2024 20:57:42 +0100 Subject: [PATCH 10/10] Rename Reducer -> ReduceKernel For consistency with other code, use the term "Kernel" to describe the code which handles the inner loop of reduction ops. 
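After the rename, the kernel trait and a typical implementation read roughly as follows (a condensed restatement of the `ReduceKernel` trait and the `MeanKernel` change from the diff below, with comments added):

    /// Kernel that handles reducing a single slice of the input.
    trait ReduceKernel<T> {
        /// Reduce a contiguous slice of values to a single value.
        fn reduce_slice(&self, slice: &[T]) -> T;
    }

    struct MeanKernel {}
    impl ReduceKernel<f32> for MeanKernel {
        fn reduce_slice(&self, slice: &[f32]) -> f32 {
            // `vec_sum` is the vectorized slice sum from rten_vecmath.
            vec_sum(slice) / slice.len() as f32
        }
    }

The outer `reduce` loop slices (and, if necessary, packs) the input and invokes the kernel through a `&dyn ReduceKernel<T>`, so each reduction only has to supply `reduce_slice`.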
--- src/ops/reduce.rs | 61 +++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/src/ops/reduce.rs b/src/ops/reduce.rs index f163c19a..9206aabf 100644 --- a/src/ops/reduce.rs +++ b/src/ops/reduce.rs @@ -262,18 +262,23 @@ impl Operator for NonZero { } } -/// Trait for reducing a subset of elements from a tensor to a single value. -trait Reducer { +/// Kernel that handles reducing a single slice of the input. +trait ReduceKernel { /// Reduce a contiguous slice of values to a single value. fn reduce_slice(&self, slice: &[T]) -> T; } +/// Outer loop of reduction operations. +/// +/// This iterates over slices of the input that are reduced independently and +/// invokes the kernel on that slice. If the input is not contiguous, the slice +/// is packed before calling the kernel. fn reduce( pool: &TensorPool, input: TensorView, axes: Option<&[i32]>, keep_dims: bool, - reducer: &dyn Reducer, + kernel: &dyn ReduceKernel, ) -> Result, OpError> { let mut resolved_axes = match axes { Some(axes) if !axes.is_empty() => resolve_axes(input.ndim(), axes.iter())?, @@ -293,7 +298,7 @@ fn reduce( if input.ndim() == 0 { let item = input.item().unwrap(); - return Ok(Tensor::from_scalar(reducer.reduce_slice(&[*item]))); + return Ok(Tensor::from_scalar(kernel.reduce_slice(&[*item]))); } // nb. Some reduce operations cannot produce a meaningful result with @@ -336,7 +341,7 @@ fn reduce( reduced_data.extend( input_data .chunks(slice_len) - .map(|chunk| reducer.reduce_slice(chunk)), + .map(|chunk| kernel.reduce_slice(chunk)), ); } _ => { @@ -345,11 +350,11 @@ fn reduce( let resolved_axis = resolved_axes[0]; reduced_data.extend(input.lanes(resolved_axis).map(|lane| { if let Some(lane_slice) = lane.as_slice() { - reducer.reduce_slice(lane_slice) + kernel.reduce_slice(lane_slice) } else { tmp_buf.clear(); tmp_buf.extend(lane.copied()); - reducer.reduce_slice(&tmp_buf) + kernel.reduce_slice(&tmp_buf) } })); } else { @@ -363,12 +368,12 @@ fn reduce( // The reduced dimensions may be contiguous even if the // tensor is not. let reduced = if let Some(data) = slice.data() { - reducer.reduce_slice(data) + kernel.reduce_slice(data) } else { tmp_buf.clear(); let tmp_uninit = &mut tmp_buf.spare_capacity_mut()[..slice.len()]; let tmp = slice.copy_into_slice(tmp_uninit); - reducer.reduce_slice(tmp) + kernel.reduce_slice(tmp) }; reduced_data.push(reduced); } @@ -393,14 +398,14 @@ pub fn reduce_mean( axes: Option<&[i32]>, keep_dims: bool, ) -> Result { - struct MeanReducer {} - impl Reducer for MeanReducer { + struct MeanKernel {} + impl ReduceKernel for MeanKernel { fn reduce_slice(&self, slice: &[f32]) -> f32 { vec_sum(slice) / slice.len() as f32 } } - reduce(pool, input, axes, keep_dims, &MeanReducer {}) + reduce(pool, input, axes, keep_dims, &MeanKernel {}) } /// Reduces axes of a tensor using an inverse Root Mean Squared (RMS) @@ -418,18 +423,18 @@ pub fn reduce_inverse_rms( keep_dims: bool, epsilon: f32, ) -> Result { - struct InverseRmsReducer { + struct InverseRmsKernel { epsilon: f32, } - impl Reducer for InverseRmsReducer { + impl ReduceKernel for InverseRmsKernel { fn reduce_slice(&self, slice: &[f32]) -> f32 { let mean_square = vec_sum_square(slice) / slice.len() as f32; 1. 
/ (mean_square + self.epsilon).sqrt() } } - reduce(pool, input, axes, keep_dims, &InverseRmsReducer { epsilon }) + reduce(pool, input, axes, keep_dims, &InverseRmsKernel { epsilon }) } #[derive(Debug)] @@ -462,14 +467,14 @@ pub fn reduce_l2( axes: Option<&[i32]>, keep_dims: bool, ) -> Result { - struct L2Reducer {} - impl Reducer for L2Reducer { + struct L2ReduceKernel {} + impl ReduceKernel for L2ReduceKernel { fn reduce_slice(&self, slice: &[f32]) -> f32 { vec_sum_square(slice).sqrt() } } - reduce(pool, input, axes, keep_dims, &L2Reducer {}) + reduce(pool, input, axes, keep_dims, &L2ReduceKernel {}) } #[derive(Debug)] @@ -534,7 +539,7 @@ fn reduce_min_max( struct MinMaxReducer { max: bool, } - impl Reducer for MinMaxReducer { + impl ReduceKernel for MinMaxReducer { fn reduce_slice(&self, slice: &[T]) -> T { let reduced = if self.max { slice.iter().copied().max_by(|a, b| cmp_nan_greater(*a, *b)) @@ -622,13 +627,13 @@ pub fn reduce_prod( axes: Option<&[i32]>, keep_dims: bool, ) -> Result, OpError> { - struct ProdReducer {} - impl Reducer for ProdReducer { + struct ProdKernel {} + impl ReduceKernel for ProdKernel { fn reduce_slice(&self, slice: &[T]) -> T { slice.iter().copied().product() } } - reduce(pool, input, axes, keep_dims, &ProdReducer {}) + reduce(pool, input, axes, keep_dims, &ProdKernel {}) } #[derive(Debug)] @@ -655,13 +660,13 @@ pub fn reduce_sum>( axes: Option<&[i32]>, keep_dims: bool, ) -> Result, OpError> { - struct SumReducer {} - impl> Reducer for SumReducer { + struct SumKernel {} + impl> ReduceKernel for SumKernel { fn reduce_slice(&self, slice: &[T]) -> T { slice_sum(slice) } } - reduce(pool, input, axes, keep_dims, &SumReducer {}) + reduce(pool, input, axes, keep_dims, &SumKernel {}) } #[derive(Debug)] @@ -688,13 +693,13 @@ pub fn reduce_sum_square + std::iter::Sum axes: Option<&[i32]>, keep_dims: bool, ) -> Result, OpError> { - struct SumSquareReducer {} - impl> Reducer for SumSquareReducer { + struct SumSquareKernel {} + impl> ReduceKernel for SumSquareKernel { fn reduce_slice(&self, slice: &[T]) -> T { slice.iter().copied().map(|x| x * x).sum() } } - reduce(pool, input, axes, keep_dims, &SumSquareReducer {}) + reduce(pool, input, axes, keep_dims, &SumSquareKernel {}) } #[derive(Debug)]
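As a closing illustration of the pattern this series ends up with, a new reduction would be wired up roughly as below. `reduce_max_abs` and `MaxAbsKernel` are hypothetical names used only for illustration; the shape of the function follows the existing `reduce_sum` and `reduce_sum_square` helpers above:

    pub fn reduce_max_abs(
        pool: &TensorPool,
        input: TensorView<f32>,
        axes: Option<&[i32]>,
        keep_dims: bool,
    ) -> Result<Tensor<f32>, OpError> {
        // Hypothetical kernel: reduces one contiguous slice to its maximum
        // absolute value. The outer `reduce` loop packs non-contiguous
        // slices into a scratch buffer before calling it.
        struct MaxAbsKernel {}
        impl ReduceKernel<f32> for MaxAbsKernel {
            fn reduce_slice(&self, slice: &[f32]) -> f32 {
                slice.iter().fold(0.0, |acc, x| acc.max(x.abs()))
            }
        }
        reduce(pool, input, axes, keep_dims, &MaxAbsKernel {})
    }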