johannesvollmer · johannesvollmer · Jul 8, 2023 · Jan 7, 2023 · Jan 7, 2023 · Jan 7, 2023
diff --git a/src/block/samples.rs b/src/block/samples.rs
@@ -1,6 +1,7 @@
 //! Extract pixel samples from a block of pixel bytes.
 
 use crate::prelude::*;
+use half::prelude::HalfFloatSliceExt;
 
 
 /// A single red, green, blue, or alpha value.
@@ -112,6 +113,7 @@ impl From<Sample> for u32 { #[inline] fn from(s: Sample) -> Self { s.to_u32() }
 
 /// Create an arbitrary sample type from one of the defined sample types.
 /// Should be compiled to a no-op where the file contains the predicted sample type.
+/// The slice functions should be optimized into a `memcpy` where no conversion is needed.
 pub trait FromNativeSample: Sized + Copy + Default + 'static {
 
     /// Create this sample from a f16, trying to represent the same numerical value
@@ -122,31 +124,77 @@ pub trait FromNativeSample: Sized + Copy + Default + 'static {
 
     /// Create this sample from a u32, trying to represent the same numerical value
     fn from_u32(value: u32) -> Self;
+
+    /// Convert every `f16` in `from` into this type, writing the results into `to`.
+    /// Operating on whole slices lets the compiler vectorize the conversion. The default implementation panics if the lengths differ.
+    #[inline]
+    fn from_f16s(from: &[f16], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        for (from, to) in from.iter().zip(to.iter_mut()) {
+            *to = Self::from_f16(*from);
+        }
+    }
+
+    /// Convert every `f32` in `from` into this type, writing the results into `to`.
+    /// Operating on whole slices lets the compiler vectorize the conversion. The default implementation panics if the lengths differ.
+    #[inline]
+    fn from_f32s(from: &[f32], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        for (from, to) in from.iter().zip(to.iter_mut()) {
+            *to = Self::from_f32(*from);
+        }
+    }
+
+    /// Convert every `u32` in `from` into this type, writing the results into `to`.
+    /// Operating on whole slices lets the compiler vectorize the conversion. The default implementation panics if the lengths differ.
+    #[inline]
+    fn from_u32s(from: &[u32], to: &mut [Self]) {
+        assert_eq!(from.len(), to.len(), "slices must have the same length");
+        for (from, to) in from.iter().zip(to.iter_mut()) {
+            *to = Self::from_u32(*from);
+        }
+    }
 }
 
 // TODO haven't i implemented this exact behaviour already somewhere else in this library...??
 impl FromNativeSample for f32 {
-    fn from_f16(value: f16) -> Self { value.to_f32() }
-    fn from_f32(value: f32) -> Self { value } // this branch means that we never have to match every single sample if the file format matches the expected output
-    fn from_u32(value: u32) -> Self { value as f32 }
+    #[inline] fn from_f16(value: f16) -> Self { value.to_f32() }
+    #[inline] fn from_f32(value: f32) -> Self { value }
+    #[inline] fn from_u32(value: u32) -> Self { value as f32 }
+
+    // `f16` is a custom type, so the compiler cannot
+    // automatically vectorize the per-sample conversion loop.
+    // That is why the default is overridden with the slice conversion from `half`.
+    #[inline]
+    fn from_f16s(from: &[f16], to: &mut [Self]) {
+        from.convert_to_f32_slice(to);
+    }
 }
 
 impl FromNativeSample for u32 {
-    fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
-    fn from_f32(value: f32) -> Self { value as u32 }
-    fn from_u32(value: u32) -> Self { value }
+    #[inline] fn from_f16(value: f16) -> Self { value.to_f32() as u32 }
+    #[inline] fn from_f32(value: f32) -> Self { value as u32 }
+    #[inline] fn from_u32(value: u32) -> Self { value }
 }
 
 impl FromNativeSample for f16 {
-    fn from_f16(value: f16) -> Self { value }
-    fn from_f32(value: f32) -> Self { f16::from_f32(value) }
-    fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }
+    #[inline] fn from_f16(value: f16) -> Self { value }
+    #[inline] fn from_f32(value: f32) -> Self { f16::from_f32(value) }
+    #[inline] fn from_u32(value: u32) -> Self { f16::from_f32(value as f32) }
+
+    // `f16` is a custom type, so the compiler cannot
+    // automatically vectorize the per-sample conversion loop.
+    // That is why the default is overridden with the slice conversion from `half`.
+    #[inline]
+    fn from_f32s(from: &[f32], to: &mut [Self]) {
+        to.convert_from_f32_slice(from)
+    }
 }
 
 impl FromNativeSample for Sample {
-    fn from_f16(value: f16) -> Self { Self::from(value) }
-    fn from_f32(value: f32) -> Self { Self::from(value) }
-    fn from_u32(value: u32) -> Self { Self::from(value) }
+    #[inline] fn from_f16(value: f16) -> Self { Self::from(value) }
+    #[inline] fn from_f32(value: f32) -> Self { Self::from(value) }
+    #[inline] fn from_u32(value: u32) -> Self { Self::from(value) }
 }
 
 

diff --git a/src/image/read/specific_channels.rs b/src/image/read/specific_channels.rs
@@ -12,6 +12,7 @@ use crate::image::read::layers::{ChannelsReader, ReadChannels};
 use crate::block::chunk::TileCoordinates;
 
 use std::marker::PhantomData;
+use crate::io::Read;
 
 
 /// Can be attached one more channel reader.
@@ -279,30 +280,85 @@ pub struct OptionalSampleReader<DefaultSample> {
 impl<Sample: FromNativeSample> SampleReader<Sample> {
     fn read_own_samples<'s, FullPixel>(
         &self, bytes: &'s[u8], pixels: &mut [FullPixel],
-        get_pixel: impl Fn(&mut FullPixel) -> &mut Sample
+        get_sample: impl Fn(&mut FullPixel) -> &mut Sample
     ){
         let start_index = pixels.len() * self.channel_byte_offset;
         let byte_count = pixels.len() * self.channel.sample_type.bytes_per_sample();
-        let mut own_bytes_reader = &bytes[start_index .. start_index + byte_count]; // TODO check block size somewhere
-
-        let error_msg = "error when reading from in-memory slice";
+        let mut own_bytes_reader = &mut &bytes[start_index .. start_index + byte_count]; // TODO check block size somewhere
+        let output = pixels.iter_mut().map(|pixel| get_sample(pixel));
 
         // match outside the loop to avoid matching on every single sample
         match self.channel.sample_type {
-            SampleType::F16 => for pixel in pixels.iter_mut() {
-                *get_pixel(pixel) = Sample::from_f16(f16::read(&mut own_bytes_reader).expect(error_msg));
-            },
-
-            SampleType::F32 => for pixel in pixels.iter_mut() {
-                *get_pixel(pixel) = Sample::from_f32(f32::read(&mut own_bytes_reader).expect(error_msg));
-            },
-
-            SampleType::U32 => for pixel in pixels.iter_mut() {
-                *get_pixel(pixel) = Sample::from_u32(u32::read(&mut own_bytes_reader).expect(error_msg));
-            },
+            SampleType::F16 => read_and_convert_samples_batched(
+                &mut own_bytes_reader, output,
+                Sample::from_f16s
+            ),
+
+            SampleType::F32 => read_and_convert_samples_batched(
+                &mut own_bytes_reader, output,
+                Sample::from_f32s
+            ),
+
+            SampleType::U32 => read_and_convert_samples_batched(
+                &mut own_bytes_reader, output,
+                Sample::from_u32s
+            ),
         }
 
         debug_assert!(own_bytes_reader.is_empty(), "bytes left after reading all samples");
+
+
+        /// performs something similar to
+        /// `for sample in out_samples { *sample = Sample::convert_from(f16/f32/u32::read_from_bytes(bytes)); }`
+        fn read_and_convert_samples_batched<'t, From, To>(
+            mut bytes: impl Read,
+            mut out_samples: impl ExactSizeIterator<Item=&'t mut To>,
+            convert_slice: impl Fn(&[From], &mut [To])
+        ) where From: Data + Default + Copy, To: 't + Default + Copy
+        {
+            // using a batch size of 4
+            // because that's what `half` has vectorization for,
+            // and we want the compiler to
+            // optimize away all the logic in
+            // `HalfFloatSliceExt::convert_from_f32_slice`
+
+            // the `non_upper_case_globals` lint also fires for function-local constants, not only for globals
+            #[allow(non_upper_case_globals)]
+            const batch_size: usize = 4;
+
+            let mut source_samples_batch: [From; batch_size] = Default::default();
+            let mut desired_samples_batch: [To; batch_size] = Default::default();
+
+            let total_sample_count = out_samples.len();
+            let batch_count = total_sample_count / batch_size;
+            let remaining_samples_count = total_sample_count % batch_size;
+
+            let error_msg = "error when reading from in-memory slice";
+
+            for _ in 0 .. batch_count {
+                Data::read_slice(&mut bytes, &mut source_samples_batch).expect(error_msg);
+                convert_slice(source_samples_batch.as_slice(), desired_samples_batch.as_mut_slice());
+
+                for converted_sample in desired_samples_batch {
+                    *out_samples.next().expect("less elements than calculated") = converted_sample;
+                }
+            }
+
+            if remaining_samples_count != 0 {
+                let source_samples_batch = &mut source_samples_batch[..remaining_samples_count];
+                let desired_samples_batch = &mut desired_samples_batch[..remaining_samples_count];
+
+                // TODO dedup with above
+                Data::read_slice(&mut bytes, source_samples_batch).expect(error_msg);
+                convert_slice(source_samples_batch, desired_samples_batch);
+
+                for converted_sample in desired_samples_batch {
+                    *out_samples.next().expect("less elements than calculated") = *converted_sample;
+                }
+            }
+
+            debug_assert!(out_samples.next().is_none(), "not all samples have been written");
+        }
     }
 }
 

diff --git a/src/math.rs b/src/math.rs
@@ -194,9 +194,15 @@ impl RoundingMode {
         }
     }
 
+    /// Only works for a non-negative dividend and a positive divisor; panics otherwise.
     pub(crate) fn divide<T>(self, dividend: T, divisor: T) -> T
-        where T: Copy + Add<Output = T> + Sub<Output = T> + Div<Output = T> + From<u8>
+        where T: Copy + Add<Output = T> + Sub<Output = T> + Div<Output = T> + From<u8> + std::cmp::PartialOrd
     {
+        assert!(
+            dividend >= T::from(0) && divisor >= T::from(1),
+            "division with rounding up only works for positive numbers"
+        );
+
         match self {
             RoundingMode::Up => (dividend + divisor - T::from(1_u8)) / divisor, // only works for positive numbers
             RoundingMode::Down => dividend / divisor,