From 65bbd9e82f33103dc70bcbdf1f8bed2d79dc665f Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 13 Apr 2024 11:55:11 -0700
Subject: [PATCH 01/31] misc: convert zopfli assertions to errors

---
 src/image/lodepng.rs          |  10 +-
 src/image/zopflipng/blocks.rs | 218 ++++++++++++++++++----------------
 src/image/zopflipng/cache.rs  |  20 ++--
 src/image/zopflipng/error.rs  |  44 +++++++
 src/image/zopflipng/hash.rs   |  71 ++++++-----
 src/image/zopflipng/kat.rs    |  14 ++-
 src/image/zopflipng/lz77.rs   |  35 +++---
 src/image/zopflipng/mod.rs    |   2 +
 8 files changed, 255 insertions(+), 159 deletions(-)
 create mode 100644 src/image/zopflipng/error.rs

diff --git a/src/image/lodepng.rs b/src/image/lodepng.rs
index 7834947..d85e48f 100644
--- a/src/image/lodepng.rs
+++ b/src/image/lodepng.rs
@@ -70,7 +70,7 @@ pub(crate) extern "C" fn flaca_png_deflate(
 			else { (false, ZOPFLI_MASTER_BLOCK_SIZE) };
 
 		// Crunch the part!
-		deflate_part(
+		let res = deflate_part(
 			&mut splits,
 			numiterations,
 			last_part,
@@ -79,6 +79,14 @@ pub(crate) extern "C" fn flaca_png_deflate(
 			&mut dst,
 		);
 
+		if res.is_err() {
+			// Force a panic in debug mode.
+			#[cfg(debug_assertions)] panic!("{res:?}");
+
+			// Otherwise just let lodepng know we failed.
+			return 1;
+		}
+
 		// Onward and upward!
 		i += size;
 	}
diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index 374ee50..fd87571 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -25,6 +25,7 @@ use super::{
 	zopfli_lengths_to_symbols,
 	ZOPFLI_NUM_D,
 	ZOPFLI_NUM_LL,
+	ZopfliError,
 	ZopfliOut,
 };
 
@@ -77,7 +78,7 @@ impl SplitPoints {
 	///
 	/// In terms of order-of-operations, this must be called _before_ the
 	/// second-stage LZ77 pass as it would otherwise blow away that data.
-	fn split_raw(&mut self, arr: &[u8], instart: usize) -> usize {
+	fn split_raw(&mut self, arr: &[u8], instart: usize) -> Result<usize, ZopfliError> {
 		// Populate an LZ77 store from a greedy pass. This results in better
 		// block choices than a full optimal pass.
 		let mut store = LZ77Store::new();
@@ -86,10 +87,10 @@ impl SplitPoints {
 			instart,
 			&mut store,
 			None,
-		));
+		))?;
 
 		// Do an LZ77 pass.
-		let len = self.split_lz77(&store);
+		let len = self.split_lz77(&store)?;
 
 		// Find the corresponding uncompressed positions.
 		if 0 < len && len <= MAX_SPLIT_POINTS {
@@ -99,14 +100,14 @@ impl SplitPoints {
 				if i == self.slice2[j] {
 					self.slice1[j] = pos;
 					j += 1;
-					if j == len { return len; }
+					if j == len { return Ok(len); }
 				}
 				pos += e.length() as usize;
 			}
 
 			unreachable!();
 		}
-		else { len }
+		else { Ok(len) }
 	}
 
 	#[allow(clippy::cast_precision_loss)]
@@ -114,9 +115,9 @@ impl SplitPoints {
 	///
 	/// This sets the LZ77 split points according to convoluted cost
 	/// evaluations.
-	fn split_lz77(&mut self, store: &LZ77Store) -> usize {
+	fn split_lz77(&mut self, store: &LZ77Store) -> Result<usize, ZopfliError> {
 		// This won't work on tiny files.
-		if store.len() < MINIMUM_SPLIT_DISTANCE { return 0; }
+		if store.len() < MINIMUM_SPLIT_DISTANCE { return Ok(0); }
 
 		// Get started!
 		self.done.clear();
@@ -125,11 +126,13 @@ impl SplitPoints {
 		let mut last = 0;
 		let mut len = 0;
 		loop {
-			let (llpos, llcost) = find_minimum_cost(store, lstart + 1, lend);
-			assert!(lstart < llpos && llpos < lend);
+			let (llpos, llcost) = find_minimum_cost(store, lstart + 1, lend)?;
+			if llpos <= lstart || llpos >= lend {
+				return Err(ZopfliError::SplitRange(lstart, lend, llpos));
+			}
 
 			// Ignore points we've already covered.
-			if llpos == lstart + 1 || (calculate_block_size_auto_type(store, lstart, lend) as f64) < llcost {
+			if llpos == lstart + 1 || (calculate_block_size_auto_type(store, lstart, lend)? as f64) < llcost {
 				self.done.insert(lstart);
 			}
 			else {
@@ -157,7 +160,7 @@ impl SplitPoints {
 			) { break; }
 		}
 
-		len
+		Ok(len)
 	}
 
 	#[allow(unsafe_code)]
@@ -172,9 +175,9 @@ impl SplitPoints {
 		instart: usize,
 		store: &mut LZ77Store,
 		squeeze: &mut SqueezeCache,
-	) -> &[usize] {
+	) -> Result<&[usize], ZopfliError> {
 		// Start by splitting uncompressed.
-		let limit = self.split_raw(arr, instart).min(MAX_SPLIT_POINTS);
+		let limit = self.split_raw(arr, instart)?.min(MAX_SPLIT_POINTS);
 
 		let mut cost1 = 0;
 		let mut store2 = LZ77Store::new();
@@ -194,8 +197,8 @@ impl SplitPoints {
 				&mut store2,
 				&mut store3,
 				squeeze,
-			);
-			cost1 += calculate_block_size_auto_type(&store2, 0, store2.len());
+			)?;
+			cost1 += calculate_block_size_auto_type(&store2, 0, store2.len())?;
 
 			// Append its data to our main store.
 			store.append(&store2);
@@ -210,19 +213,19 @@ impl SplitPoints {
 			// Move slice2 over to slice1 so we can repopulate slice2.
 			self.slice1.copy_from_slice(self.slice2.as_slice());
 
-			let limit2 = self.split_lz77(store).min(MAX_SPLIT_POINTS);
+			let limit2 = self.split_lz77(store)?.min(MAX_SPLIT_POINTS);
 			let mut cost2 = 0;
 			for i in 0..=limit2 {
 				let start = if i == 0 { 0 } else { self.slice2[i - 1] };
 				let end = if i < limit2 { self.slice2[i] } else { store.len() };
-				cost2 += calculate_block_size_auto_type(store, start, end);
+				cost2 += calculate_block_size_auto_type(store, start, end)?;
 			}
 
 			// It's better!
-			if cost2 < cost1 { &self.slice2[..limit2] }
-			else { &self.slice1[..limit] }
+			if cost2 < cost1 { Ok(&self.slice2[..limit2]) }
+			else { Ok(&self.slice1[..limit]) }
 		}
-		else { &self.slice2[..limit] }
+		else { Ok(&self.slice2[..limit]) }
 	}
 }
 
@@ -242,7 +245,7 @@ pub(crate) fn deflate_part(
 	arr: &[u8],
 	instart: usize,
 	out: &mut ZopfliOut,
-) {
+) -> Result<(), ZopfliError> {
 	// Find the split points.
 	let mut squeeze = SqueezeCache::new();
 	let mut store = LZ77Store::new();
@@ -252,7 +255,7 @@ pub(crate) fn deflate_part(
 		instart,
 		&mut store,
 		&mut squeeze,
-	);
+	)?;
 
 	// Write the data!
 	for i in 0..=best.len() {
@@ -267,8 +270,10 @@ pub(crate) fn deflate_part(
 			end,
 			0,
 			out,
-		);
+		)?;
 	}
+
+	Ok(())
 }
 
 
@@ -382,7 +387,7 @@ fn add_lz77_block(
 	lend: usize,
 	expected_data_size: usize,
 	out: &mut ZopfliOut,
-) {
+) -> Result<(), ZopfliError> {
 	// Uncompressed blocks are easy!
 	if matches!(btype, BlockType::Uncompressed) {
 		let length = get_lz77_byte_range(store, lstart, lend);
@@ -390,7 +395,7 @@ fn add_lz77_block(
 			if lstart >= lend { 0 }
 			else { store.entries[lstart].pos };
 		out.add_uncompressed_block(last_block, arr, pos, pos + length);
-		return;
+		return Ok(());
 	}
 
 	// Add some bits.
@@ -410,8 +415,8 @@ fn add_lz77_block(
 				lend,
 				&mut ll_lengths,
 				&mut d_lengths,
-			);
-			add_dynamic_tree(&ll_lengths, &d_lengths, out);
+			)?;
+			add_dynamic_tree(&ll_lengths, &d_lengths, out)?;
 			(ll_lengths, d_lengths)
 		};
 
@@ -426,10 +431,11 @@ fn add_lz77_block(
 		store, lstart, lend, expected_data_size,
 		&ll_symbols, &ll_lengths, &d_symbols, &d_lengths,
 		out
-	);
+	)?;
 
 	// Finish up by writting the end symbol.
 	out.add_huffman_bits(ll_symbols[256], ll_lengths[256]);
+	Ok(())
 }
 
 #[allow(
@@ -450,19 +456,19 @@ fn add_lz77_block_auto_type(
 	lend: usize,
 	expected_data_size: usize,
 	out: &mut ZopfliOut
-) {
+) -> Result<(), ZopfliError> {
 	// If the block is empty, we can assume a fixed-tree layout.
 	if lstart >= lend {
 		out.add_bits(u32::from(last_block), 1);
 		out.add_bits(1, 2);
 		out.add_bits(0, 7);
-		return;
+		return Ok(());
 	}
 
 	// Calculate the three costs.
-	let uncompressed_cost = calculate_block_size(store, lstart, lend, BlockType::Uncompressed);
-	let fixed_cost = calculate_block_size(store, lstart, lend, BlockType::Fixed);
-	let dynamic_cost = calculate_block_size(store, lstart, lend, BlockType::Dynamic);
+	let uncompressed_cost = calculate_block_size(store, lstart, lend, BlockType::Uncompressed)?;
+	let fixed_cost = calculate_block_size(store, lstart, lend, BlockType::Fixed)?;
+	let dynamic_cost = calculate_block_size(store, lstart, lend, BlockType::Dynamic)?;
 
 	// Fixed stores are only useful up to a point; we can skip the overhead
 	// if the store is big or the dynamic cost estimate is unimpressive.
@@ -472,9 +478,9 @@ fn add_lz77_block_auto_type(
 			store, squeeze, uncompressed_cost, dynamic_cost,
 			arr, lstart, lend, last_block,
 			expected_data_size, out,
-		)
+		)?
 	{
-		return;
+		return Ok(());
 	}
 
 	// Which type?
@@ -487,7 +493,7 @@ fn add_lz77_block_auto_type(
 	add_lz77_block(
 		btype, last_block, store, arr, lstart, lend,
 		expected_data_size, out,
-	);
+	)
 }
 
 #[inline(never)]
@@ -498,10 +504,11 @@ fn add_dynamic_tree(
 	ll_lengths: &[u32; ZOPFLI_NUM_LL],
 	d_lengths: &[u32; ZOPFLI_NUM_D],
 	out: &mut ZopfliOut
-) {
+) -> Result<(), ZopfliError> {
 	// Find the index that produces the best size.
-	let (i, _) = calculate_tree_size(ll_lengths, d_lengths);
-	encode_tree(ll_lengths, d_lengths, i, Some(out));
+	let (i, _) = calculate_tree_size(ll_lengths, d_lengths)?;
+	encode_tree(ll_lengths, d_lengths, i, Some(out))?;
+	Ok(())
 }
 
 #[allow(
@@ -523,13 +530,16 @@ fn add_lz77_data(
 	d_symbols: &[u32; ZOPFLI_NUM_D],
 	d_lengths: &[u32; ZOPFLI_NUM_D],
 	out: &mut ZopfliOut
-) {
+) -> Result<(), ZopfliError> {
 	let mut test_size = 0;
 	for e in &store.entries[lstart..lend] {
 		// Length only.
 		if e.dist <= 0 {
-			assert!((e.litlen as u16) < 256);
-			assert!(ll_lengths[e.litlen as usize] > 0);
+			if (e.litlen as u16) >= 256 {
+				return Err(ZopfliError::LitLenLiteral(e.litlen as u16));
+			}
+			if ll_lengths[e.litlen as usize] == 0 { return Err(ZopfliError::NoLength); }
+
 			out.add_huffman_bits(
 				ll_symbols[e.litlen as usize],
 				ll_lengths[e.litlen as usize],
@@ -539,7 +549,8 @@ fn add_lz77_data(
 		// Length and distance.
 		else {
 			let (symbol, bits, value) = LENGTH_SYMBOLS_BITS_VALUES[e.litlen as usize];
-			assert!(ll_lengths[symbol as usize] > 0);
+			if ll_lengths[symbol as usize] == 0 { return Err(ZopfliError::NoLength); }
+
 			out.add_huffman_bits(
 				ll_symbols[symbol as usize],
 				ll_lengths[symbol as usize],
@@ -547,7 +558,7 @@ fn add_lz77_data(
 			out.add_bits(u32::from(value), u32::from(bits));
 
 			// Now the distance bits.
-			assert!(d_lengths[e.d_symbol as usize] > 0);
+			if d_lengths[e.d_symbol as usize] == 0 { return Err(ZopfliError::NoDistance); }
 			out.add_huffman_bits(
 				d_symbols[e.d_symbol as usize],
 				d_lengths[e.d_symbol as usize],
@@ -561,7 +572,8 @@ fn add_lz77_data(
 		}
 	}
 
-	assert!(expected_data_size == 0 || test_size == expected_data_size);
+	if expected_data_size == 0 || test_size == expected_data_size { Ok(()) }
+	else { Err(ZopfliError::Write) }
 }
 
 /// # Calculate Block Size (in Bits).
@@ -570,23 +582,23 @@ fn calculate_block_size(
 	lstart: usize,
 	lend: usize,
 	btype: BlockType,
-) -> usize {
+) -> Result<usize, ZopfliError> {
 	match btype {
 		BlockType::Uncompressed => {
 			let length = get_lz77_byte_range(store, lstart, lend);
 			let blocks = length.div_ceil(65_535);
 
 			// Blocks larger than u16::MAX need to be split.
-			blocks * 40 + length * 8
+			Ok(blocks * 40 + length * 8)
 		},
 		BlockType::Fixed =>
-			calculate_block_symbol_size(
+			Ok(calculate_block_symbol_size(
 				&FIXED_TREE_LL,
 				&FIXED_TREE_D,
 				store,
 				lstart,
 				lend,
-			) + 3,
+			)? + 3),
 		BlockType::Dynamic => {
 			let mut ll_lengths = [0_u32; ZOPFLI_NUM_LL];
 			let mut d_lengths = [0_u32; ZOPFLI_NUM_D];
@@ -606,24 +618,24 @@ fn calculate_block_size_auto_type(
 	store: &LZ77Store,
 	lstart: usize,
 	lend: usize,
-) -> usize {
-	let uncompressed_cost = calculate_block_size(store, lstart, lend, BlockType::Uncompressed);
+) -> Result<usize, ZopfliError> {
+	let uncompressed_cost = calculate_block_size(store, lstart, lend, BlockType::Uncompressed)?;
 
 	// We can skip the expensive fixed-cost calculations for large blocks since
 	// they're unlikely ever to use it.
 	let fixed_cost =
 		if 1000 < store.len() { uncompressed_cost }
-		else { calculate_block_size(store, lstart, lend, BlockType::Fixed) };
+		else { calculate_block_size(store, lstart, lend, BlockType::Fixed)? };
 
-	let dynamic_cost = calculate_block_size(store, lstart, lend, BlockType::Dynamic);
+	let dynamic_cost = calculate_block_size(store, lstart, lend, BlockType::Dynamic)?;
 
 	// If uncompressed is better than everything, return it.
 	if uncompressed_cost < fixed_cost && uncompressed_cost < dynamic_cost {
-		uncompressed_cost
+		Ok(uncompressed_cost)
 	}
 	// Otherwise choose the smaller of fixed and dynamic.
-	else if fixed_cost < dynamic_cost { fixed_cost }
-	else { dynamic_cost }
+	else if fixed_cost < dynamic_cost { Ok(fixed_cost) }
+	else { Ok(dynamic_cost) }
 }
 
 /// # Calculate Block Symbol Size w/ Histogram.
@@ -633,19 +645,19 @@ fn calculate_block_symbol_size(
 	store: &LZ77Store,
 	lstart: usize,
 	lend: usize,
-) -> usize {
+) -> Result<usize, ZopfliError> {
 	if lstart + ZOPFLI_NUM_LL * 3 > lend {
-		calculate_block_symbol_size_small(
+		Ok(calculate_block_symbol_size_small(
 			ll_lengths,
 			d_lengths,
 			store,
 			lstart,
 			lend,
-		)
+		))
 	}
 	else {
-		let (ll_counts, d_counts) = store.histogram(lstart, lend);
-		calculate_block_symbol_size_given_counts(
+		let (ll_counts, d_counts) = store.histogram(lstart, lend)?;
+		Ok(calculate_block_symbol_size_given_counts(
 			&ll_counts,
 			&d_counts,
 			ll_lengths,
@@ -653,7 +665,7 @@ fn calculate_block_symbol_size(
 			store,
 			lstart,
 			lend,
-		)
+		))
 	}
 }
 
@@ -736,12 +748,12 @@ fn calculate_block_symbol_size_small(
 fn calculate_tree_size(
 	ll_lengths: &[u32; ZOPFLI_NUM_LL],
 	d_lengths: &[u32; ZOPFLI_NUM_D],
-) -> (u8, usize) {
+) -> Result<(u8, usize), ZopfliError> {
 	let mut best_size = 0;
 	let mut best_idx = 0;
 
 	for i in 0..8 {
-		let size = encode_tree(ll_lengths, d_lengths, i, None);
+		let size = encode_tree(ll_lengths, d_lengths, i, None)?;
 
 		if best_size == 0 || size < best_size {
 			best_size = size;
@@ -749,7 +761,7 @@ fn calculate_tree_size(
 		}
 	}
 
-	(best_idx, best_size)
+	Ok((best_idx, best_size))
 }
 
 #[allow(
@@ -771,7 +783,7 @@ fn encode_tree(
 	d_lengths: &[u32; ZOPFLI_NUM_D],
 	extra: u8,
 	out: Option<&mut ZopfliOut>,
-) -> usize {
+) -> Result<usize, ZopfliError> {
 	// Discombobulated cl_length/cl_count indexes.
 	const ORDER: [usize; 19] = [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15];
 
@@ -874,7 +886,7 @@ fn encode_tree(
 	}
 
 	// Update the lengths and symbols given the counts.
-	zopfli_length_limited_code_lengths::<7, 19>(&cl_counts, &mut cl_lengths);
+	zopfli_length_limited_code_lengths::<7, 19>(&cl_counts, &mut cl_lengths)?;
 
 	// Find the last non-zero index of the counts table. Note that every ORDER
 	// value is between 0..19, so we can get_unchecked().
@@ -921,7 +933,9 @@ fn encode_tree(
 	}
 	size += cl_counts[16] * 2; // Extra bits.
 	size += cl_counts[17] * 3;
-	size +  cl_counts[18] * 7
+	size += cl_counts[18] * 7;
+
+	Ok(size)
 }
 
 #[allow(clippy::similar_names)]
@@ -964,7 +978,7 @@ fn find_minimum_cost(
 	store: &LZ77Store,
 	mut start: usize,
 	mut end: usize,
-) -> (usize, f64) {
+) -> Result<(usize, f64), ZopfliError> {
 	// Keep track of the original start/end points.
 	let split_start = start - 1;
 	let split_end = end;
@@ -975,13 +989,13 @@ fn find_minimum_cost(
 	// Small chunks don't need much.
 	if end - start < 1024 {
 		for i in start..end {
-			let cost = split_cost(store, split_start, i, split_end);
+			let cost = split_cost(store, split_start, i, split_end)?;
 			if cost < best_cost {
 				best_cost = cost;
 				best_idx = i;
 			}
 		}
-		return (best_idx, best_cost);
+		return Ok((best_idx, best_cost));
 	}
 
 	// Divide and conquer.
@@ -993,7 +1007,7 @@ fn find_minimum_cost(
 			*pp = start + (i + 1) * ((end - start).wrapping_div(MINIMUM_SPLIT_DISTANCE));
 			let line_cost =
 				if best_idx == *pp { last_best_cost }
-				else { split_cost(store, split_start, *pp, split_end) };
+				else { split_cost(store, split_start, *pp, split_end)? };
 
 			if i == 0 || line_cost < best_cost {
 				best_cost = line_cost;
@@ -1012,7 +1026,7 @@ fn find_minimum_cost(
 		last_best_cost = best_cost;
 	}
 
-	(best_idx, last_best_cost)
+	Ok((best_idx, last_best_cost))
 }
 
 /// # Calculate the Bit Lengths for Dynamic Block Symbols.
@@ -1029,13 +1043,13 @@ fn get_dynamic_lengths(
 	lend: usize,
 	ll_lengths: &mut [u32; ZOPFLI_NUM_LL],
 	d_lengths: &mut [u32; ZOPFLI_NUM_D],
-) -> usize {
+) -> Result<usize, ZopfliError> {
 	// Populate some counts.
-	let (mut ll_counts, d_counts) = store.histogram(lstart, lend);
+	let (mut ll_counts, d_counts) = store.histogram(lstart, lend)?;
 	ll_counts[256] = 1;
 
-	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_LL>(&ll_counts, ll_lengths);
-	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_D>(&d_counts, d_lengths);
+	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_LL>(&ll_counts, ll_lengths)?;
+	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_D>(&d_counts, d_lengths)?;
 
 	patch_distance_codes(d_lengths);
 	try_optimize_huffman_for_rle(
@@ -1081,9 +1095,9 @@ fn lz77_optimal(
 	store: &mut LZ77Store,
 	scratch_store: &mut LZ77Store,
 	squeeze: &mut SqueezeCache,
-) {
+) -> Result<(), ZopfliError> {
 	// Easy abort.
-	if instart >= arr.len() { return; }
+	if instart >= arr.len() { return Ok(()); }
 
 	// Reset the main cache for the current blocksize.
 	let blocksize = arr.len() - instart;
@@ -1098,7 +1112,7 @@ fn lz77_optimal(
 			instart,
 			scratch_store,
 			Some(instart),
-		);
+		)?;
 
 		// Create new stats with the store (updated by the greedy pass).
 		let mut current_stats = SymbolStats::new();
@@ -1126,7 +1140,7 @@ fn lz77_optimal(
 				Some(&current_stats),
 				squeeze,
 				scratch_store,
-			);
+			)?;
 
 			// This is the cost we actually care about.
 			let current_cost = calculate_block_size(
@@ -1134,7 +1148,7 @@ fn lz77_optimal(
 				0,
 				scratch_store.len(),
 				BlockType::Dynamic,
-			);
+			)?;
 
 			// We have a new best!
 			if current_cost < best_cost {
@@ -1166,7 +1180,9 @@ fn lz77_optimal(
 
 			last_cost = current_cost;
 		}
-	});
+
+		Ok(())
+	})
 }
 
 #[allow(clippy::cast_precision_loss)]
@@ -1174,11 +1190,11 @@ fn lz77_optimal(
 ///
 /// Return the sum of the estimated costs of the left and right sections of the
 /// data.
-fn split_cost(store: &LZ77Store, start: usize, mid: usize, end: usize) -> f64 {
-	(
-		calculate_block_size_auto_type(store, start, mid) +
-		calculate_block_size_auto_type(store, mid, end)
-	) as f64
+fn split_cost(store: &LZ77Store, start: usize, mid: usize, end: usize) -> Result<f64, ZopfliError> {
+	Ok((
+		calculate_block_size_auto_type(store, start, mid)? +
+		calculate_block_size_auto_type(store, mid, end)?
+	) as f64)
 }
 
 #[allow(unsafe_code, clippy::integer_division, clippy::cast_sign_loss)]
@@ -1287,7 +1303,7 @@ fn try_lz77_expensive_fixed(
 	last_block: bool,
 	expected_data_size: usize,
 	out: &mut ZopfliOut,
-) -> bool {
+) -> Result<bool, ZopfliError> {
 	let mut fixed_store = LZ77Store::new();
 	// Safety: the split points are asserted during their creation.
 	debug_assert!(lstart < store.entries.len());
@@ -1306,7 +1322,7 @@ fn try_lz77_expensive_fixed(
 		None,
 		squeeze,
 		&mut fixed_store,
-	));
+	))?;
 
 	// Find the resulting cost.
 	let fixed_cost = calculate_block_size(
@@ -1314,7 +1330,7 @@ fn try_lz77_expensive_fixed(
 		0,
 		fixed_store.len(),
 		BlockType::Fixed,
-	);
+	)?;
 
 	// If it is better than dynamic, and uncompressed isn't better than both
 	// fixed and dynamic, it's the best and worth writing!
@@ -1322,10 +1338,10 @@ fn try_lz77_expensive_fixed(
 		add_lz77_block(
 			BlockType::Fixed, last_block, &fixed_store, arr, 0, fixed_store.len(),
 			expected_data_size, out,
-		);
-		true
+		)?;
+		Ok(true)
 	}
-	else { false }
+	else { Ok(false) }
 }
 
 /// # Try Huffman RLE Optimization.
@@ -1343,8 +1359,8 @@ fn try_optimize_huffman_for_rle(
 	d_counts: &[usize; ZOPFLI_NUM_D],
 	ll_lengths: &mut [u32; ZOPFLI_NUM_LL],
 	d_lengths: &mut [u32; ZOPFLI_NUM_D],
-) -> usize {
-	let (_, treesize) = calculate_tree_size(ll_lengths, d_lengths);
+) -> Result<usize, ZopfliError> {
+	let (_, treesize) = calculate_tree_size(ll_lengths, d_lengths)?;
 
 	let datasize = calculate_block_symbol_size_given_counts(
 		ll_counts,
@@ -1363,11 +1379,11 @@ fn try_optimize_huffman_for_rle(
 	let mut d_counts2 = *d_counts;
 	optimize_huffman_for_rle(&mut ll_counts2);
 	optimize_huffman_for_rle(&mut d_counts2);
-	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_LL>(&ll_counts2, &mut ll_lengths2);
-	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_D>(&d_counts2, &mut d_lengths2);
+	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_LL>(&ll_counts2, &mut ll_lengths2)?;
+	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_D>(&d_counts2, &mut d_lengths2)?;
 	patch_distance_codes(&mut d_lengths2);
 
-	let (_, treesize2) = calculate_tree_size(&ll_lengths2, &d_lengths2);
+	let (_, treesize2) = calculate_tree_size(&ll_lengths2, &d_lengths2)?;
 	let datasize2 = calculate_block_symbol_size_given_counts(
 		ll_counts,
 		d_counts,
@@ -1380,11 +1396,11 @@ fn try_optimize_huffman_for_rle(
 
 	let sum = treesize + datasize;
 	let sum2 = treesize2 + datasize2;
-	if sum <= sum2 { sum }
+	if sum <= sum2 { Ok(sum) }
 	else {
 		ll_lengths.copy_from_slice(ll_lengths2.as_slice());
 		d_lengths.copy_from_slice(d_lengths2.as_slice());
-		sum2
+		Ok(sum2)
 	}
 }
 
diff --git a/src/image/zopflipng/cache.rs b/src/image/zopflipng/cache.rs
index 3a91795..fb6cc10 100644
--- a/src/image/zopflipng/cache.rs
+++ b/src/image/zopflipng/cache.rs
@@ -7,6 +7,7 @@ the thread-local LMC static.
 
 use std::cell::RefCell;
 use super::{
+	ZopfliError,
 	ZOPFLI_MIN_MATCH,
 	ZOPFLI_MAX_MATCH,
 };
@@ -101,10 +102,10 @@ impl MatchCache {
 		sublen: &mut [u16],
 		distance: &mut u16,
 		length: &mut u16,
-	) -> bool {
+	) -> Result<bool, ZopfliError> {
 		// If we have no distance, we have no cache.
 		let (cache_len, cache_dist) = self.ld(pos);
-		if cache_len != 0 && cache_dist == 0 { return false; }
+		if cache_len != 0 && cache_dist == 0 { return Ok(false); }
 
 		// Find the max sublength once, if ever.
 		let maxlength =
@@ -138,13 +139,17 @@ impl MatchCache {
 
 					// Sanity check: make sure the sublength distance at length
 					// matches the redundantly-cached distance.
-					if *limit == ZOPFLI_MAX_MATCH && usize::from(*length) >= ZOPFLI_MIN_MATCH {
-						assert_eq!(*distance, cache_dist);
+					if
+						*limit == ZOPFLI_MAX_MATCH &&
+						usize::from(*length) >= ZOPFLI_MIN_MATCH &&
+						*distance != cache_dist
+					{
+						return Err(ZopfliError::LMCDistance);
 					}
 				}
 
 				// We did stuff!
-				return true;
+				return Ok(true);
 			}
 
 			// Replace the limit with our sad cached length.
@@ -152,7 +157,7 @@ impl MatchCache {
 		}
 
 		// Nothing happened.
-		false
+		Ok(false)
 	}
 
 	/// # Get Length and Distance.
@@ -322,7 +327,8 @@ impl SqueezeCache {
 		let mut idx = costs.len() - 1;
 		while 0 < idx && idx < costs.len() {
 			let v = costs[idx].1;
-			assert!((1..=ZOPFLI_MAX_MATCH as u16).contains(&v));
+			debug_assert!((1..=ZOPFLI_MAX_MATCH as u16).contains(&v));
+			if ! (1..=ZOPFLI_MAX_MATCH as u16).contains(&v) { return None; }
 
 			// Only lengths of at least ZOPFLI_MIN_MATCH count as lengths
 			// after tracing.
diff --git a/src/image/zopflipng/error.rs b/src/image/zopflipng/error.rs
new file mode 100644
index 0000000..8c7120e
--- /dev/null
+++ b/src/image/zopflipng/error.rs
@@ -0,0 +1,44 @@
+/*!
+# Flaca: Zopflipng Errors.
+*/
+
+use std::fmt;
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum ZopfliError {
+	HistogramRange,
+	LeafSize(usize, usize),
+	LitLen(u16),
+	LitLenLiteral(u16),
+	LMCDistance,
+	MatchRange(usize, usize, u16),
+	PathLength,
+	NoDistance,
+	NoLength,
+	SplitRange(usize, usize, usize),
+	SublenLength,
+	Write,
+}
+
+impl fmt::Display for ZopfliError {
+	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+		f.write_str("(zopfli) ")?;
+
+		match self {
+			Self::HistogramRange => f.write_str("invalid histogram range"),
+			Self::LeafSize(bits, leaves) => f.write_fmt(format_args!("{bits} insufficient for {leaves}")),
+			Self::LitLen(n) => f.write_fmt(format_args!("invalid LitLen: {n}")),
+			Self::LitLenLiteral(n) => f.write_fmt(format_args!("invalid LitLen literal: {n}")),
+			Self::LMCDistance => f.write_str("incorrect cache distance found"),
+			Self::MatchRange(start, end, pos) => f.write_fmt(format_args!("invalid match length for {start}..{end}: {pos}")),
+			Self::NoDistance => f.write_str("expected non-zero distance"),
+			Self::NoLength => f.write_str("expected non-zero length"),
+			Self::PathLength => f.write_str("path length/find mismatch"),
+			Self::SplitRange(start, end, pos) => f.write_fmt(format_args!("invalid split position for {start}..{end}: {pos}")),
+			Self::SublenLength => f.write_str("invalid sublen slice length"),
+			Self::Write => f.write_str("failed to write output"),
+		}
+	}
+}
+
+impl std::error::Error for ZopfliError {}
diff --git a/src/image/zopflipng/hash.rs b/src/image/zopflipng/hash.rs
index 44c9be4..46d053a 100644
--- a/src/image/zopflipng/hash.rs
+++ b/src/image/zopflipng/hash.rs
@@ -30,6 +30,7 @@ use super::{
 	SqueezeCache,
 	stats::SymbolStats,
 	SUBLEN_LEN,
+	ZopfliError,
 	ZOPFLI_MAX_MATCH,
 	ZOPFLI_MIN_MATCH,
 };
@@ -215,13 +216,14 @@ impl ZopfliHash {
 		stats: Option<&SymbolStats>,
 		squeeze: &mut SqueezeCache,
 		store: &mut LZ77Store,
-	) {
+	) -> Result<(), ZopfliError> {
 		let costs = squeeze.reset_costs();
-		let tmp = self.get_best_lengths(arr, instart, stats, costs);
+		let tmp = self.get_best_lengths(arr, instart, stats, costs)?;
 		debug_assert!(tmp < 1E30);
 		if let Some(paths) = squeeze.trace_paths() {
-			self.follow_paths(arr, instart, paths, store);
+			self.follow_paths(arr, instart, paths, store)?;
 		}
+		Ok(())
 	}
 
 	#[allow(
@@ -247,7 +249,7 @@ impl ZopfliHash {
 		instart: usize,
 		stats: Option<&SymbolStats>,
 		costs: &mut [(f32, u16)],
-	) -> f64 {
+	) -> Result<f64, ZopfliError> {
 		// Reset and warm the hash.
 		self.reset(arr, instart);
 
@@ -287,7 +289,7 @@ impl ZopfliHash {
 				&mut distance,
 				&mut length,
 				Some(instart),
-			);
+			)?;
 
 			// Literal. (Note: this condition should never not be true, but it
 			// doesn't hurt too much to be extra sure!)
@@ -338,7 +340,7 @@ impl ZopfliHash {
 
 		// Return the final cost!
 		debug_assert!(0.0 <= costs[costs.len() - 1].0);
-		f64::from(costs[costs.len() - 1].0)
+		Ok(f64::from(costs[costs.len() - 1].0))
 	}
 
 	#[allow(clippy::cast_possible_truncation)]
@@ -417,7 +419,7 @@ impl ZopfliHash {
 		instart: usize,
 		store: &mut LZ77Store,
 		cache: Option<usize>,
-	) {
+	) -> Result<(), ZopfliError> {
 		// Reset the hash.
 		self.reset(arr, instart);
 
@@ -444,7 +446,7 @@ impl ZopfliHash {
 				&mut distance,
 				&mut length,
 				cache,
-			);
+			)?;
 
 			// Lazy matching.
 			let length_score = get_length_score(length, distance);
@@ -460,7 +462,7 @@ impl ZopfliHash {
 						u16::from(unsafe { *arr.get_unchecked(i - 1) }),
 						0,
 						i - 1,
-					);
+					)?;
 					if length_score >= ZOPFLI_MIN_MATCH as u16 && length < ZOPFLI_MAX_MATCH as u16 {
 						match_available = true;
 						prev_length = length;
@@ -476,7 +478,7 @@ impl ZopfliHash {
 					distance = prev_distance;
 
 					// Write the values!
-					store.push(length, distance, i - 1);
+					store.push(length, distance, i - 1)?;
 
 					// Update the hash up through length and increment the loop
 					// position accordingly.
@@ -502,13 +504,13 @@ impl ZopfliHash {
 
 			// Write the current length/distance.
 			if length_score >= ZOPFLI_MIN_MATCH as u16 {
-				store.push(length, distance, i);
+				store.push(length, distance, i)?;
 			}
 			// Write from the source with no distance and reset the length to
 			// one.
 			else {
 				length = 1;
-				store.push(u16::from(arr[i]), 0, i);
+				store.push(u16::from(arr[i]), 0, i)?;
 			}
 
 			// Update the hash up through length and increment the loop
@@ -520,6 +522,8 @@ impl ZopfliHash {
 
 			i += 1;
 		}
+
+		Ok(())
 	}
 
 	#[allow(clippy::cast_possible_truncation)]
@@ -533,9 +537,9 @@ impl ZopfliHash {
 		instart: usize,
 		paths: &[u16],
 		store: &mut LZ77Store,
-	) {
+	) -> Result<(), ZopfliError> {
 		// Easy abort.
-		if instart >= arr.len() { return; }
+		if instart >= arr.len() { return Ok(()); }
 
 		// Reset the hash.
 		self.reset(arr, instart);
@@ -560,14 +564,16 @@ impl ZopfliHash {
 					&mut dist,
 					&mut test_length,
 					Some(instart),
-				);
+				)?;
 
 				// This logic is so screwy; I hesitate to make this a debug
 				// assertion!
-				assert!(! (test_length != length && length > 2 && test_length > 2));
+				if test_length != length && length > 2 && test_length > 2 {
+					return Err(ZopfliError::PathLength);
+				}
 
 				// Add it to the store.
-				store.push(length, dist, i);
+				store.push(length, dist, i)?;
 
 				// Hash the rest of the match.
 				for _ in 1..usize::from(length) {
@@ -577,11 +583,13 @@ impl ZopfliHash {
 			}
 			// Add it to the store.
 			else {
-				store.push(u16::from(arr[i]), 0, i);
+				store.push(u16::from(arr[i]), 0, i)?;
 			}
 
 			i += 1;
 		}
+
+		Ok(())
 	}
 }
 
@@ -604,7 +612,7 @@ impl ZopfliHash {
 		distance: &mut u16,
 		length: &mut u16,
 		cache: Option<usize>,
-	) {
+	) -> Result<(), ZopfliError> {
 		// Check the longest match cache first!
 		if let Some(blockstart) = cache {
 			if CACHE.with_borrow(|c| c.find(
@@ -613,9 +621,9 @@ impl ZopfliHash {
 				sublen,
 				distance,
 				length,
-			)) {
-				assert!(pos + usize::from(*length) <= arr.len());
-				return;
+			))? {
+				if pos + usize::from(*length) <= arr.len() { return Ok(()); }
+				return Err(ZopfliError::MatchRange(pos, arr.len(), *length));
 			}
 		}
 
@@ -627,7 +635,7 @@ impl ZopfliHash {
 		if pos + ZOPFLI_MIN_MATCH > arr.len() {
 			*length = 0;
 			*distance = 0;
-			return;
+			return Ok(());
 		}
 
 		// Cap the limit to fit if needed. Note that limit will always be at
@@ -635,7 +643,7 @@ impl ZopfliHash {
 		if pos + limit > arr.len() { limit = arr.len() - pos; }
 
 		// Calculate the best distance and length.
-		let (bestdist, bestlength) = self.find_loop(arr, pos, limit, sublen);
+		let (bestdist, bestlength) = self.find_loop(arr, pos, limit, sublen)?;
 
 		// Cache the results for next time, maybe.
 		if let Some(blockstart) = cache {
@@ -649,7 +657,10 @@ impl ZopfliHash {
 		// Update the values.
 		*distance = bestdist;
 		*length = bestlength;
-		assert!(pos + usize::from(*length) <= arr.len());
+		if pos + usize::from(*length) <= arr.len() { Ok(()) }
+		else {
+			Err(ZopfliError::MatchRange(pos, arr.len(), *length))
+		}
 	}
 
 	#[allow(
@@ -669,13 +680,15 @@ impl ZopfliHash {
 		pos: usize,
 		limit: usize,
 		sublen: &mut [u16],
-	) -> (u16, u16) {
+	) -> Result<(u16, u16), ZopfliError> {
 		// This is asserted by find() too, but it's a good reminder.
 		debug_assert!(limit <= ZOPFLI_MAX_MATCH);
 
 		// Help the compiler understand sublen has a fixed size if provided.
 		// (We can't do an Option<Array> because it's too big for Copy.)
-		assert!(sublen.is_empty() || sublen.len() == ZOPFLI_MAX_MATCH + 1);
+		if ! sublen.is_empty() && sublen.len() != ZOPFLI_MAX_MATCH + 1 {
+			return Err(ZopfliError::SublenLength);
+		}
 
 		let hpos = pos & ZOPFLI_WINDOW_MASK;
 
@@ -806,8 +819,8 @@ impl ZopfliHash {
 		} // Thus concludes the long-ass loop!
 
 		// Return the distance and length values.
-		if bestlength <= limit { (bestdist as u16, bestlength as u16) }
-		else { (0, 1) }
+		if bestlength <= limit { Ok((bestdist as u16, bestlength as u16)) }
+		else { Ok((0, 1)) }
 	}
 }
 
diff --git a/src/image/zopflipng/kat.rs b/src/image/zopflipng/kat.rs
index 271e90d..f219a26 100644
--- a/src/image/zopflipng/kat.rs
+++ b/src/image/zopflipng/kat.rs
@@ -14,6 +14,7 @@ use std::{
 	cmp::Ordering,
 	mem::MaybeUninit,
 };
+use super::ZopfliError;
 
 
 
@@ -36,7 +37,7 @@ thread_local!(
 pub(crate) fn zopfli_length_limited_code_lengths<const MAXBITS: usize, const SIZE: usize>(
 	frequencies: &[usize; SIZE],
 	bitlengths: &mut [u32; SIZE],
-) {
+) -> Result<(), ZopfliError> {
 	// Convert (used) frequencies to leaves. There will never be more than
 	// ZOPFLI_NUM_LL of them, but often there will be less, so we'll leverage
 	// MaybeUninit to save unnecessary writes.
@@ -55,12 +56,14 @@ pub(crate) fn zopfli_length_limited_code_lengths<const MAXBITS: usize, const SIZ
 	}
 
 	// Nothing to do!
-	if len_leaves == 0 { return; }
+	if len_leaves == 0 { return Ok(()); }
 
 	// This method is either called with 15 maxbits and 32 or 288 potential
 	// leaves, or 7 maxbits and 19 potential leaves; in either case, the max
 	// leaves are well within range.
-	assert!((1 << MAXBITS) >= len_leaves, "Insufficient maxbits for symbols.");
+	if (1 << MAXBITS) < len_leaves {
+		return Err(ZopfliError::LeafSize(MAXBITS, len_leaves));
+	}
 
 	// Set up the pool!
 	BUMP.with_borrow_mut(|nodes| {
@@ -73,7 +76,7 @@ pub(crate) fn zopfli_length_limited_code_lengths<const MAXBITS: usize, const SIZ
 		// otherwise we can just record their values as one and call it a day.
 		if len_leaves <= 2 {
 			for leaf in final_leaves { *leaf.bitlength = 1; }
-			return;
+			return Ok(());
 		}
 
 		// Sort the leaves.
@@ -118,7 +121,8 @@ pub(crate) fn zopfli_length_limited_code_lengths<const MAXBITS: usize, const SIZ
 
 		// Please be kind, rewind!
 		nodes.reset();
-	});
+		Ok(())
+	})
 }
 
 
diff --git a/src/image/zopflipng/lz77.rs b/src/image/zopflipng/lz77.rs
index 54419bc..c038607 100644
--- a/src/image/zopflipng/lz77.rs
+++ b/src/image/zopflipng/lz77.rs
@@ -10,6 +10,7 @@ use super::{
 	LENGTH_SYMBOLS_BITS_VALUES,
 	LitLen,
 	Lsym,
+	ZopfliError,
 	ZOPFLI_NUM_D,
 	ZOPFLI_NUM_LL,
 };
@@ -50,8 +51,8 @@ impl LZ77Store {
 	}
 
 	/// # Push Values.
-	pub(crate) fn push(&mut self, litlen: u16, dist: u16, pos: usize) {
-		self.push_entry(LZ77StoreEntry::new(litlen, dist, pos));
+	pub(crate) fn push(&mut self, litlen: u16, dist: u16, pos: usize) -> Result<(), ZopfliError> {
+		LZ77StoreEntry::new(litlen, dist, pos).map(|e| self.push_entry(e))
 	}
 
 	#[allow(unsafe_code)]
@@ -119,7 +120,7 @@ impl LZ77Store {
 
 	/// # Histogram.
 	pub(crate) fn histogram(&self, lstart: usize, lend: usize)
-	-> ([usize; ZOPFLI_NUM_LL], [usize; ZOPFLI_NUM_D]) {
+	-> Result<([usize; ZOPFLI_NUM_LL], [usize; ZOPFLI_NUM_D]), ZopfliError> {
 		// Count the symbols directly.
 		if lstart + ZOPFLI_NUM_LL * 3 > lend {
 			let mut ll_counts = [0_usize; ZOPFLI_NUM_LL];
@@ -132,23 +133,23 @@ impl LZ77Store {
 				}
 			}
 
-			(ll_counts, d_counts)
+			Ok((ll_counts, d_counts))
 		}
 		// Subtract the cumulative histograms at the start from the end to get the
 		// one for this range.
 		else {
-			let (mut ll_counts, mut d_counts) = self._histogram(lend - 1);
+			let (mut ll_counts, mut d_counts) = self._histogram(lend - 1)?;
 			if 0 < lstart {
 				self._histogram_sub(lstart - 1, &mut ll_counts, &mut d_counts);
 			}
 
-			(ll_counts, d_counts)
+			Ok((ll_counts, d_counts))
 		}
 	}
 
 	/// # Histogram at Position.
 	fn _histogram(&self, pos: usize)
-	-> ([usize; ZOPFLI_NUM_LL], [usize; ZOPFLI_NUM_D]) {
+	-> Result<([usize; ZOPFLI_NUM_LL], [usize; ZOPFLI_NUM_D]), ZopfliError> {
 		// The relative chunked positions.
 		let ll_start = ZOPFLI_NUM_LL * pos.wrapping_div(ZOPFLI_NUM_LL);
 		let d_start = ZOPFLI_NUM_D * pos.wrapping_div(ZOPFLI_NUM_D);
@@ -156,10 +157,12 @@ impl LZ77Store {
 		let d_end = d_start + ZOPFLI_NUM_D;
 
 		// Start by copying the counts directly from the nearest chunk.
-		let mut ll_counts = [0_usize; ZOPFLI_NUM_LL];
-		ll_counts.copy_from_slice(&self.ll_counts[ll_start..ll_end]);
-		let mut d_counts = [0_usize; ZOPFLI_NUM_D];
-		d_counts.copy_from_slice(&self.d_counts[d_start..d_end]);
+		let mut ll_counts: [usize; ZOPFLI_NUM_LL] = self.ll_counts.get(ll_start..ll_end)
+			.and_then(|c| c.try_into().ok())
+			.ok_or(ZopfliError::HistogramRange)?;
+		let mut d_counts: [usize; ZOPFLI_NUM_D] = self.d_counts.get(d_start..d_end)
+			.and_then(|c| c.try_into().ok())
+			.ok_or(ZopfliError::HistogramRange)?;
 
 		// Subtract the symbol occurences between (pos+1) and the end of the
 		// chunks.
@@ -173,7 +176,7 @@ impl LZ77Store {
 		}
 
 		// We have our answer!
-		(ll_counts, d_counts)
+		Ok((ll_counts, d_counts))
 	}
 
 	/// # Subtract Histogram.
@@ -232,8 +235,8 @@ impl LZ77StoreEntry {
 		clippy::cast_sign_loss,
 	)]
 	/// # New.
-	const fn new(litlen: u16, dist: u16, pos: usize) -> Self {
-		assert!(litlen < 259);
+	const fn new(litlen: u16, dist: u16, pos: usize) -> Result<Self, ZopfliError> {
+		if litlen >= 259 { return Err(ZopfliError::LitLen(litlen)); }
 		debug_assert!(dist < 32_768);
 
 		// Using the signed type helps the compiler understand the upper
@@ -249,13 +252,13 @@ impl LZ77StoreEntry {
 				DISTANCE_SYMBOLS[dist as usize],
 			)};
 
-		Self {
+		Ok(Self {
 			pos,
 			litlen: unsafe { std::mem::transmute(litlen) },
 			dist,
 			ll_symbol,
 			d_symbol,
-		}
+		})
 	}
 
 	/// # Length.
diff --git a/src/image/zopflipng/mod.rs b/src/image/zopflipng/mod.rs
index 4e5e78d..38690c7 100644
--- a/src/image/zopflipng/mod.rs
+++ b/src/image/zopflipng/mod.rs
@@ -16,6 +16,7 @@ performant.
 
 mod blocks;
 mod cache;
+mod error;
 mod hash;
 mod kat;
 mod lz77;
@@ -30,6 +31,7 @@ use cache::{
 	CACHE,
 	SqueezeCache,
 };
+use error::ZopfliError;
 use hash::HASH;
 use lz77::LZ77Store;
 use kat::zopfli_length_limited_code_lengths;

From 5d61f82714f685e5242f51dc94fc138450be45c3 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 13 Apr 2024 19:46:06 -0700
Subject: [PATCH 02/31] new: -j option (multithreading level)

---
 Cargo.toml   |  6 ++++++
 README.md    |  6 +++++-
 src/error.rs |  3 +++
 src/main.rs  | 53 ++++++++++++++++++++++++++++++++++++++--------------
 4 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 23362d8..f391913 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -61,6 +61,12 @@ short = "-V"
 long = "--version"
 description = "Print version information and exit."
 
+[[package.metadata.bashman.options]]
+short = "-j"
+label = "<NUM>"
+description = "Limit parallelization to this many threads (instead of giving each logical core its own image to work on). If negative, the value will be subtracted from the total number of logical cores."
+path = false
+
 [[package.metadata.bashman.options]]
 short = "-l"
 long = "--list"
diff --git a/README.md b/README.md
index bad7234..ddc04b7 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,7 @@ The following flags and options are available:
 | Short | Long | Value | Description |
 | ----- | ---- | ----- | ----------- |
 | `-h` | `--help` | | Print help information and exit. |
+| `-j` | | `<NUM>` | Limit parallelization to this many threads (instead of using all logical cores). |
 | `-l` | `--list` | `<FILE>` | Read (absolute) image and/or directory paths from this text file — or STDIN if "-" — one entry per line, instead of or in addition to the trailing `<PATH(S)>`. |
 | | `--no-jpeg` | | Skip JPEG images. |
 | | `--no-png` | | Skip PNG Images. |
@@ -80,11 +81,14 @@ flaca /path/to/image.jpg
 flaca -p /path/to/assets
 
 # Tackle a whole folder, but only look for JPEG images.
-flaca -p --no-png /path/to/assets
+flaca --no-png /path/to/assets
 
 # Or load it up with a lot of places separately:
 flaca /path/to/assets /path/to/favicon.png …
 
+# Limit parallel processing to two images at a time.
+flaca -j2 /path/to/assets
+
 # Zopfli compression is slow and scales more or less linearly with the number
 # of iterations set. Flaca uses the same default as zopflipng: 60 for small
 # images, 20 for larger ones. If you're willing to trade longer processing 
diff --git a/src/error.rs b/src/error.rs
index dcd4b22..be07839 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -20,6 +20,8 @@ pub(super) enum FlacaError {
 	Killed,
 	/// # No Images.
 	NoImages,
+	/// # No Threads.
+	NoThreads,
 	/// # Progress Passthrough.
 	Progress(ProglessError),
 	/// # Invalid Zopfli Iterations.
@@ -58,6 +60,7 @@ impl FlacaError {
 			Self::Argue(e) => e.as_str(),
 			Self::Killed => "The process was aborted early.",
 			Self::NoImages => "No images were found.",
+			Self::NoThreads => "Unable to initialize threadpool.",
 			Self::Progress(e) => e.as_str(),
 			Self::ZopfliIterations => "The number of (zopfli) lz77 iterations must be between 1..=2_147_483_647.",
 		}
diff --git a/src/main.rs b/src/main.rs
index 2d7e526..b1ef970 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -51,6 +51,7 @@ use dactyl::{
 	NiceU64,
 	traits::{
 		BytesToSigned,
+		BytesToUnsigned,
 		NiceInflection,
 	},
 };
@@ -69,18 +70,21 @@ use rayon::iter::{
 	IntoParallelRefIterator,
 	ParallelIterator,
 };
-use std::sync::{
-	Arc,
-	atomic::{
-		AtomicBool,
-		AtomicU64,
-		Ordering::{
-			Acquire,
-			Relaxed,
-			SeqCst,
+use std::{
+	num::NonZeroUsize,
+	sync::{
+		Arc,
+		atomic::{
+			AtomicBool,
+			AtomicU64,
+			Ordering::{
+				Acquire,
+				Relaxed,
+				SeqCst,
+			},
 		},
+		OnceLock,
 	},
-	OnceLock,
 };
 
 
@@ -156,6 +160,23 @@ fn _main() -> Result<(), FlacaError> {
 	// location.
 	let oxi = image::oxipng_options();
 
+	// Set up a threadpool for our workload.
+	let mut threads = std::thread::available_parallelism().unwrap_or(NonZeroUsize::MIN);
+	if let Some(t) = args.option(b"-j") {
+		if let Some(t) = t.strip_prefix(b"-").and_then(NonZeroUsize::btou) {
+			threads = threads.get().checked_sub(t.get())
+				.and_then(NonZeroUsize::new)
+				.unwrap_or(NonZeroUsize::MIN);
+		}
+		else if let Some(t) = NonZeroUsize::btou(t) {
+			if t < threads { threads = t; }
+		}
+	}
+	let pool = rayon::ThreadPoolBuilder::new()
+		.num_threads(threads.get())
+		.build()
+		.map_err(|_| FlacaError::NoThreads)?;
+
 	// Sexy run-through.
 	if args.switch2(b"-p", b"--progress") {
 		// Boot up a progress bar.
@@ -168,7 +189,7 @@ fn _main() -> Result<(), FlacaError> {
 
 		// Process!
 		sigint(Arc::clone(&killed), Some(progress.clone()));
-		paths.par_iter().with_max_len(1).for_each(|x|
+		pool.install(|| paths.par_iter().with_max_len(1).for_each(|x|
 			if ! killed.load(Acquire) {
 				let tmp = x.to_string_lossy();
 				progress.add(&tmp);
@@ -194,7 +215,7 @@ fn _main() -> Result<(), FlacaError> {
 
 				progress.remove(&tmp);
 			}
-		);
+		));
 
 		// Print a summary.
 		let elapsed = progress.finish();
@@ -220,9 +241,9 @@ fn _main() -> Result<(), FlacaError> {
 	// Silent run-through.
 	else {
 		sigint(Arc::clone(&killed), None);
-		paths.par_iter().for_each(|x| if ! killed.load(Acquire) {
+		pool.install(|| paths.par_iter().for_each(|x| if ! killed.load(Acquire) {
 			let _res = image::encode(x, kinds, &oxi);
-		});
+		}));
 	}
 
 	// Early abort?
@@ -263,6 +284,10 @@ FLAGS:
     -V, --version     Print version information and exit.
 
 OPTIONS:
+    -j <NUM>          Limit parallelization to this many threads (instead of
+                      giving each logical core its own image to work on). If
+                      negative, the value will be subtracted from the total
+                      number of logical cores.
     -l, --list <FILE> Read (absolute) image and/or directory paths from this
                       text file — or STDIN if "-" — one entry per line, instead
                       of or in addition to (actually trailing) <PATH(S)>.

From 57cf6da1d3c736b88faf2ca4ae18e92775147c8c Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 13 Apr 2024 20:16:20 -0700
Subject: [PATCH 03/31] cleanup: simplify zopfli errors

---
 src/image/lodepng.rs          |  2 +-
 src/image/zopflipng/blocks.rs | 22 ++++++++-------
 src/image/zopflipng/error.rs  | 50 ++++++++++++++++++++---------------
 src/image/zopflipng/hash.rs   |  4 +--
 src/image/zopflipng/kat.rs    |  2 +-
 src/image/zopflipng/lz77.rs   |  2 +-
 6 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/src/image/lodepng.rs b/src/image/lodepng.rs
index d85e48f..cb764ea 100644
--- a/src/image/lodepng.rs
+++ b/src/image/lodepng.rs
@@ -81,7 +81,7 @@ pub(crate) extern "C" fn flaca_png_deflate(
 
 		if res.is_err() {
 			// Force a panic in debug mode.
-			#[cfg(debug_assertions)] panic!("{res:?}");
+			#[cfg(debug_assertions)] panic!("(zopfli) {res:?}");
 
 			// Otherwise just let lodepng know we failed.
 			return 1;
diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index fd87571..56d011e 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -128,7 +128,7 @@ impl SplitPoints {
 		loop {
 			let (llpos, llcost) = find_minimum_cost(store, lstart + 1, lend)?;
 			if llpos <= lstart || llpos >= lend {
-				return Err(ZopfliError::SplitRange(lstart, lend, llpos));
+				return Err(ZopfliError::SplitRange);
 			}
 
 			// Ignore points we've already covered.
@@ -190,7 +190,7 @@ impl SplitPoints {
 			// Make another store.
 			store2.clear();
 			lz77_optimal(
-				// Safety: split_raw asserts splits are in range.
+				// Safety: the split points are checked at creation.
 				unsafe { arr.get_unchecked(..end) },
 				start,
 				numiterations,
@@ -536,7 +536,7 @@ fn add_lz77_data(
 		// Length only.
 		if e.dist <= 0 {
 			if (e.litlen as u16) >= 256 {
-				return Err(ZopfliError::LitLenLiteral(e.litlen as u16));
+				return Err(ZopfliError::LitLenLiteral);
 			}
 			if ll_lengths[e.litlen as usize] == 0 { return Err(ZopfliError::NoLength); }
 
@@ -801,7 +801,7 @@ fn encode_tree(
 
 	// Find the last non-zero length index.
 	let mut hlit = 29;
-	while hlit > 0 && ll_lengths[257 + hlit - 1] == 0 { hlit -= 1; }
+	while hlit > 0 && ll_lengths[256 + hlit] == 0 { hlit -= 1; }
 	let hlit2 = hlit + 257; // Same as hlit, but in symbol form.
 
 	// And do the same for distance.
@@ -828,6 +828,7 @@ fn encode_tree(
 	while i < lld_total {
 		let mut count = 1;
 		let symbol = length_or_distance(i);
+		if symbol >= 19 { return Err(ZopfliError::TreeSymbol); }
 
 		// Peek ahead; we may be able to do more in one go.
 		if use_16 || (symbol == 0 && (use_17 || use_18)) {
@@ -888,8 +889,8 @@ fn encode_tree(
 	// Update the lengths and symbols given the counts.
 	zopfli_length_limited_code_lengths::<7, 19>(&cl_counts, &mut cl_lengths)?;
 
-	// Find the last non-zero index of the counts table. Note that every ORDER
-	// value is between 0..19, so we can get_unchecked().
+	// Find the last non-zero index of the counts table.
+	// Safety: all ORDER values are between 0..19.
 	let mut hclen = 15;
 	while hclen > 0 && unsafe { *cl_counts.get_unchecked(ORDER[hclen + 3]) } == 0 {
 		hclen -= 1;
@@ -1072,8 +1073,7 @@ fn get_lz77_byte_range(
 ) -> usize {
 	if lstart >= lend { 0 }
 	else {
-		// Safety: split points are asserted to be in range during the carving
-		// stage.
+		// Safety: the split points are checked at creation.
 		debug_assert!(lend <= store.entries.len());
 		let e = unsafe { store.entries.get_unchecked(lend - 1) };
 		e.length() as usize + e.pos - store.entries[lstart].pos
@@ -1305,7 +1305,7 @@ fn try_lz77_expensive_fixed(
 	out: &mut ZopfliOut,
 ) -> Result<bool, ZopfliError> {
 	let mut fixed_store = LZ77Store::new();
-	// Safety: the split points are asserted during their creation.
+	// Safety: the split points are checked at creation.
 	debug_assert!(lstart < store.entries.len());
 	let instart = unsafe { store.entries.get_unchecked(lstart).pos };
 	let inend = instart + get_lz77_byte_range(store, lstart, lend);
@@ -1360,8 +1360,8 @@ fn try_optimize_huffman_for_rle(
 	ll_lengths: &mut [u32; ZOPFLI_NUM_LL],
 	d_lengths: &mut [u32; ZOPFLI_NUM_D],
 ) -> Result<usize, ZopfliError> {
+	// Calculate the tree and data sizes as are.
 	let (_, treesize) = calculate_tree_size(ll_lengths, d_lengths)?;
-
 	let datasize = calculate_block_symbol_size_given_counts(
 		ll_counts,
 		d_counts,
@@ -1383,6 +1383,7 @@ fn try_optimize_huffman_for_rle(
 	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_D>(&d_counts2, &mut d_lengths2)?;
 	patch_distance_codes(&mut d_lengths2);
 
+	// Calculate the optimized tree and data sizes.
 	let (_, treesize2) = calculate_tree_size(&ll_lengths2, &d_lengths2)?;
 	let datasize2 = calculate_block_symbol_size_given_counts(
 		ll_counts,
@@ -1394,6 +1395,7 @@ fn try_optimize_huffman_for_rle(
 		lend,
 	);
 
+	// Return whichever's better.
 	let sum = treesize + datasize;
 	let sum2 = treesize2 + datasize2;
 	if sum <= sum2 { Ok(sum) }
diff --git a/src/image/zopflipng/error.rs b/src/image/zopflipng/error.rs
index 8c7120e..3adce8d 100644
--- a/src/image/zopflipng/error.rs
+++ b/src/image/zopflipng/error.rs
@@ -7,38 +7,46 @@ use std::fmt;
 #[derive(Debug, Clone, Copy)]
 pub(crate) enum ZopfliError {
 	HistogramRange,
-	LeafSize(usize, usize),
-	LitLen(u16),
-	LitLenLiteral(u16),
+	LeafSize,
+	LitLen,
+	LitLenLiteral,
 	LMCDistance,
-	MatchRange(usize, usize, u16),
-	PathLength,
+	MatchRange,
 	NoDistance,
 	NoLength,
-	SplitRange(usize, usize, usize),
+	PathLength,
+	SplitRange,
 	SublenLength,
+	TreeSymbol,
 	Write,
 }
 
 impl fmt::Display for ZopfliError {
+	#[inline]
 	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-		f.write_str("(zopfli) ")?;
+		f.write_str(self.as_str())
+	}
+}
 
+impl std::error::Error for ZopfliError {}
+
+impl ZopfliError {
+	/// # As String Slice.
+	pub(crate) const fn as_str(self) -> &'static str {
 		match self {
-			Self::HistogramRange => f.write_str("invalid histogram range"),
-			Self::LeafSize(bits, leaves) => f.write_fmt(format_args!("{bits} insufficient for {leaves}")),
-			Self::LitLen(n) => f.write_fmt(format_args!("invalid LitLen: {n}")),
-			Self::LitLenLiteral(n) => f.write_fmt(format_args!("invalid LitLen literal: {n}")),
-			Self::LMCDistance => f.write_str("incorrect cache distance found"),
-			Self::MatchRange(start, end, pos) => f.write_fmt(format_args!("invalid match length for {start}..{end}: {pos}")),
-			Self::NoDistance => f.write_str("expected non-zero distance"),
-			Self::NoLength => f.write_str("expected non-zero length"),
-			Self::PathLength => f.write_str("path length/find mismatch"),
-			Self::SplitRange(start, end, pos) => f.write_fmt(format_args!("invalid split position for {start}..{end}: {pos}")),
-			Self::SublenLength => f.write_str("invalid sublen slice length"),
-			Self::Write => f.write_str("failed to write output"),
+			Self::HistogramRange => "invalid histogram range",
+			Self::LeafSize => "insufficient maxbits for leaves",
+			Self::LitLen => "invalid litlen",
+			Self::LitLenLiteral => "invalid litlen literal",
+			Self::LMCDistance => "LMC returned an unexpected distance",
+			Self::MatchRange => "invalid match range",
+			Self::NoDistance => "expected non-zero distance",
+			Self::NoLength => "expected non-zero length",
+			Self::PathLength => "invalid path length",
+			Self::SplitRange => "invalid split range",
+			Self::SublenLength => "incorrectly sized sublength array",
+			Self::TreeSymbol => "invalid tree symbol",
+			Self::Write => "failed to write output",
 		}
 	}
 }
-
-impl std::error::Error for ZopfliError {}
diff --git a/src/image/zopflipng/hash.rs b/src/image/zopflipng/hash.rs
index 46d053a..d6ea9a3 100644
--- a/src/image/zopflipng/hash.rs
+++ b/src/image/zopflipng/hash.rs
@@ -623,7 +623,7 @@ impl ZopfliHash {
 				length,
 			))? {
 				if pos + usize::from(*length) <= arr.len() { return Ok(()); }
-				return Err(ZopfliError::MatchRange(pos, arr.len(), *length));
+				return Err(ZopfliError::MatchRange);
 			}
 		}
 
@@ -659,7 +659,7 @@ impl ZopfliHash {
 		*length = bestlength;
 		if pos + usize::from(*length) <= arr.len() { Ok(()) }
 		else {
-			Err(ZopfliError::MatchRange(pos, arr.len(), *length))
+			Err(ZopfliError::MatchRange)
 		}
 	}
 
diff --git a/src/image/zopflipng/kat.rs b/src/image/zopflipng/kat.rs
index f219a26..a1d460f 100644
--- a/src/image/zopflipng/kat.rs
+++ b/src/image/zopflipng/kat.rs
@@ -62,7 +62,7 @@ pub(crate) fn zopfli_length_limited_code_lengths<const MAXBITS: usize, const SIZ
 	// leaves, or 7 maxbits and 19 potential leaves; in either case, the max
 	// leaves are well within range.
 	if (1 << MAXBITS) < len_leaves {
-		return Err(ZopfliError::LeafSize(MAXBITS, len_leaves));
+		return Err(ZopfliError::LeafSize);
 	}
 
 	// Set up the pool!
diff --git a/src/image/zopflipng/lz77.rs b/src/image/zopflipng/lz77.rs
index c038607..6747200 100644
--- a/src/image/zopflipng/lz77.rs
+++ b/src/image/zopflipng/lz77.rs
@@ -236,7 +236,7 @@ impl LZ77StoreEntry {
 	)]
 	/// # New.
 	const fn new(litlen: u16, dist: u16, pos: usize) -> Result<Self, ZopfliError> {
-		if litlen >= 259 { return Err(ZopfliError::LitLen(litlen)); }
+		if litlen >= 259 { return Err(ZopfliError::LitLen); }
 		debug_assert!(dist < 32_768);
 
 		// Using the signed type helps the compiler understand the upper

From 6f03f775708f4bdc43a1d2fc8f4ee74775762a98 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sun, 14 Apr 2024 11:19:56 -0700
Subject: [PATCH 04/31] test: zopfli_length_limited_code_lengths

---
 src/image/zopflipng/kat.rs | 72 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/src/image/zopflipng/kat.rs b/src/image/zopflipng/kat.rs
index a1d460f..ea2bc45 100644
--- a/src/image/zopflipng/kat.rs
+++ b/src/image/zopflipng/kat.rs
@@ -292,3 +292,75 @@ impl<'a> Pool<'a> {
 		}
 	}
 }
+
+
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+
+	// The original zopfli has no unit tests, but the zopfli-rs Rust port has
+	// a few. The 3/4/7/15 maxbit tests below have been adapted from those.
+	// They work, so that's promising!
+	// <https://github.com/zopfli-rs/zopfli/blob/main/src/katajainen.rs>
+
+	#[test]
+	fn t_kat3() {
+		let f = [1, 1, 5, 7, 10, 14];
+		let mut b = [0; 6];
+		assert!(zopfli_length_limited_code_lengths::<3, 6>(&f, &mut b).is_ok());
+		assert_eq!(b, [3, 3, 3, 3, 2, 2]);
+	}
+
+	#[test]
+	fn t_kat4() {
+		let f = [1, 1, 5, 7, 10, 14];
+		let mut b = [0; 6];
+		assert!(zopfli_length_limited_code_lengths::<4, 6>(&f, &mut b).is_ok());
+		assert_eq!(b, [4, 4, 3, 2, 2, 2]);
+	}
+
+	#[test]
+	fn t_kat7() {
+		let f = [252, 0, 1, 6, 9, 10, 6, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+		let mut b = [0; 19];
+		assert!(zopfli_length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
+		assert_eq!(b, [1, 0, 6, 4, 3, 3, 3, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+	}
+
+	#[test]
+	fn t_kat15() {
+		let f = [
+			0, 0, 0, 0, 0, 0, 18, 0, 6, 0, 12, 2, 14, 9, 27, 15,
+			23, 15, 17, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+		];
+		let mut b = [0; 32];
+		assert!(zopfli_length_limited_code_lengths::<15, 32>(&f, &mut b).is_ok());
+		assert_eq!(
+			b,
+			[
+				0, 0, 0, 0, 0, 0, 3, 0, 5, 0, 4, 6, 4, 4, 3, 4,
+				3, 3, 3, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			]
+		);
+	}
+
+	#[test]
+	fn t_kat_limited() {
+		// No frequencies.
+		let mut f = [0; 19];
+		let mut b = [5; 19]; // Also test the bits get reset correctly.
+		assert!(zopfli_length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
+		assert_eq!(b, [0; 19]);
+
+		// One frequency.
+		f[2] = 10;
+		assert!(zopfli_length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
+		assert_eq!(b, [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+
+		// Two frequencies.
+		f[0] = 248;
+		assert!(zopfli_length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
+		assert_eq!(b, [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+	}
+}

From 16a03c629a650cfd7ebcbef0757faf452d28ae14 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Tue, 16 Apr 2024 17:45:21 -0700
Subject: [PATCH 05/31] misc: keep a record of png checksums for reference

---
 skel/png.b3 | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 skel/png.b3

diff --git a/skel/png.b3 b/skel/png.b3
new file mode 100644
index 0000000..d60d8dc
--- /dev/null
+++ b/skel/png.b3
@@ -0,0 +1,11 @@
+b53eca68578abeab039f7492aa08cc08aca2ee8373f5f35e4d51cfec69ab5c89  /tmp/bench-data/png/01.png
+385eb82425d1d39d84155ad812d7a2e9c7f4f989b9be8901f227d9b0b3716e60  /tmp/bench-data/png/02.png
+a71b84ad1980fb75b0357c9a2440a53af16363f66aaed484559ea95075f85d48  /tmp/bench-data/png/03.png
+71f3a81b8f0800987434a87a256f88fc0fa992a0082e9dc94cd47a7bf69cb7f6  /tmp/bench-data/png/04.png
+f305df54a410b781b5798c96c2423a90fd80744fa6a1b6e42938b866ae465bcc  /tmp/bench-data/png/05.png
+3cbbc3be80e8de2d9a3c1d97b11ca110d2922c0dcfcb7de1ff6952fa0addbe0b  /tmp/bench-data/png/06.png
+97fff17e466418ba255a5a1ecbe463c72d03bfca7c7c597f0ee8f831588a51e5  /tmp/bench-data/png/poe.png
+4da3d1e65804d00be0d376234c3c1ee25469105bf32966e05adb65e870d6eb5e  /tmp/bench-data/png/small.png
+e0eb307123913aa8b3a864790ab904d340a7e4e85192d27cb1d16e15c14d3319  /tmp/bench-data/png/small-bw.png
+3ca8474dff6526979ca43cba37a71b9ffdaa9646bd89811c3abdce0b56b85ca2  /tmp/bench-data/png/small-bwa.png
+6d73e93975587d16ce907ace61f7ac818aaa2c0f2884ecb489112c8d6f5c5aac  /tmp/bench-data/wolf.jpg

From 2fcfa0444efed531e76802b013de65bb6fa2680c Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Tue, 16 Apr 2024 17:45:42 -0700
Subject: [PATCH 06/31] cleanup: elide const parameters

---
 src/image/zopflipng/stats.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/image/zopflipng/stats.rs b/src/image/zopflipng/stats.rs
index 47f65b2..27fea54 100644
--- a/src/image/zopflipng/stats.rs
+++ b/src/image/zopflipng/stats.rs
@@ -127,8 +127,8 @@ impl SymbolStats {
 			}
 		}
 
-		calculate_entropy::<ZOPFLI_NUM_LL>(&self.litlens, &mut self.ll_symbols);
-		calculate_entropy::<ZOPFLI_NUM_D>(&self.dists, &mut self.d_symbols);
+		calculate_entropy(&self.litlens, &mut self.ll_symbols);
+		calculate_entropy(&self.dists, &mut self.d_symbols);
 	}
 
 	#[allow(clippy::similar_names)]
@@ -165,8 +165,8 @@ impl SymbolStats {
 				}
 			}
 		}
-		randomize_freqs::<ZOPFLI_NUM_LL>(&mut self.litlens, state);
-		randomize_freqs::<ZOPFLI_NUM_D>(&mut self.dists, state);
+		randomize_freqs(&mut self.litlens, state);
+		randomize_freqs(&mut self.dists, state);
 
 		// Set the end symbol.
 		self.litlens[256] = 1;

From e888438e4a294d2726765cff274a37e5699c7c0e Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Tue, 16 Apr 2024 20:20:38 -0700
Subject: [PATCH 07/31] cleanup: remove useless legacy type casting cleanup:
 remove unused lint overrides docs: improve commenting perf: inline fixed/stat
 cost methods to moot unsafe, reduce conditional ops

---
 src/image/lodepng.rs          |   2 -
 src/image/zopflipng/blocks.rs |  43 ++++---
 src/image/zopflipng/cache.rs  |   5 +-
 src/image/zopflipng/hash.rs   | 210 ++++++++++++++++++----------------
 src/image/zopflipng/kat.rs    |   2 +-
 src/image/zopflipng/lz77.rs   |   8 +-
 src/image/zopflipng/mod.rs    |   2 +-
 src/image/zopflipng/stats.rs  |  65 ++++++-----
 8 files changed, 173 insertions(+), 164 deletions(-)

diff --git a/src/image/lodepng.rs b/src/image/lodepng.rs
index cb764ea..0b41bdd 100644
--- a/src/image/lodepng.rs
+++ b/src/image/lodepng.rs
@@ -27,8 +27,6 @@ use super::{
 // Generated by build.rs.
 include!(concat!(env!("OUT_DIR"), "/lodepng-bindgen.rs"));
 
-
-
 const ZOPFLI_MASTER_BLOCK_SIZE: usize = 1_000_000;
 
 
diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index 56d011e..6270b7b 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -110,7 +110,6 @@ impl SplitPoints {
 		else { Ok(len) }
 	}
 
-	#[allow(clippy::cast_precision_loss)]
 	/// # LZ77 Split Pass.
 	///
 	/// This sets the LZ77 split points according to convoluted cost
@@ -132,7 +131,7 @@ impl SplitPoints {
 			}
 
 			// Ignore points we've already covered.
-			if llpos == lstart + 1 || (calculate_block_size_auto_type(store, lstart, lend)? as f64) < llcost {
+			if llpos == lstart + 1 || calculate_block_size_auto_type(store, lstart, lend)? < llcost {
 				self.done.insert(lstart);
 			}
 			else {
@@ -334,7 +333,9 @@ impl<'a> Iterator for GoodForRle<'a> {
 		let scratch = self.counts[0];
 		let mut stride = 0;
 		while let [count, rest @ ..] = self.counts {
-			// Note the reptition and circle back around.
+			// Note the reptition and circle back around. This will always
+			// trigger on the first pass, so stride will always be at least
+			// one.
 			if *count == scratch {
 				stride += 1;
 				self.counts = rest;
@@ -505,7 +506,6 @@ fn add_dynamic_tree(
 	d_lengths: &[u32; ZOPFLI_NUM_D],
 	out: &mut ZopfliOut
 ) -> Result<(), ZopfliError> {
-	// Find the index that produces the best size.
 	let (i, _) = calculate_tree_size(ll_lengths, d_lengths)?;
 	encode_tree(ll_lengths, d_lengths, i, Some(out))?;
 	Ok(())
@@ -513,7 +513,6 @@ fn add_dynamic_tree(
 
 #[allow(
 	clippy::cast_sign_loss,
-	clippy::similar_names,
 	clippy::too_many_arguments,
 )]
 /// # Add LZ77 Data.
@@ -741,7 +740,6 @@ fn calculate_block_symbol_size_small(
 	result
 }
 
-#[allow(unsafe_code)]
 /// # Calculate the Exact Tree Size (in Bits).
 ///
 /// This returns the index that produced the smallest size, and its size.
@@ -784,7 +782,7 @@ fn encode_tree(
 	extra: u8,
 	out: Option<&mut ZopfliOut>,
 ) -> Result<usize, ZopfliError> {
-	// Discombobulated cl_length/cl_count indexes.
+	// Discombobulated cl_length/cl_count indexes, because DEFLATE is hateful.
 	const ORDER: [usize; 19] = [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15];
 
 	// To use or not to use the extended part of the alphabet.
@@ -808,15 +806,15 @@ fn encode_tree(
 	let mut hdist = 29;
 	while hdist > 0 && d_lengths[hdist] == 0 { hdist -= 1; }
 
-	// We process length and distance symbols in the same loop, in that order.
-	// With that in mind, this returns the symbol from the appropriate table.
+	// We process length and (then) distance symbols in the same loop. This
+	// returns the symbol for one or the other depending on how far we've
+	// gotten.
 	let length_or_distance = |idx: usize| {
 		// The compiler is smart enough to know hlit2 is valid for ll_lengths.
 		if idx < hlit2 { ll_lengths[idx] }
 		else {
-			// Safety: the index will always be between 0..L+D, where L
-			// is within range for ll_lengths and D is in range for
-			// d_lengths. If we're past the L point, what's left is a valid D.
+			// Safety: the index will always be between 0..L+D; if we're
+			// past the Ls, we're onto the Ds.
 			unsafe { *d_lengths.get_unchecked(idx - hlit2) }
 		}
 	};
@@ -979,12 +977,12 @@ fn find_minimum_cost(
 	store: &LZ77Store,
 	mut start: usize,
 	mut end: usize,
-) -> Result<(usize, f64), ZopfliError> {
+) -> Result<(usize, usize), ZopfliError> {
 	// Keep track of the original start/end points.
 	let split_start = start - 1;
 	let split_end = end;
 
-	let mut best_cost = f64::INFINITY;
+	let mut best_cost = usize::MAX;
 	let mut best_idx = start;
 
 	// Small chunks don't need much.
@@ -1001,7 +999,7 @@ fn find_minimum_cost(
 
 	// Divide and conquer.
 	let mut p = [0_usize; MINIMUM_SPLIT_DISTANCE - 1];
-	let mut last_best_cost = f64::INFINITY;
+	let mut last_best_cost = usize::MAX;
 	while MINIMUM_SPLIT_DISTANCE <= end - start {
 		let mut best_p_idx = 0;
 		for (i, pp) in p.iter_mut().enumerate() {
@@ -1185,19 +1183,18 @@ fn lz77_optimal(
 	})
 }
 
-#[allow(clippy::cast_precision_loss)]
 /// # Split Block Cost.
 ///
 /// Return the sum of the estimated costs of the left and right sections of the
 /// data.
-fn split_cost(store: &LZ77Store, start: usize, mid: usize, end: usize) -> Result<f64, ZopfliError> {
-	Ok((
+fn split_cost(store: &LZ77Store, start: usize, mid: usize, end: usize) -> Result<usize, ZopfliError> {
+	Ok(
 		calculate_block_size_auto_type(store, start, mid)? +
 		calculate_block_size_auto_type(store, mid, end)?
-	) as f64)
+	)
 }
 
-#[allow(unsafe_code, clippy::integer_division, clippy::cast_sign_loss)]
+#[allow(unsafe_code, clippy::integer_division)]
 /// # Optimize Huffman RLE Compression.
 ///
 /// Change the population counts to improve Huffman tree compression,
@@ -1238,6 +1235,8 @@ fn optimize_huffman_for_rle(mut counts: &mut [usize]) {
 			// current), take a sort of weighted average of them.
 			if i < four {
 				scratch = (
+					// Safety: the compiler doesn't understand (i < four) means
+					// we have at least i+3 entries left.
 					unsafe { *counts.get_unchecked(i + 3) } +
 					counts[i + 2] +
 					counts[i + 1] +
@@ -1338,8 +1337,8 @@ fn try_lz77_expensive_fixed(
 		add_lz77_block(
 			BlockType::Fixed, last_block, &fixed_store, arr, 0, fixed_store.len(),
 			expected_data_size, out,
-		)?;
-		Ok(true)
+		)
+			.map(|()| true)
 	}
 	else { Ok(false) }
 }
diff --git a/src/image/zopflipng/cache.rs b/src/image/zopflipng/cache.rs
index fb6cc10..3c51138 100644
--- a/src/image/zopflipng/cache.rs
+++ b/src/image/zopflipng/cache.rs
@@ -7,9 +7,9 @@ the thread-local LMC static.
 
 use std::cell::RefCell;
 use super::{
-	ZopfliError,
-	ZOPFLI_MIN_MATCH,
 	ZOPFLI_MAX_MATCH,
+	ZOPFLI_MIN_MATCH,
+	ZopfliError,
 };
 
 
@@ -299,7 +299,6 @@ impl SqueezeCache {
 		}
 	}
 
-	#[allow(unsafe_code)]
 	/// # Reset Costs.
 	///
 	/// This nudges all costs to infinity except the first, which is set to
diff --git a/src/image/zopflipng/hash.rs b/src/image/zopflipng/hash.rs
index d6ea9a3..d881962 100644
--- a/src/image/zopflipng/hash.rs
+++ b/src/image/zopflipng/hash.rs
@@ -30,9 +30,9 @@ use super::{
 	SqueezeCache,
 	stats::SymbolStats,
 	SUBLEN_LEN,
-	ZopfliError,
 	ZOPFLI_MAX_MATCH,
 	ZOPFLI_MIN_MATCH,
+	ZopfliError,
 };
 
 const ZOPFLI_WINDOW_SIZE: usize = 32_768;
@@ -218,19 +218,14 @@ impl ZopfliHash {
 		store: &mut LZ77Store,
 	) -> Result<(), ZopfliError> {
 		let costs = squeeze.reset_costs();
-		let tmp = self.get_best_lengths(arr, instart, stats, costs)?;
-		debug_assert!(tmp < 1E30);
+		self.get_best_lengths(arr, instart, stats, costs)?;
 		if let Some(paths) = squeeze.trace_paths() {
 			self.follow_paths(arr, instart, paths, store)?;
 		}
 		Ok(())
 	}
 
-	#[allow(
-		unsafe_code,
-		clippy::cast_possible_truncation,
-		clippy::cast_possible_wrap,
-	)]
+	#[allow(clippy::cast_possible_truncation)]
 	#[inline]
 	/// # Get Best Lengths.
 	///
@@ -249,7 +244,7 @@ impl ZopfliHash {
 		instart: usize,
 		stats: Option<&SymbolStats>,
 		costs: &mut [(f32, u16)],
-	) -> Result<f64, ZopfliError> {
+	) -> Result<(), ZopfliError> {
 		// Reset and warm the hash.
 		self.reset(arr, instart);
 
@@ -291,46 +286,85 @@ impl ZopfliHash {
 				Some(instart),
 			)?;
 
-			// Literal. (Note: this condition should never not be true, but it
-			// doesn't hurt too much to be extra sure!)
-			let cost_j = f64::from(unsafe { costs.get_unchecked(j).0 });
-			if i < arr.len() && j + 1 < costs.len() {
-				let new_cost = stats.map_or(
-					if arr[i] <= 143 { 8.0 } else { 9.0 },
-					|s| s.ll_symbols[usize::from(arr[i])],
-				) + cost_j;
-				debug_assert!(0.0 <= new_cost);
-
-				// Update it if lower.
-				if new_cost < f64::from(costs[j + 1].0) {
-					costs[j + 1].0 = new_cost as f32;
-					costs[j + 1].1 = 1;
-				}
+			// This should never trigger; it is mainly a reminder to the
+			// compiler that our i/j indices are still applicable.
+			if i >= arr.len() || j + 1 >= costs.len() { break; }
+
+			let cost_j = f64::from(costs[j].0);
+			let new_cost = stats.map_or_else(
+				|| if arr[i] <= 143 { 8.0 } else { 9.0 },
+				|s| s.ll_symbols[usize::from(arr[i])],
+			) + cost_j;
+			debug_assert!(0.0 <= new_cost);
+
+			// Update it if lower.
+			if new_cost < f64::from(costs[j + 1].0) {
+				costs[j + 1].0 = new_cost as f32;
+				costs[j + 1].1 = 1;
 			}
 
-			// Lengths and Sublengths.
-			let limit = usize::from(length).min(arr.len() - i);
+			// If a long match was found, peek forward to recalculate those
+			// costs, at least the ones who could benefit from the expense of
+			// all that effort.
+			let limit = usize::from(length).min(costs.len().saturating_sub(j + 1));
 			if (ZOPFLI_MIN_MATCH..=ZOPFLI_MAX_MATCH).contains(&limit) {
 				let min_cost_add = min_cost + cost_j;
 				let mut k = ZOPFLI_MIN_MATCH;
-				for (v, c) in sublen[k..=limit].iter().copied().zip(costs.iter_mut().skip(j + k)) {
-					// The expensive cost calculations are only worth
-					// performing if the stored cost is larger than the
-					// minimum cost we found earlier.
-					if min_cost_add < f64::from(c.0) {
-						let new_cost = stats.map_or_else(
-							|| get_fixed_cost(k as u16, v),
-							|s| get_stat_cost(k, v, s)
-						) + cost_j;
-						debug_assert!(0.0 <= new_cost);
-
-						// Update it if lower.
-						if new_cost < f64::from(c.0) {
-							c.0 = new_cost as f32;
-							c.1 = k as u16;
+				let iter = sublen[k..=limit].iter()
+					.copied()
+					.zip(costs.iter_mut().skip(j + k));
+
+				// Stat-based cost calculations.
+				if let Some(s) = stats {
+					for (dist, c) in iter {
+						if min_cost_add < f64::from(c.0) {
+							let mut new_cost = cost_j;
+							if dist == 0 {
+								new_cost += s.ll_symbols[k];
+							}
+							else {
+								let dsym = DISTANCE_SYMBOLS[(dist & 32_767) as usize];
+								new_cost += f64::from(DISTANCE_BITS[dsym as usize]);
+								new_cost += s.d_symbols[dsym as usize];
+								new_cost += s.ll_symbols[LENGTH_SYMBOLS_BITS_VALUES[k].0 as usize];
+								new_cost += f64::from(LENGTH_SYMBOLS_BITS_VALUES[k].1);
+							}
+
+							// Update it if lower.
+							if (0.0..f64::from(c.0)).contains(&new_cost) {
+								c.0 = new_cost as f32;
+								c.1 = k as u16;
+							}
 						}
+						k += 1;
+					}
+				}
+				// Fixed cost calculations.
+				else {
+					for (dist, c) in iter {
+						if min_cost_add < f64::from(c.0) {
+							let mut new_cost = cost_j;
+							if dist == 0 {
+								if k <= 143 { new_cost += 8.0; }
+								else { new_cost += 9.0; }
+							}
+							else {
+								if 114 < k { new_cost += 13.0; }
+								else { new_cost += 12.0; }
+
+								let dsym = DISTANCE_SYMBOLS[(dist & 32_767) as usize];
+								new_cost += f64::from(DISTANCE_BITS[dsym as usize]);
+								new_cost += f64::from(LENGTH_SYMBOLS_BITS_VALUES[k].1);
+							}
+
+							// Update it if lower.
+							if (0.0..f64::from(c.0)).contains(&new_cost) {
+								c.0 = new_cost as f32;
+								c.1 = k as u16;
+							}
+						}
+						k += 1;
 					}
-					k += 1;
 				}
 			}
 
@@ -338,9 +372,9 @@ impl ZopfliHash {
 			i += 1;
 		}
 
-		// Return the final cost!
-		debug_assert!(0.0 <= costs[costs.len() - 1].0);
-		Ok(f64::from(costs[costs.len() - 1].0))
+		// The final cost should in a reasonable range.
+		debug_assert!((0.0..1E30).contains(&costs[costs.len() - 1].0));
+		Ok(())
 	}
 
 	#[allow(clippy::cast_possible_truncation)]
@@ -892,37 +926,6 @@ impl ZopfliHashChain {
 
 
 
-#[allow(
-	unsafe_code,
-	clippy::cast_possible_truncation,
-	clippy::similar_names,
-)]
-/// # Fixed Cost Model.
-///
-/// This models the cost using a fixed tree.
-fn get_fixed_cost(len: u16, dist: u16) -> f64 {
-	if dist == 0 {
-		if len <= 143 { 8.0 }
-		else { 9.0 }
-	}
-	else {
-		let (lsym, lbits, _) = unsafe {
-			// Safety: this is only ever called with lengths between
-			// ZOPFLI_MIN_MATCH..=ZOPFLI_MAX_MATCH, so values are always in
-			// range.
-			*LENGTH_SYMBOLS_BITS_VALUES.get_unchecked(usize::from(len))
-		};
-		let dbits =
-			if dist < 5 { 0 }
-			else { (dist - 1).ilog2() as u16 - 1 };
-		let base =
-			if 279 < lsym as u16 { 13 }
-			else { 12 };
-
-		f64::from(base + dbits + u16::from(lbits))
-	}
-}
-
 /// # Distance-Based Length Score.
 ///
 /// This is a simplistic cost model for the "greedy" LZ77 pass that helps it
@@ -957,30 +960,39 @@ fn get_minimum_cost(stats: &SymbolStats) -> f64 {
 	length_cost + dist_cost
 }
 
-#[allow(
-	unsafe_code,
-	clippy::cast_possible_truncation,
-	clippy::similar_names,
-)]
-#[inline]
-/// # Statistical Cost Model.
-///
-/// This models the cost using the gathered symbol statistics.
-fn get_stat_cost(len: usize, dist: u16, stats: &SymbolStats) -> f64 {
-	// Safety: this is only ever called with lengths between
-	// ZOPFLI_MIN_MATCH..=ZOPFLI_MAX_MATCH so values are always in range.
-	if len > ZOPFLI_MAX_MATCH {
-		unsafe { core::hint::unreachable_unchecked(); }
-	}
 
-	if dist == 0 { stats.ll_symbols[len] }
-	else {
-		let (lsym, lbits, _) = LENGTH_SYMBOLS_BITS_VALUES[len];
-		let dsym = DISTANCE_SYMBOLS[usize::from(dist & 32_767)];
-		let dbits = DISTANCE_BITS[dsym as usize];
 
-		f64::from(lbits + dbits) +
-		stats.ll_symbols[lsym as usize] +
-		stats.d_symbols[dsym as usize]
+#[cfg(test)]
+mod tests {
+	use super::*;
+
+	#[test]
+	fn t_fixed_cost() {
+		// Get the largest dbit and lbit values.
+		let d_max: u8 = DISTANCE_BITS.iter().copied().max().unwrap();
+		let l_max: u8 = LENGTH_SYMBOLS_BITS_VALUES.iter()
+			.map(|(_, a, _)| *a)
+			.max()
+			.unwrap();
+
+		// Make sure their sum (along with the largest base) fits within
+		// the u8 space, since that's what we're using at runtime.
+		let max = u16::from(d_max) + u16::from(l_max) + 13;
+		assert!(
+			max <= 255,
+			"maximum get_fixed_cost() is too big for u8: {max}"
+		);
+
+		// The original base is calculated by checking (279 < symbol), but we
+		// instead test (114 < litlen) because the symbol isn't otherwise
+		// needed. This sanity check makes sure the two conditions are indeed
+		// interchangeable.
+		for (i, (sym, _, _)) in LENGTH_SYMBOLS_BITS_VALUES.iter().copied().enumerate() {
+			assert_eq!(
+				279 < (sym as u16),
+				114 < i,
+				"get_fixed_cost() base logic is wrong: len {i} has symbol {}", sym as u16
+			);
+		}
 	}
 }
diff --git a/src/image/zopflipng/kat.rs b/src/image/zopflipng/kat.rs
index ea2bc45..787b0f0 100644
--- a/src/image/zopflipng/kat.rs
+++ b/src/image/zopflipng/kat.rs
@@ -29,7 +29,7 @@ thread_local!(
 
 
 
-#[allow(unsafe_code, clippy::cast_sign_loss)]
+#[allow(unsafe_code)]
 /// # Length Limited Code Lengths.
 ///
 /// This writes minimum-redundancy length-limited code bitlengths for symbols
diff --git a/src/image/zopflipng/lz77.rs b/src/image/zopflipng/lz77.rs
index 6747200..04de9f3 100644
--- a/src/image/zopflipng/lz77.rs
+++ b/src/image/zopflipng/lz77.rs
@@ -10,9 +10,9 @@ use super::{
 	LENGTH_SYMBOLS_BITS_VALUES,
 	LitLen,
 	Lsym,
-	ZopfliError,
 	ZOPFLI_NUM_D,
 	ZOPFLI_NUM_LL,
+	ZopfliError,
 };
 
 
@@ -80,7 +80,8 @@ impl LZ77Store {
 
 		// If the distance is zero, we just need to bump the litlen count.
 		if entry.dist <= 0 {
-			// Safety: the counts were just resized a few lines back.
+			// Safety: the counts were pre-allocated a few lines back (if
+			// needed).
 			unsafe {
 				*self.ll_counts.get_unchecked_mut(ll_start + entry.litlen as usize) += 1;
 			}
@@ -88,7 +89,8 @@ impl LZ77Store {
 		// If it is non-zero, we need to set the correct symbols and bump both
 		// counts.
 		else {
-			// Safety: the counts were just resized a few lines back.
+			// Safety: the counts were pre-allocated a few lines back (if
+			// needed).
 			unsafe {
 				*self.ll_counts.get_unchecked_mut(ll_start + entry.ll_symbol as usize) += 1;
 				*self.d_counts.get_unchecked_mut(d_start + entry.d_symbol as usize) += 1;
diff --git a/src/image/zopflipng/mod.rs b/src/image/zopflipng/mod.rs
index 38690c7..76d5c89 100644
--- a/src/image/zopflipng/mod.rs
+++ b/src/image/zopflipng/mod.rs
@@ -41,8 +41,8 @@ use super::{
 		DecodedImage,
 		LodePNGColorType,
 		LodePNGFilterStrategy,
-		ZopfliOut,
 		LodePNGState,
+		ZopfliOut,
 	},
 };
 use symbols::{
diff --git a/src/image/zopflipng/stats.rs b/src/image/zopflipng/stats.rs
index 27fea54..5c97dfd 100644
--- a/src/image/zopflipng/stats.rs
+++ b/src/image/zopflipng/stats.rs
@@ -32,11 +32,12 @@ impl RanState {
 
 	/// # Generate Random Number.
 	///
-	/// This uses the 32-bit "Multiply-With-Carry" generator (G. Marsaglia).
+	/// A simple, repeatable [MWC PRNG](https://en.wikipedia.org/wiki/Multiply-with-carry_pseudorandom_number_generator),
+	/// used to shuffle frequencies between runs.
 	fn randomize(&mut self) -> u32 {
 		self.m_z = 36_969 * (self.m_z & 65_535) + (self.m_z >> 16);
 		self.m_w = 18_000 * (self.m_w & 65_535) + (self.m_w >> 16);
-		(self.m_z << 16).wrapping_add(self.m_w) // 32-bit result.
+		(self.m_z << 16).wrapping_add(self.m_w)
 	}
 }
 
@@ -48,8 +49,8 @@ impl RanState {
 /// This holds the length and distance symbols and costs for a given block,
 /// data that can be used to improve compression on subsequent passes.
 pub(crate) struct SymbolStats {
-	litlens: [usize; ZOPFLI_NUM_LL],
-	dists:   [usize; ZOPFLI_NUM_D],
+	ll_counts: [usize; ZOPFLI_NUM_LL],
+	d_counts:  [usize; ZOPFLI_NUM_D],
 
 	pub(crate) ll_symbols: [f64; ZOPFLI_NUM_LL],
 	pub(crate) d_symbols:  [f64; ZOPFLI_NUM_D],
@@ -59,8 +60,8 @@ impl SymbolStats {
 	/// # New Instance.
 	pub(crate) const fn new() -> Self {
 		Self {
-			litlens:    [0; ZOPFLI_NUM_LL],
-			dists:      [0; ZOPFLI_NUM_D],
+			ll_counts:  [0; ZOPFLI_NUM_LL],
+			d_counts:   [0; ZOPFLI_NUM_D],
 
 			ll_symbols: [0.0; ZOPFLI_NUM_LL],
 			d_symbols:  [0.0; ZOPFLI_NUM_D],
@@ -71,38 +72,38 @@ impl SymbolStats {
 impl SymbolStats {
 	/// # Add Previous Stats (Weighted).
 	///
-	/// This is essentially an `AddAssign` for `litlens` and `dists`. Each
+	/// This is essentially an `AddAssign` for `ll_counts` and `d_counts`. Each
 	/// previous value is halved and added to the corresponding current value.
 	pub(crate) fn add_last(
 		&mut self,
-		litlens: &[usize; ZOPFLI_NUM_LL],
-		dists: &[usize; ZOPFLI_NUM_D],
+		ll_counts: &[usize; ZOPFLI_NUM_LL],
+		d_counts: &[usize; ZOPFLI_NUM_D],
 	) {
-		for (l, r) in self.litlens.iter_mut().zip(litlens.iter().copied()) {
+		for (l, r) in self.ll_counts.iter_mut().zip(ll_counts.iter().copied()) {
 			*l += r.wrapping_div(2);
 		}
-		for (l, r) in self.dists.iter_mut().zip(dists.iter().copied()) {
+		for (l, r) in self.d_counts.iter_mut().zip(d_counts.iter().copied()) {
 			*l += r.wrapping_div(2);
 		}
 
 		// Set the end symbol.
-		self.litlens[256] = 1;
+		self.ll_counts[256] = 1;
 	}
 
 	/// # Clear Frequencies.
 	///
-	/// Set all `litlens` and `dists` to zero and return the originals.
+	/// Set all `ll_counts` and `d_counts` to zero and return the originals.
 	pub(crate) fn clear(&mut self) -> ([usize; ZOPFLI_NUM_LL], [usize; ZOPFLI_NUM_D]) {
-		let mut new_litlens = [0; ZOPFLI_NUM_LL];
-		let mut new_dists = [0; ZOPFLI_NUM_D];
-		std::mem::swap(&mut self.litlens, &mut new_litlens);
-		std::mem::swap(&mut self.dists, &mut new_dists);
-		(new_litlens, new_dists)
+		let mut last_ll = [0; ZOPFLI_NUM_LL];
+		let mut last_d = [0; ZOPFLI_NUM_D];
+		std::mem::swap(&mut self.ll_counts, &mut last_ll);
+		std::mem::swap(&mut self.d_counts, &mut last_d);
+		(last_ll, last_d)
 	}
 
 	/// # Calculate/Set Statistics.
 	///
-	/// This calculates the "entropy" of the `litlens` and `dists`, storing the
+	/// This calculates the "entropy" of the `ll_counts` and `d_counts`, storing the
 	/// results in the corresponding symbols arrays.
 	pub(crate) fn crunch(&mut self) {
 		#[allow(clippy::cast_precision_loss)]
@@ -119,36 +120,34 @@ impl SymbolStats {
 				for (&c, b) in count.iter().zip(bitlengths.iter_mut()) {
 					if c == 0 { *b = log2sum; }
 					else {
-						let mut v = log2sum - (c as f64).log2();
-						if v.is_sign_negative() { v = 0.0; }
-						*b = v;
+						*b = log2sum - (c as f64).log2();
+						if b.is_sign_negative() { *b = 0.0; }
 					}
 				}
 			}
 		}
 
-		calculate_entropy(&self.litlens, &mut self.ll_symbols);
-		calculate_entropy(&self.dists, &mut self.d_symbols);
+		calculate_entropy(&self.ll_counts, &mut self.ll_symbols);
+		calculate_entropy(&self.d_counts, &mut self.d_symbols);
 	}
 
-	#[allow(clippy::similar_names)]
 	/// # Load Statistics.
 	///
-	/// This updates the `litlens` and `dists` stats using the data from the
+	/// This updates the `ll_counts` and `d_counts` stats using the data from the
 	/// `ZopfliLZ77Store` store, then crunches the results.
 	pub(crate) fn load_store(&mut self, store: &LZ77Store) {
 		for e in &store.entries {
 			if e.dist <= 0 {
-				self.litlens[e.litlen as usize] += 1;
+				self.ll_counts[e.litlen as usize] += 1;
 			}
 			else {
-				self.litlens[e.ll_symbol as usize] += 1;
-				self.dists[e.d_symbol as usize] += 1;
+				self.ll_counts[e.ll_symbol as usize] += 1;
+				self.d_counts[e.d_symbol as usize] += 1;
 			}
 		}
 
 		// Set the end symbol and crunch.
-		self.litlens[256] = 1;
+		self.ll_counts[256] = 1;
 		self.crunch();
 	}
 
@@ -165,10 +164,10 @@ impl SymbolStats {
 				}
 			}
 		}
-		randomize_freqs(&mut self.litlens, state);
-		randomize_freqs(&mut self.dists, state);
+		randomize_freqs(&mut self.ll_counts, state);
+		randomize_freqs(&mut self.d_counts, state);
 
 		// Set the end symbol.
-		self.litlens[256] = 1;
+		self.ll_counts[256] = 1;
 	}
 }

From 348e423ccf8432b8d85b0ab1d2a2ea55849ec30e Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Wed, 17 Apr 2024 11:13:34 -0700
Subject: [PATCH 08/31] misc: consolidate caches into new ZopfliState structure

---
 src/image/lodepng.rs          |   3 +
 src/image/mod.rs              |   1 +
 src/image/zopflipng/blocks.rs | 173 +++++++-------
 src/image/zopflipng/cache.rs  |  95 +-------
 src/image/zopflipng/hash.rs   | 412 ++++++++++++++++++++--------------
 src/image/zopflipng/mod.rs    |   7 +-
 6 files changed, 325 insertions(+), 366 deletions(-)

diff --git a/src/image/lodepng.rs b/src/image/lodepng.rs
index 0b41bdd..774dffd 100644
--- a/src/image/lodepng.rs
+++ b/src/image/lodepng.rs
@@ -20,6 +20,7 @@ use super::{
 	deflate_part,
 	ffi::EncodedImage,
 	SplitPoints,
+	ZopfliState,
 };
 
 
@@ -51,6 +52,7 @@ pub(crate) extern "C" fn flaca_png_deflate(
 	}
 
 	// Initialize a reusable split-point buffer.
+	let mut state = ZopfliState::new();
 	let mut splits = SplitPoints::new();
 	let mut dst = ZopfliOut {
 		bp: 0,
@@ -69,6 +71,7 @@ pub(crate) extern "C" fn flaca_png_deflate(
 
 		// Crunch the part!
 		let res = deflate_part(
+			&mut state,
 			&mut splits,
 			numiterations,
 			last_part,
diff --git a/src/image/mod.rs b/src/image/mod.rs
index d018d72..737e5b3 100644
--- a/src/image/mod.rs
+++ b/src/image/mod.rs
@@ -16,6 +16,7 @@ use std::path::Path;
 use zopflipng::{
 	deflate_part,
 	SplitPoints,
+	ZopfliState,
 };
 
 
diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index 6270b7b..2cbe744 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -8,15 +8,12 @@ and ends that didn't make it into other modules.
 use dactyl::NoHash;
 use std::collections::HashSet;
 use super::{
-	CACHE,
 	DISTANCE_BITS,
 	DISTANCE_VALUES,
 	FIXED_TREE_D,
 	FIXED_TREE_LL,
-	HASH,
 	LENGTH_SYMBOLS_BITS_VALUES,
 	LZ77Store,
-	SqueezeCache,
 	stats::{
 		RanState,
 		SymbolStats,
@@ -27,6 +24,7 @@ use super::{
 	ZOPFLI_NUM_LL,
 	ZopfliError,
 	ZopfliOut,
+	ZopfliState,
 };
 
 
@@ -78,16 +76,12 @@ impl SplitPoints {
 	///
 	/// In terms of order-of-operations, this must be called _before_ the
 	/// second-stage LZ77 pass as it would otherwise blow away that data.
-	fn split_raw(&mut self, arr: &[u8], instart: usize) -> Result<usize, ZopfliError> {
+	fn split_raw(&mut self, arr: &[u8], instart: usize, state: &mut ZopfliState)
+	-> Result<usize, ZopfliError> {
 		// Populate an LZ77 store from a greedy pass. This results in better
 		// block choices than a full optimal pass.
 		let mut store = LZ77Store::new();
-		HASH.with_borrow_mut(|h| h.greedy(
-			arr,
-			instart,
-			&mut store,
-			None,
-		))?;
+		state.greedy(arr, instart, &mut store, None)?;
 
 		// Do an LZ77 pass.
 		let len = self.split_lz77(&store)?;
@@ -173,10 +167,10 @@ impl SplitPoints {
 		arr: &[u8],
 		instart: usize,
 		store: &mut LZ77Store,
-		squeeze: &mut SqueezeCache,
+		state: &mut ZopfliState,
 	) -> Result<&[usize], ZopfliError> {
 		// Start by splitting uncompressed.
-		let limit = self.split_raw(arr, instart)?.min(MAX_SPLIT_POINTS);
+		let limit = self.split_raw(arr, instart, state)?.min(MAX_SPLIT_POINTS);
 
 		let mut cost1 = 0;
 		let mut store2 = LZ77Store::new();
@@ -195,7 +189,7 @@ impl SplitPoints {
 				numiterations,
 				&mut store2,
 				&mut store3,
-				squeeze,
+				state,
 			)?;
 			cost1 += calculate_block_size_auto_type(&store2, 0, store2.len())?;
 
@@ -238,6 +232,7 @@ impl SplitPoints {
 /// More specifically, this explores different possible split points for the
 /// chunk, then writes the resulting blocks to the output file.
 pub(crate) fn deflate_part(
+	state: &mut ZopfliState,
 	splits: &mut SplitPoints,
 	numiterations: i32,
 	last_block: bool,
@@ -246,14 +241,13 @@ pub(crate) fn deflate_part(
 	out: &mut ZopfliOut,
 ) -> Result<(), ZopfliError> {
 	// Find the split points.
-	let mut squeeze = SqueezeCache::new();
 	let mut store = LZ77Store::new();
 	let best = splits.split(
 		numiterations,
 		arr,
 		instart,
 		&mut store,
-		&mut squeeze,
+		state,
 	)?;
 
 	// Write the data!
@@ -263,7 +257,7 @@ pub(crate) fn deflate_part(
 		add_lz77_block_auto_type(
 			i == best.len() && last_block,
 			&store,
-			&mut squeeze,
+			state,
 			arr.as_ptr(),
 			start,
 			end,
@@ -451,7 +445,7 @@ fn add_lz77_block(
 fn add_lz77_block_auto_type(
 	last_block: bool,
 	store: &LZ77Store,
-	squeeze: &mut SqueezeCache,
+	state: &mut ZopfliState,
 	arr: *const u8,
 	lstart: usize,
 	lend: usize,
@@ -476,7 +470,7 @@ fn add_lz77_block_auto_type(
 	if
 		(store.len() < 1000 || (fixed_cost as f64) <= (dynamic_cost as f64) * 1.1) &&
 		try_lz77_expensive_fixed(
-			store, squeeze, uncompressed_cost, dynamic_cost,
+			store, state, uncompressed_cost, dynamic_cost,
 			arr, lstart, lend, last_block,
 			expected_data_size, out,
 		)?
@@ -1092,95 +1086,85 @@ fn lz77_optimal(
 	numiterations: i32,
 	store: &mut LZ77Store,
 	scratch_store: &mut LZ77Store,
-	squeeze: &mut SqueezeCache,
+	state: &mut ZopfliState,
 ) -> Result<(), ZopfliError> {
 	// Easy abort.
 	if instart >= arr.len() { return Ok(()); }
 
 	// Reset the main cache for the current blocksize.
-	let blocksize = arr.len() - instart;
-	CACHE.with_borrow_mut(|c| c.init(blocksize));
-	squeeze.init(blocksize + 1);
+	state.init_lmc(arr.len() - instart);
+
+	// Greedy run.
+	scratch_store.clear();
+	state.greedy(arr, instart, scratch_store, Some(instart))?;
+
+	// Create new stats with the store (updated by the greedy pass).
+	let mut current_stats = SymbolStats::new();
+	current_stats.load_store(scratch_store);
+
+	// Set up dummy stats we can use to track best and last.
+	let mut ran = RanState::new();
+	let mut best_stats = SymbolStats::new();
+
+	// We'll also want dummy best and last costs.
+	let mut last_cost = 0;
+	let mut best_cost = usize::MAX;
 
-	HASH.with_borrow_mut(|h| {
-		// Greedy run.
+	// Repeat statistics with the cost model from the previous
+	// stat run.
+	let mut last_ran = -1;
+	for i in 0..numiterations.max(0) {
+		// Reset the LZ77 store.
 		scratch_store.clear();
-		h.greedy(
+
+		// Optimal run.
+		state.optimal_run(
 			arr,
 			instart,
+			Some(&current_stats),
 			scratch_store,
-			Some(instart),
 		)?;
 
-		// Create new stats with the store (updated by the greedy pass).
-		let mut current_stats = SymbolStats::new();
-		current_stats.load_store(scratch_store);
-
-		// Set up dummy stats we can use to track best and last.
-		let mut ran = RanState::new();
-		let mut best_stats = SymbolStats::new();
-
-		// We'll also want dummy best and last costs.
-		let mut last_cost = 0;
-		let mut best_cost = usize::MAX;
-
-		// Repeat statistics with the cost model from the previous
-		// stat run.
-		let mut last_ran = -1;
-		for i in 0..numiterations.max(0) {
-			// Reset the LZ77 store.
-			scratch_store.clear();
-
-			// Optimal run.
-			h.optimal_run(
-				arr,
-				instart,
-				Some(&current_stats),
-				squeeze,
-				scratch_store,
-			)?;
-
-			// This is the cost we actually care about.
-			let current_cost = calculate_block_size(
-				scratch_store,
-				0,
-				scratch_store.len(),
-				BlockType::Dynamic,
-			)?;
-
-			// We have a new best!
-			if current_cost < best_cost {
-				store.replace(scratch_store);
-				best_stats = current_stats;
-				best_cost = current_cost;
-			}
+		// This is the cost we actually care about.
+		let current_cost = calculate_block_size(
+			scratch_store,
+			0,
+			scratch_store.len(),
+			BlockType::Dynamic,
+		)?;
 
-			// Copy the stats to last_stats, clear them, and repopulate
-			// with the current store.
-			let (last_litlens, last_dists) = current_stats.clear();
-			current_stats.load_store(scratch_store);
+		// We have a new best!
+		if current_cost < best_cost {
+			store.replace(scratch_store);
+			best_stats = current_stats;
+			best_cost = current_cost;
+		}
 
-			// Once the randomness has kicked in, improve convergence by
-			// weighting the current and previous stats.
-			if last_ran != -1 {
-				current_stats.add_last(&last_litlens, &last_dists);
-				current_stats.crunch();
-			}
+		// Copy the stats to last_stats, clear them, and repopulate
+		// with the current store.
+		let (last_litlens, last_dists) = current_stats.clear();
+		current_stats.load_store(scratch_store);
 
-			// Replace the current stats with the best stats, randomize,
-			// and see what happens.
-			if 5 < i && current_cost == last_cost {
-				current_stats = best_stats;
-				current_stats.randomize(&mut ran);
-				current_stats.crunch();
-				last_ran = i;
-			}
+		// Once the randomness has kicked in, improve convergence by
+		// weighting the current and previous stats.
+		if last_ran != -1 {
+			current_stats.add_last(&last_litlens, &last_dists);
+			current_stats.crunch();
+		}
 
-			last_cost = current_cost;
+		// Replace the current stats with the best stats, randomize,
+		// and see what happens.
+		if 5 < i && current_cost == last_cost {
+			current_stats = best_stats;
+			current_stats.randomize(&mut ran);
+			current_stats.crunch();
+			last_ran = i;
 		}
 
-		Ok(())
-	})
+		last_cost = current_cost;
+	}
+
+	Ok(())
 }
 
 /// # Split Block Cost.
@@ -1293,7 +1277,7 @@ fn patch_distance_codes(d_lengths: &mut [u32; ZOPFLI_NUM_D]) {
 /// Returns `true` if data was written.
 fn try_lz77_expensive_fixed(
 	store: &LZ77Store,
-	squeeze: &mut SqueezeCache,
+	state: &mut ZopfliState,
 	uncompressed_cost: usize,
 	dynamic_cost: usize,
 	arr: *const u8,
@@ -1308,20 +1292,17 @@ fn try_lz77_expensive_fixed(
 	debug_assert!(lstart < store.entries.len());
 	let instart = unsafe { store.entries.get_unchecked(lstart).pos };
 	let inend = instart + get_lz77_byte_range(store, lstart, lend);
-	let blocksize = inend - instart;
 
 	// Run all the expensive fixed-cost checks.
-	CACHE.with_borrow_mut(|c| c.init(blocksize));
-	squeeze.init(blocksize + 1);
+	state.init_lmc(inend - instart);
 
 	// Pull the hasher.
-	HASH.with_borrow_mut(|h| h.optimal_run(
+	state.optimal_run(
 		unsafe { std::slice::from_raw_parts(arr, inend) },
 		instart,
 		None,
-		squeeze,
 		&mut fixed_store,
-	))?;
+	)?;
 
 	// Find the resulting cost.
 	let fixed_cost = calculate_block_size(
diff --git a/src/image/zopflipng/cache.rs b/src/image/zopflipng/cache.rs
index 3c51138..0cf337c 100644
--- a/src/image/zopflipng/cache.rs
+++ b/src/image/zopflipng/cache.rs
@@ -5,7 +5,6 @@ This module defines the longest match and squeeze cache structures, and hosts
 the thread-local LMC static.
 */
 
-use std::cell::RefCell;
 use super::{
 	ZOPFLI_MAX_MATCH,
 	ZOPFLI_MIN_MATCH,
@@ -14,14 +13,6 @@ use super::{
 
 
 
-thread_local!(
-	/// # Static Cache.
-	///
-	/// There is only ever one instance of the longest match cache active per
-	/// thread, so we might as well persist it to save on the allocations!
-	pub(crate) static CACHE: RefCell<MatchCache> = const { RefCell::new(MatchCache::new()) }
-);
-
 /// # Default Length (1) and Distance (0).
 ///
 /// Length and distance are always fetched/stored together, so are grouped into
@@ -50,7 +41,7 @@ pub(crate) struct MatchCache {
 
 impl MatchCache {
 	/// # New.
-	const fn new() -> Self {
+	pub(super) const fn new() -> Self {
 		Self {
 			ld: Vec::new(),
 			sublen: Vec::new(),
@@ -261,90 +252,6 @@ impl MatchCache {
 
 
 
-/// # Squeeze Scratchpad.
-///
-/// This structure is used to keep track of the data gathered during the
-/// forward/backward "squeeze" passes.
-///
-/// Similar to `MatchCache`, this structure is only ever really needed once per
-/// image so is frequently reset/reused to reduce allocation overhead.
-pub(crate) struct SqueezeCache {
-	pub(crate) costs: Vec<(f32, u16)>,
-	pub(crate) paths: Vec<u16>,
-}
-
-impl SqueezeCache {
-	/// # New.
-	pub(crate) const fn new() -> Self {
-		Self {
-			costs: Vec::new(),
-			paths: Vec::new(),
-		}
-	}
-
-	/// # Initialize/Reset.
-	///
-	/// This (potentially) resizes the cost and length vectors for the given
-	/// blocksize — which is `(inend - instart + 1)` by the way.
-	///
-	/// Unlike the `MatchCache`, this doesn't worry about setting the
-	/// appropriate values; `SqueezeCache::reset_costs` handles that.
-	///
-	/// The paths are unchanged by this method; subsequent calls to
-	/// `SqueezeCache::trace_paths` gets them sorted.
-	pub(crate) fn init(&mut self, blocksize: usize) {
-		// Resize if needed.
-		if blocksize != self.costs.len() {
-			self.costs.resize(blocksize, (f32::INFINITY, 0));
-		}
-	}
-
-	/// # Reset Costs.
-	///
-	/// This nudges all costs to infinity except the first, which is set to
-	/// zero instead.
-	pub(crate) fn reset_costs(&mut self) -> &mut [(f32, u16)] {
-		let slice = self.costs.as_mut_slice();
-		if ! slice.is_empty() {
-			for c in slice.iter_mut() { c.0 = f32::INFINITY; }
-			slice[0].0 = 0.0;
-		}
-		slice
-	}
-
-	#[allow(clippy::cast_possible_truncation)]
-	#[inline]
-	/// # Trace Paths.
-	///
-	/// Calculate the optimal path of lz77 lengths to use, from the
-	/// lengths gathered during the `ZopfliHash::get_best_lengths` pass.
-	pub(crate) fn trace_paths(&mut self) -> Option<&[u16]> {
-		let costs = self.costs.as_slice();
-		if costs.len() < 2 { return None; }
-
-		self.paths.truncate(0);
-		let mut idx = costs.len() - 1;
-		while 0 < idx && idx < costs.len() {
-			let v = costs[idx].1;
-			debug_assert!((1..=ZOPFLI_MAX_MATCH as u16).contains(&v));
-			if ! (1..=ZOPFLI_MAX_MATCH as u16).contains(&v) { return None; }
-
-			// Only lengths of at least ZOPFLI_MIN_MATCH count as lengths
-			// after tracing.
-			self.paths.push(
-				if v < ZOPFLI_MIN_MATCH as u16 { 1 } else { v }
-			);
-
-			// Move onto the next length or finish.
-			idx = idx.saturating_sub(usize::from(v));
-		}
-
-		Some(self.paths.as_slice())
-	}
-}
-
-
-
 /// # Max Sublength.
 ///
 /// Return the maximum sublength length for a given chunk.
diff --git a/src/image/zopflipng/hash.rs b/src/image/zopflipng/hash.rs
index d881962..c36a291 100644
--- a/src/image/zopflipng/hash.rs
+++ b/src/image/zopflipng/hash.rs
@@ -11,10 +11,7 @@ use std::{
 		handle_alloc_error,
 		Layout,
 	},
-	cell::{
-		Cell,
-		RefCell,
-	},
+	cell::Cell,
 	ptr::{
 		addr_of,
 		addr_of_mut,
@@ -22,12 +19,11 @@ use std::{
 	},
 };
 use super::{
-	CACHE,
 	DISTANCE_BITS,
 	DISTANCE_SYMBOLS,
 	LENGTH_SYMBOLS_BITS_VALUES,
 	LZ77Store,
-	SqueezeCache,
+	MatchCache,
 	stats::SymbolStats,
 	SUBLEN_LEN,
 	ZOPFLI_MAX_MATCH,
@@ -50,13 +46,232 @@ const MIN_COST_DISTANCES: [u8; 30] = [
 
 
 
-thread_local!(
-	/// # Static Hash.
+/// # Zopfli State.
+///
+/// This consolidates the Longest Match, Squeeze, and Hash caches into a single
+/// structure, cutting down on the number of references being bounced around
+/// from method to method.
+pub(crate) struct ZopfliState {
+	lmc: MatchCache,
+	hash: Box<ZopfliHash>,
+	costs: Vec<(f32, u16)>,
+	paths: Vec<u16>,
+}
+
+impl ZopfliState {
+	/// # New.
+	pub(crate) fn new() -> Self {
+		Self {
+			lmc: MatchCache::new(),
+			hash: ZopfliHash::new(),
+			costs: Vec::new(),
+			paths: Vec::new(),
+		}
+	}
+
+	/// # Initialize LMC/Squeeze Caches.
+	pub(crate) fn init_lmc(&mut self, blocksize: usize) {
+		self.lmc.init(blocksize);
+		if blocksize + 1 != self.costs.len() {
+			self.costs.resize(blocksize + 1, (f32::INFINITY, 0));
+		}
+	}
+}
+
+impl ZopfliState {
+	/// # Greedy Run.
+	#[allow(unsafe_code, clippy::cast_possible_truncation)]
+	/// # Greedy LZ77 Run.
 	///
-	/// There is only ever one instance of the hash active per thread, so we
-	/// might as well persist it to save on the allocations!
-	pub(super) static HASH: RefCell<Box<ZopfliHash>> = RefCell::new(ZopfliHash::new());
-);
+	/// This method looks for best-length matches in the data (and/or cache),
+	/// updating the store with the results.
+	///
+	/// This is one of two entrypoints into the inner `ZopfliHash` data.
+	pub(crate) fn greedy(
+		&mut self,
+		arr: &[u8],
+		instart: usize,
+		store: &mut LZ77Store,
+		cache: Option<usize>,
+	) -> Result<(), ZopfliError> {
+		// Reset the hash.
+		self.hash.reset(arr, instart);
+
+		// We'll need a few more variables…
+		let mut sublen = [0_u16; SUBLEN_LEN];
+		let mut length: u16 = 0;
+		let mut distance: u16 = 0;
+		let mut prev_length: u16 = 0;
+		let mut prev_distance: u16 = 0;
+		let mut match_available = false;
+
+		// Loop the data!
+		let mut i = instart;
+		while i < arr.len() {
+			// Update the hash.
+			self.hash.update_hash(&arr[i..], i);
+
+			// Run the finder.
+			self.hash.find(
+				arr,
+				i,
+				ZOPFLI_MAX_MATCH,
+				&mut sublen,
+				&mut distance,
+				&mut length,
+				&mut self.lmc,
+				cache,
+			)?;
+
+			// Lazy matching.
+			let length_score = get_length_score(length, distance);
+			let prev_length_score = get_length_score(prev_length, prev_distance);
+			if match_available {
+				match_available = false;
+
+				if length_score > prev_length_score + 1 {
+					// Safety: match_available starts false so even if instart
+					// is zero, we won't reach this part until we've iterated
+					// at least once.
+					store.push(
+						u16::from(unsafe { *arr.get_unchecked(i - 1) }),
+						0,
+						i - 1,
+					)?;
+					if length_score >= ZOPFLI_MIN_MATCH as u16 && length < ZOPFLI_MAX_MATCH as u16 {
+						match_available = true;
+						prev_length = length;
+						prev_distance = distance;
+
+						i += 1;
+						continue;
+					}
+				}
+				else {
+					// Old is new.
+					length = prev_length;
+					distance = prev_distance;
+
+					// Write the values!
+					store.push(length, distance, i - 1)?;
+
+					// Update the hash up through length and increment the loop
+					// position accordingly.
+					for _ in 2..length {
+						i += 1;
+						self.hash.update_hash(&arr[i..], i);
+					}
+
+					i += 1;
+					continue;
+				}
+			}
+			// No previous match, but maybe we can set it for the next
+			// iteration?
+			else if length_score >= ZOPFLI_MIN_MATCH as u16 && length < ZOPFLI_MAX_MATCH as u16 {
+				match_available = true;
+				prev_length = length;
+				prev_distance = distance;
+
+				i += 1;
+				continue;
+			}
+
+			// Write the current length/distance.
+			if length_score >= ZOPFLI_MIN_MATCH as u16 {
+				store.push(length, distance, i)?;
+			}
+			// Write from the source with no distance and reset the length to
+			// one.
+			else {
+				length = 1;
+				store.push(u16::from(arr[i]), 0, i)?;
+			}
+
+			// Update the hash up through length and increment the loop
+			// position accordingly.
+			for _ in 1..length {
+				i += 1;
+				self.hash.update_hash(&arr[i..], i);
+			}
+
+			i += 1;
+		}
+
+		Ok(())
+	}
+
+	/// # Optimal Run.
+	///
+	/// This performs backward/forward squeeze passes on the data, optionally
+	/// considering existing histogram data. The `store` is updated with the
+	/// best-length match data.
+	///
+	/// This is one of two entrypoints into the inner `ZopfliHash` data.
+	pub(crate) fn optimal_run(
+		&mut self,
+		arr: &[u8],
+		instart: usize,
+		stats: Option<&SymbolStats>,
+		store: &mut LZ77Store,
+	) -> Result<(), ZopfliError> {
+		let costs = self.costs.as_mut_slice();
+		if ! costs.is_empty() {
+			// Reset the costs.
+			for c in costs.iter_mut().skip(1) { c.0 = f32::INFINITY; }
+			costs[0].0 = 0.0;
+
+			// Reset and warm the hash.
+			self.hash.reset(arr, instart);
+
+			// Forward and backward squeeze passes.
+			self.hash.get_best_lengths(arr, instart, stats, costs, &mut self.lmc)?;
+			if self.trace_paths() {
+				self.hash.follow_paths(
+					arr,
+					instart,
+					self.paths.as_slice(),
+					store,
+					&mut self.lmc,
+				)?;
+			}
+		}
+
+		Ok(())
+	}
+}
+
+impl ZopfliState {
+	#[allow(clippy::cast_possible_truncation)]
+	#[inline]
+	/// # Trace Paths.
+	///
+	/// Calculate the optimal path of lz77 lengths to use, from the
+	/// lengths gathered during the `ZopfliHash::get_best_lengths` pass.
+	fn trace_paths(&mut self) -> bool {
+		let costs = self.costs.as_slice();
+		if costs.len() < 2 { return false; }
+
+		self.paths.truncate(0);
+		let mut idx = costs.len() - 1;
+		while 0 < idx && idx < costs.len() {
+			let v = costs[idx].1;
+			debug_assert!((1..=ZOPFLI_MAX_MATCH as u16).contains(&v));
+			if ! (1..=ZOPFLI_MAX_MATCH as u16).contains(&v) { return false; }
+
+			// Only lengths of at least ZOPFLI_MIN_MATCH count as lengths
+			// after tracing.
+			self.paths.push(
+				if v < ZOPFLI_MIN_MATCH as u16 { 1 } else { v }
+			);
+
+			// Move onto the next length or finish.
+			idx = idx.saturating_sub(usize::from(v));
+		}
+
+		true
+	}
+}
 
 
 
@@ -68,7 +283,7 @@ thread_local!(
 ///
 /// It is functionally equivalent to the original `hash.c` structure, but with
 /// more consistent member typing, sizing, and naming.
-pub(crate) struct ZopfliHash {
+struct ZopfliHash {
 	chain1: ZopfliHashChain,
 	chain2: ZopfliHashChain,
 
@@ -93,10 +308,10 @@ impl ZopfliHash {
 	/// The return value is allocated but **uninitialized**. Re/initialization
 	/// occurs subsequently when `ZopfliHash::reset` is called.
 	///
-	/// There are only two entrypoints into the thread-local static holding
-	/// this data — `ZopfliHash::greedy` and `ZopfliHash::optimal_run` — and
-	/// both reset as their _first order of business_, so in practice
-	/// everything is A-OK!
+	/// The only (crate)-facing entrypoints into this data are
+	/// `ZopfliState::greedy` and `ZopfliState::optimal_run`, both of which
+	/// call reset as their first order business, so in practice everything is
+	/// A-OK!
 	fn new() -> Box<Self> {
 		const LAYOUT: Layout = Layout::new::<ZopfliHash>();
 
@@ -201,30 +416,6 @@ impl ZopfliHash {
 }
 
 impl ZopfliHash {
-	/// # Optimal Run.
-	///
-	/// This performs backward/forward squeeze passes on the data, optionally
-	/// considering existing histogram data. The `store` is updated with the
-	/// best-length match data.
-	///
-	/// This is one of two possible entrypoints into the thread-local
-	/// `ZopfliHash` instance.
-	pub(crate) fn optimal_run(
-		&mut self,
-		arr: &[u8],
-		instart: usize,
-		stats: Option<&SymbolStats>,
-		squeeze: &mut SqueezeCache,
-		store: &mut LZ77Store,
-	) -> Result<(), ZopfliError> {
-		let costs = squeeze.reset_costs();
-		self.get_best_lengths(arr, instart, stats, costs)?;
-		if let Some(paths) = squeeze.trace_paths() {
-			self.follow_paths(arr, instart, paths, store)?;
-		}
-		Ok(())
-	}
-
 	#[allow(clippy::cast_possible_truncation)]
 	#[inline]
 	/// # Get Best Lengths.
@@ -244,10 +435,8 @@ impl ZopfliHash {
 		instart: usize,
 		stats: Option<&SymbolStats>,
 		costs: &mut [(f32, u16)],
+		lmc: &mut MatchCache,
 	) -> Result<(), ZopfliError> {
-		// Reset and warm the hash.
-		self.reset(arr, instart);
-
 		// Costs and lengths are resized prior to this point; they should be
 		// one larger than the data of interest (and equal to each other).
 		debug_assert!(costs.len() == arr.len() - instart + 1);
@@ -283,6 +472,7 @@ impl ZopfliHash {
 				&mut sublen,
 				&mut distance,
 				&mut length,
+				lmc,
 				Some(instart),
 			)?;
 
@@ -439,127 +629,6 @@ impl ZopfliHash {
 		else { false }
 	}
 
-	#[allow(unsafe_code, clippy::cast_possible_truncation)]
-	/// # Greedy LZ77 Run.
-	///
-	/// This method looks for best-length matches in the data (and/or cache),
-	/// updating the store with the results.
-	///
-	/// This is one of two entrypoints into the thread-local `ZopfliHash`
-	/// instance.
-	pub(crate) fn greedy(
-		&mut self,
-		arr: &[u8],
-		instart: usize,
-		store: &mut LZ77Store,
-		cache: Option<usize>,
-	) -> Result<(), ZopfliError> {
-		// Reset the hash.
-		self.reset(arr, instart);
-
-		// We'll need a few more variables…
-		let mut sublen = [0_u16; SUBLEN_LEN];
-		let mut length: u16 = 0;
-		let mut distance: u16 = 0;
-		let mut prev_length: u16 = 0;
-		let mut prev_distance: u16 = 0;
-		let mut match_available = false;
-
-		// Loop the data!
-		let mut i = instart;
-		while i < arr.len() {
-			// Update the hash.
-			self.update_hash(&arr[i..], i);
-
-			// Run the finder.
-			self.find(
-				arr,
-				i,
-				ZOPFLI_MAX_MATCH,
-				&mut sublen,
-				&mut distance,
-				&mut length,
-				cache,
-			)?;
-
-			// Lazy matching.
-			let length_score = get_length_score(length, distance);
-			let prev_length_score = get_length_score(prev_length, prev_distance);
-			if match_available {
-				match_available = false;
-
-				if length_score > prev_length_score + 1 {
-					// Safety: match_available starts false so even if instart
-					// is zero, we won't reach this part until we've iterated
-					// at least once.
-					store.push(
-						u16::from(unsafe { *arr.get_unchecked(i - 1) }),
-						0,
-						i - 1,
-					)?;
-					if length_score >= ZOPFLI_MIN_MATCH as u16 && length < ZOPFLI_MAX_MATCH as u16 {
-						match_available = true;
-						prev_length = length;
-						prev_distance = distance;
-
-						i += 1;
-						continue;
-					}
-				}
-				else {
-					// Old is new.
-					length = prev_length;
-					distance = prev_distance;
-
-					// Write the values!
-					store.push(length, distance, i - 1)?;
-
-					// Update the hash up through length and increment the loop
-					// position accordingly.
-					for _ in 2..length {
-						i += 1;
-						self.update_hash(&arr[i..], i);
-					}
-
-					i += 1;
-					continue;
-				}
-			}
-			// No previous match, but maybe we can set it for the next
-			// iteration?
-			else if length_score >= ZOPFLI_MIN_MATCH as u16 && length < ZOPFLI_MAX_MATCH as u16 {
-				match_available = true;
-				prev_length = length;
-				prev_distance = distance;
-
-				i += 1;
-				continue;
-			}
-
-			// Write the current length/distance.
-			if length_score >= ZOPFLI_MIN_MATCH as u16 {
-				store.push(length, distance, i)?;
-			}
-			// Write from the source with no distance and reset the length to
-			// one.
-			else {
-				length = 1;
-				store.push(u16::from(arr[i]), 0, i)?;
-			}
-
-			// Update the hash up through length and increment the loop
-			// position accordingly.
-			for _ in 1..length {
-				i += 1;
-				self.update_hash(&arr[i..], i);
-			}
-
-			i += 1;
-		}
-
-		Ok(())
-	}
-
 	#[allow(clippy::cast_possible_truncation)]
 	/// # Follow Paths.
 	///
@@ -571,6 +640,7 @@ impl ZopfliHash {
 		instart: usize,
 		paths: &[u16],
 		store: &mut LZ77Store,
+		lmc: &mut MatchCache,
 	) -> Result<(), ZopfliError> {
 		// Easy abort.
 		if instart >= arr.len() { return Ok(()); }
@@ -597,6 +667,7 @@ impl ZopfliHash {
 					&mut [],
 					&mut dist,
 					&mut test_length,
+					lmc,
 					Some(instart),
 				)?;
 
@@ -645,17 +716,18 @@ impl ZopfliHash {
 		sublen: &mut [u16],
 		distance: &mut u16,
 		length: &mut u16,
+		lmc: &mut MatchCache,
 		cache: Option<usize>,
 	) -> Result<(), ZopfliError> {
 		// Check the longest match cache first!
 		if let Some(blockstart) = cache {
-			if CACHE.with_borrow(|c| c.find(
+			if lmc.find(
 				pos - blockstart,
 				&mut limit,
 				sublen,
 				distance,
 				length,
-			))? {
+			)? {
 				if pos + usize::from(*length) <= arr.len() { return Ok(()); }
 				return Err(ZopfliError::MatchRange);
 			}
@@ -682,9 +754,7 @@ impl ZopfliHash {
 		// Cache the results for next time, maybe.
 		if let Some(blockstart) = cache {
 			if limit == ZOPFLI_MAX_MATCH && ! sublen.is_empty() {
-				CACHE.with_borrow_mut(|c|
-					c.set_sublen(pos - blockstart, sublen, bestdist, bestlength)
-				);
+				lmc.set_sublen(pos - blockstart, sublen, bestdist, bestlength);
 			}
 		}
 
@@ -872,7 +942,7 @@ impl ZopfliHash {
 /// The remaining "sign" bit is logically repurposed to serve as a sort of
 /// `None`, allowing us to cheaply identify unwritten values. (Testing for that
 /// takes care of bounds checking on the lower end.)
-pub(crate) struct ZopfliHashChain {
+struct ZopfliHashChain {
 	/// Hash value to (most recent) index.
 	///
 	/// Note: the original (head/head2) `hash.c` implementation was
diff --git a/src/image/zopflipng/mod.rs b/src/image/zopflipng/mod.rs
index 76d5c89..42f33fd 100644
--- a/src/image/zopflipng/mod.rs
+++ b/src/image/zopflipng/mod.rs
@@ -27,12 +27,9 @@ pub(crate) use blocks::{
 	deflate_part,
 	SplitPoints,
 };
-use cache::{
-	CACHE,
-	SqueezeCache,
-};
+use cache::MatchCache;
 use error::ZopfliError;
-use hash::HASH;
+pub(crate) use hash::ZopfliState;
 use lz77::LZ77Store;
 use kat::zopfli_length_limited_code_lengths;
 use super::{

From 55f56f3e25b343ffc9655ebbb10ff51e7c463bbd Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Wed, 17 Apr 2024 12:17:23 -0700
Subject: [PATCH 09/31] perf: improve store reuse

---
 src/image/zopflipng/blocks.rs | 36 +++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index 2cbe744..51cf93f 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -69,6 +69,7 @@ impl SplitPoints {
 }
 
 impl SplitPoints {
+	#[inline]
 	/// # Uncompressed Split Pass.
 	///
 	/// This sets the uncompressed split points, by way of first setting the
@@ -76,15 +77,14 @@ impl SplitPoints {
 	///
 	/// In terms of order-of-operations, this must be called _before_ the
 	/// second-stage LZ77 pass as it would otherwise blow away that data.
-	fn split_raw(&mut self, arr: &[u8], instart: usize, state: &mut ZopfliState)
+	fn split_raw(&mut self, arr: &[u8], instart: usize, state: &mut ZopfliState, store: &mut LZ77Store)
 	-> Result<usize, ZopfliError> {
 		// Populate an LZ77 store from a greedy pass. This results in better
 		// block choices than a full optimal pass.
-		let mut store = LZ77Store::new();
-		state.greedy(arr, instart, &mut store, None)?;
+		state.greedy(arr, instart, store, None)?;
 
 		// Do an LZ77 pass.
-		let len = self.split_lz77(&store)?;
+		let len = self.split_lz77(store)?;
 
 		// Find the corresponding uncompressed positions.
 		if 0 < len && len <= MAX_SPLIT_POINTS {
@@ -157,6 +157,7 @@ impl SplitPoints {
 	}
 
 	#[allow(unsafe_code)]
+	#[inline]
 	/// # Split Best.
 	///
 	/// Compare the optimal raw split points with a dedicated lz77 pass and
@@ -167,13 +168,13 @@ impl SplitPoints {
 		arr: &[u8],
 		instart: usize,
 		store: &mut LZ77Store,
+		store2: &mut LZ77Store,
 		state: &mut ZopfliState,
 	) -> Result<&[usize], ZopfliError> {
 		// Start by splitting uncompressed.
-		let limit = self.split_raw(arr, instart, state)?.min(MAX_SPLIT_POINTS);
+		let limit = self.split_raw(arr, instart, state, store2)?.min(MAX_SPLIT_POINTS);
 
 		let mut cost1 = 0;
-		let mut store2 = LZ77Store::new();
 		let mut store3 = LZ77Store::new();
 		for i in 0..=limit {
 			let start = if i == 0 { instart } else { self.slice1[i - 1] };
@@ -187,14 +188,14 @@ impl SplitPoints {
 				unsafe { arr.get_unchecked(..end) },
 				start,
 				numiterations,
-				&mut store2,
+				store2,
 				&mut store3,
 				state,
 			)?;
-			cost1 += calculate_block_size_auto_type(&store2, 0, store2.len())?;
+			cost1 += calculate_block_size_auto_type(store2, 0, store2.len())?;
 
 			// Append its data to our main store.
-			store.append(&store2);
+			store.append(store2);
 
 			// Save the chunk size to our best.
 			if i < limit { self.slice2[i] = store.len(); }
@@ -242,11 +243,13 @@ pub(crate) fn deflate_part(
 ) -> Result<(), ZopfliError> {
 	// Find the split points.
 	let mut store = LZ77Store::new();
+	let mut store2 = LZ77Store::new();
 	let best = splits.split(
 		numiterations,
 		arr,
 		instart,
 		&mut store,
+		&mut store2,
 		state,
 	)?;
 
@@ -257,6 +260,7 @@ pub(crate) fn deflate_part(
 		add_lz77_block_auto_type(
 			i == best.len() && last_block,
 			&store,
+			&mut store2,
 			state,
 			arr.as_ptr(),
 			start,
@@ -438,6 +442,7 @@ fn add_lz77_block(
 	clippy::cast_sign_loss,
 	clippy::too_many_arguments,
 )]
+#[inline]
 /// # Add LZ77 Block (Automatic Type).
 ///
 /// This calculates the expected output sizes for all three block types, then
@@ -445,6 +450,7 @@ fn add_lz77_block(
 fn add_lz77_block_auto_type(
 	last_block: bool,
 	store: &LZ77Store,
+	fixed_store: &mut LZ77Store,
 	state: &mut ZopfliState,
 	arr: *const u8,
 	lstart: usize,
@@ -470,7 +476,7 @@ fn add_lz77_block_auto_type(
 	if
 		(store.len() < 1000 || (fixed_cost as f64) <= (dynamic_cost as f64) * 1.1) &&
 		try_lz77_expensive_fixed(
-			store, state, uncompressed_cost, dynamic_cost,
+			store, fixed_store, state, uncompressed_cost, dynamic_cost,
 			arr, lstart, lend, last_block,
 			expected_data_size, out,
 		)?
@@ -1269,6 +1275,7 @@ fn patch_distance_codes(d_lengths: &mut [u32; ZOPFLI_NUM_D]) {
 }
 
 #[allow(unsafe_code, clippy::too_many_arguments)]
+#[inline(never)]
 /// # (Maybe) Add LZ77 Expensive Fixed Block.
 ///
 /// This runs the full suite of fixed-tree tests on the data and writes it to
@@ -1277,6 +1284,7 @@ fn patch_distance_codes(d_lengths: &mut [u32; ZOPFLI_NUM_D]) {
 /// Returns `true` if data was written.
 fn try_lz77_expensive_fixed(
 	store: &LZ77Store,
+	fixed_store: &mut LZ77Store,
 	state: &mut ZopfliState,
 	uncompressed_cost: usize,
 	dynamic_cost: usize,
@@ -1287,7 +1295,6 @@ fn try_lz77_expensive_fixed(
 	expected_data_size: usize,
 	out: &mut ZopfliOut,
 ) -> Result<bool, ZopfliError> {
-	let mut fixed_store = LZ77Store::new();
 	// Safety: the split points are checked at creation.
 	debug_assert!(lstart < store.entries.len());
 	let instart = unsafe { store.entries.get_unchecked(lstart).pos };
@@ -1297,16 +1304,17 @@ fn try_lz77_expensive_fixed(
 	state.init_lmc(inend - instart);
 
 	// Pull the hasher.
+	fixed_store.clear();
 	state.optimal_run(
 		unsafe { std::slice::from_raw_parts(arr, inend) },
 		instart,
 		None,
-		&mut fixed_store,
+		fixed_store,
 	)?;
 
 	// Find the resulting cost.
 	let fixed_cost = calculate_block_size(
-		&fixed_store,
+		fixed_store,
 		0,
 		fixed_store.len(),
 		BlockType::Fixed,
@@ -1316,7 +1324,7 @@ fn try_lz77_expensive_fixed(
 	// fixed and dynamic, it's the best and worth writing!
 	if fixed_cost < dynamic_cost && (fixed_cost <= uncompressed_cost || dynamic_cost <= uncompressed_cost) {
 		add_lz77_block(
-			BlockType::Fixed, last_block, &fixed_store, arr, 0, fixed_store.len(),
+			BlockType::Fixed, last_block, fixed_store, arr, 0, fixed_store.len(),
 			expected_data_size, out,
 		)
 			.map(|()| true)

From 9975cd018c0771a806ea8c89d695b7491bb9cc91 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Thu, 18 Apr 2024 10:28:16 -0700
Subject: [PATCH 10/31] ci: add png bench recipe

---
 justfile                 |  9 +++++++++
 skel/{ => assets}/png.b3 | 22 +++++++++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)
 rename skel/{ => assets}/png.b3 (67%)

diff --git a/justfile b/justfile
index b7bc412..f4d0b50 100644
--- a/justfile
+++ b/justfile
@@ -57,6 +57,15 @@ export CXX := "clang++"
 	mv "{{ justfile_directory() }}/target" "{{ cargo_dir }}"
 
 
+# Bench PNG Compression.
+[no-cd]
+@bench-png BIN:
+	[ -f "{{ BIN }}" ] || exit 1
+	just _bench-reset
+	"{{ absolute_path(BIN) }}" -p --no-jpeg "{{ bench_dir }}"
+	cd "{{ bench_dir }}" && b3sum -c png.b3 --quiet
+
+
 @clean:
 	# Most things go here.
 	[ ! -d "{{ cargo_dir }}" ] || rm -rf "{{ cargo_dir }}"
diff --git a/skel/png.b3 b/skel/assets/png.b3
similarity index 67%
rename from skel/png.b3
rename to skel/assets/png.b3
index d60d8dc..fe0f5de 100644
--- a/skel/png.b3
+++ b/skel/assets/png.b3
@@ -1,11 +1,11 @@
-b53eca68578abeab039f7492aa08cc08aca2ee8373f5f35e4d51cfec69ab5c89  /tmp/bench-data/png/01.png
-385eb82425d1d39d84155ad812d7a2e9c7f4f989b9be8901f227d9b0b3716e60  /tmp/bench-data/png/02.png
-a71b84ad1980fb75b0357c9a2440a53af16363f66aaed484559ea95075f85d48  /tmp/bench-data/png/03.png
-71f3a81b8f0800987434a87a256f88fc0fa992a0082e9dc94cd47a7bf69cb7f6  /tmp/bench-data/png/04.png
-f305df54a410b781b5798c96c2423a90fd80744fa6a1b6e42938b866ae465bcc  /tmp/bench-data/png/05.png
-3cbbc3be80e8de2d9a3c1d97b11ca110d2922c0dcfcb7de1ff6952fa0addbe0b  /tmp/bench-data/png/06.png
-97fff17e466418ba255a5a1ecbe463c72d03bfca7c7c597f0ee8f831588a51e5  /tmp/bench-data/png/poe.png
-4da3d1e65804d00be0d376234c3c1ee25469105bf32966e05adb65e870d6eb5e  /tmp/bench-data/png/small.png
-e0eb307123913aa8b3a864790ab904d340a7e4e85192d27cb1d16e15c14d3319  /tmp/bench-data/png/small-bw.png
-3ca8474dff6526979ca43cba37a71b9ffdaa9646bd89811c3abdce0b56b85ca2  /tmp/bench-data/png/small-bwa.png
-6d73e93975587d16ce907ace61f7ac818aaa2c0f2884ecb489112c8d6f5c5aac  /tmp/bench-data/wolf.jpg
+b53eca68578abeab039f7492aa08cc08aca2ee8373f5f35e4d51cfec69ab5c89  ./png/01.png
+385eb82425d1d39d84155ad812d7a2e9c7f4f989b9be8901f227d9b0b3716e60  ./png/02.png
+a71b84ad1980fb75b0357c9a2440a53af16363f66aaed484559ea95075f85d48  ./png/03.png
+71f3a81b8f0800987434a87a256f88fc0fa992a0082e9dc94cd47a7bf69cb7f6  ./png/04.png
+f305df54a410b781b5798c96c2423a90fd80744fa6a1b6e42938b866ae465bcc  ./png/05.png
+3cbbc3be80e8de2d9a3c1d97b11ca110d2922c0dcfcb7de1ff6952fa0addbe0b  ./png/06.png
+97fff17e466418ba255a5a1ecbe463c72d03bfca7c7c597f0ee8f831588a51e5  ./png/poe.png
+4da3d1e65804d00be0d376234c3c1ee25469105bf32966e05adb65e870d6eb5e  ./png/small.png
+e0eb307123913aa8b3a864790ab904d340a7e4e85192d27cb1d16e15c14d3319  ./png/small-bw.png
+3ca8474dff6526979ca43cba37a71b9ffdaa9646bd89811c3abdce0b56b85ca2  ./png/small-bwa.png
+6d73e93975587d16ce907ace61f7ac818aaa2c0f2884ecb489112c8d6f5c5aac  ./wolf.jpg

From 281e8f1acdf147a79a8ca5fe17d26fc295911d49 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Thu, 18 Apr 2024 10:34:05 -0700
Subject: [PATCH 11/31] perf: set LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS (since
 we're stripping them anyway)

---
 build.rs             | 12 +++++++++---
 src/image/lodepng.rs |  2 --
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/build.rs b/build.rs
index 140d1f9..a103e7b 100644
--- a/build.rs
+++ b/build.rs
@@ -73,19 +73,20 @@ fn build_ffi() {
 
 	// Build Zopfli first.
 	let mut c = cc::Build::new();
-	c.includes(&[repo, &lodepng_src, &zopfli_src])
+	c.includes([repo, &lodepng_src, &zopfli_src])
 		.cpp(false)
 		.flag_if_supported("-W")
 		.flag_if_supported("-ansi")
 		.flag_if_supported("-pedantic")
 		.pic(true)
 		.static_flag(true)
-		.files(&[
+		.files([
 			zopfli_src.join("zopfli.c"),
 			lodepng_src.join("lodepng.c"),
 		])
-		.define("LODEPNG_NO_COMPILE_DISK", None)
+		.define("LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS", None)
 		.define("LODEPNG_NO_COMPILE_CPP", None)
+		.define("LODEPNG_NO_COMPILE_DISK", None)
 		.compile("zopflipng");
 
 	bindings(repo, &lodepng_src, &zopfli_src);
@@ -187,6 +188,11 @@ pub(crate) const DISTANCE_VALUES: &[u16; 32_768] = &[");
 /// commented-out code can be re-enabled if they ever need to be updated.
 fn bindings(repo: &Path, lodepng_src: &Path, zopfli_src: &Path) {
 	let bindings = bindgen::Builder::default()
+		.clang_args([
+			"-DLODEPNG_NO_COMPILE_ANCILLARY_CHUNKS",
+			"-DLODEPNG_NO_COMPILE_CPP",
+			"-DLODEPNG_NO_COMPILE_DISK",
+		])
 		.header(lodepng_src.join("lodepng.h").to_string_lossy())
 		.header(repo.join("rust.h").to_string_lossy())
 		.header(zopfli_src.join("zopfli.h").to_string_lossy())
diff --git a/src/image/lodepng.rs b/src/image/lodepng.rs
index 774dffd..0a242ad 100644
--- a/src/image/lodepng.rs
+++ b/src/image/lodepng.rs
@@ -287,8 +287,6 @@ impl LodePNGState {
 
 		enc.encoder.filter_palette_zero = 0;
 		enc.encoder.filter_strategy = strategy;
-		enc.encoder.add_id = 0;
-		enc.encoder.text_compression = 1;
 
 		// For final compression, enable the custom zopfli deflater.
 		if slow {

From 9be5f6a1395eaefc43290e042d688850d3c71fec Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Thu, 18 Apr 2024 12:04:13 -0700
Subject: [PATCH 12/31] cleanup: refactor ZopfliError to make it more
 debug_assertion-like misc: promote debug assertions to condition/Err, unless
 super hot

---
 src/image/lodepng.rs          | 10 ++--
 src/image/zopflipng/blocks.rs | 67 ++++++++++++-------------
 src/image/zopflipng/cache.rs  | 27 +++++-----
 src/image/zopflipng/error.rs  | 92 ++++++++++++++++++++++-------------
 src/image/zopflipng/hash.rs   | 18 +++----
 src/image/zopflipng/kat.rs    |  7 ++-
 src/image/zopflipng/lz77.rs   | 51 +++++++++----------
 src/image/zopflipng/mod.rs    |  5 +-
 8 files changed, 150 insertions(+), 127 deletions(-)

diff --git a/src/image/lodepng.rs b/src/image/lodepng.rs
index 0a242ad..600604a 100644
--- a/src/image/lodepng.rs
+++ b/src/image/lodepng.rs
@@ -80,13 +80,11 @@ pub(crate) extern "C" fn flaca_png_deflate(
 			&mut dst,
 		);
 
-		if res.is_err() {
-			// Force a panic in debug mode.
-			#[cfg(debug_assertions)] panic!("(zopfli) {res:?}");
+		#[cfg(debug_assertions)] if let Err(e) = res { panic!("{e}"); }
 
-			// Otherwise just let lodepng know we failed.
-			return 1;
-		}
+		// For non-debug purposes, just let lodepng know we failed when thre's
+		// an error so it can skip the rest of the processing.
+		if res.is_err() { return 1; }
 
 		// Onward and upward!
 		i += size;
diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index 51cf93f..246e6f6 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -18,6 +18,7 @@ use super::{
 		RanState,
 		SymbolStats,
 	},
+	zopfli_error,
 	zopfli_length_limited_code_lengths,
 	zopfli_lengths_to_symbols,
 	ZOPFLI_NUM_D,
@@ -121,7 +122,7 @@ impl SplitPoints {
 		loop {
 			let (llpos, llcost) = find_minimum_cost(store, lstart + 1, lend)?;
 			if llpos <= lstart || llpos >= lend {
-				return Err(ZopfliError::SplitRange);
+				return Err(zopfli_error!());
 			}
 
 			// Ignore points we've already covered.
@@ -179,6 +180,9 @@ impl SplitPoints {
 		for i in 0..=limit {
 			let start = if i == 0 { instart } else { self.slice1[i - 1] };
 			let end = if i < limit { self.slice1[i] } else { arr.len() };
+
+			// This assertion is redundant as we explicitly check range sanity
+			// earlier and later in the pipeline.
 			debug_assert!(start <= end && end <= arr.len());
 
 			// Make another store.
@@ -262,7 +266,7 @@ pub(crate) fn deflate_part(
 			&store,
 			&mut store2,
 			state,
-			arr.as_ptr(),
+			arr,
 			start,
 			end,
 			0,
@@ -381,7 +385,7 @@ fn add_lz77_block(
 	btype: BlockType,
 	last_block: bool,
 	store: &LZ77Store,
-	arr: *const u8,
+	arr: &[u8],
 	lstart: usize,
 	lend: usize,
 	expected_data_size: usize,
@@ -389,11 +393,8 @@ fn add_lz77_block(
 ) -> Result<(), ZopfliError> {
 	// Uncompressed blocks are easy!
 	if matches!(btype, BlockType::Uncompressed) {
-		let length = get_lz77_byte_range(store, lstart, lend);
-		let pos =
-			if lstart >= lend { 0 }
-			else { store.entries[lstart].pos };
-		out.add_uncompressed_block(last_block, arr, pos, pos + length);
+		let (instart, inend) = get_lz77_byte_range(store, lstart, lend)?;
+		out.add_uncompressed_block(last_block, arr.as_ptr(), instart, inend);
 		return Ok(());
 	}
 
@@ -452,7 +453,7 @@ fn add_lz77_block_auto_type(
 	store: &LZ77Store,
 	fixed_store: &mut LZ77Store,
 	state: &mut ZopfliState,
-	arr: *const u8,
+	arr: &[u8],
 	lstart: usize,
 	lend: usize,
 	expected_data_size: usize,
@@ -535,9 +536,9 @@ fn add_lz77_data(
 		// Length only.
 		if e.dist <= 0 {
 			if (e.litlen as u16) >= 256 {
-				return Err(ZopfliError::LitLenLiteral);
+				return Err(zopfli_error!());
 			}
-			if ll_lengths[e.litlen as usize] == 0 { return Err(ZopfliError::NoLength); }
+			if ll_lengths[e.litlen as usize] == 0 { return Err(zopfli_error!()); }
 
 			out.add_huffman_bits(
 				ll_symbols[e.litlen as usize],
@@ -548,7 +549,7 @@ fn add_lz77_data(
 		// Length and distance.
 		else {
 			let (symbol, bits, value) = LENGTH_SYMBOLS_BITS_VALUES[e.litlen as usize];
-			if ll_lengths[symbol as usize] == 0 { return Err(ZopfliError::NoLength); }
+			if ll_lengths[symbol as usize] == 0 { return Err(zopfli_error!()); }
 
 			out.add_huffman_bits(
 				ll_symbols[symbol as usize],
@@ -557,7 +558,7 @@ fn add_lz77_data(
 			out.add_bits(u32::from(value), u32::from(bits));
 
 			// Now the distance bits.
-			if d_lengths[e.d_symbol as usize] == 0 { return Err(ZopfliError::NoDistance); }
+			if d_lengths[e.d_symbol as usize] == 0 { return Err(zopfli_error!()); }
 			out.add_huffman_bits(
 				d_symbols[e.d_symbol as usize],
 				d_lengths[e.d_symbol as usize],
@@ -572,7 +573,7 @@ fn add_lz77_data(
 	}
 
 	if expected_data_size == 0 || test_size == expected_data_size { Ok(()) }
-	else { Err(ZopfliError::Write) }
+	else { Err(zopfli_error!()) }
 }
 
 /// # Calculate Block Size (in Bits).
@@ -584,11 +585,12 @@ fn calculate_block_size(
 ) -> Result<usize, ZopfliError> {
 	match btype {
 		BlockType::Uncompressed => {
-			let length = get_lz77_byte_range(store, lstart, lend);
-			let blocks = length.div_ceil(65_535);
+			let (instart, inend) = get_lz77_byte_range(store, lstart, lend)?;
+			let blocksize = inend - instart;
 
 			// Blocks larger than u16::MAX need to be split.
-			Ok(blocks * 40 + length * 8)
+			let blocks = blocksize.div_ceil(65_535);
+			Ok(blocks * 40 + blocksize * 8)
 		},
 		BlockType::Fixed =>
 			Ok(calculate_block_symbol_size(
@@ -826,7 +828,7 @@ fn encode_tree(
 	while i < lld_total {
 		let mut count = 1;
 		let symbol = length_or_distance(i);
-		if symbol >= 19 { return Err(ZopfliError::TreeSymbol); }
+		if symbol >= 19 { return Err(zopfli_error!()); }
 
 		// Peek ahead; we may be able to do more in one go.
 		if use_16 || (symbol == 0 && (use_17 || use_18)) {
@@ -1062,20 +1064,21 @@ fn get_dynamic_lengths(
 	)
 }
 
-#[allow(unsafe_code)]
-/// # Symbol Spans in Raw Bytes.
+/// # Symbol Span Range.
+///
+/// Convert an LZ77 range to the start/end positions of the block.
 fn get_lz77_byte_range(
 	store: &LZ77Store,
 	lstart: usize,
 	lend: usize,
-) -> usize {
-	if lstart >= lend { 0 }
-	else {
-		// Safety: the split points are checked at creation.
-		debug_assert!(lend <= store.entries.len());
-		let e = unsafe { store.entries.get_unchecked(lend - 1) };
-		e.length() as usize + e.pos - store.entries[lstart].pos
+) -> Result<(usize, usize), ZopfliError> {
+	let slice = store.entries.as_slice();
+	if lstart < lend && lend <= slice.len() {
+		let instart = slice[lstart].pos;
+		let e = slice[lend - 1];
+		Ok((instart, e.length() as usize + e.pos))
 	}
+	else { Err(zopfli_error!()) }
 }
 
 /// # Optimal LZ77.
@@ -1274,7 +1277,7 @@ fn patch_distance_codes(d_lengths: &mut [u32; ZOPFLI_NUM_D]) {
 	}
 }
 
-#[allow(unsafe_code, clippy::too_many_arguments)]
+#[allow(clippy::too_many_arguments)]
 #[inline(never)]
 /// # (Maybe) Add LZ77 Expensive Fixed Block.
 ///
@@ -1288,7 +1291,7 @@ fn try_lz77_expensive_fixed(
 	state: &mut ZopfliState,
 	uncompressed_cost: usize,
 	dynamic_cost: usize,
-	arr: *const u8,
+	arr: &[u8],
 	lstart: usize,
 	lend: usize,
 	last_block: bool,
@@ -1296,9 +1299,7 @@ fn try_lz77_expensive_fixed(
 	out: &mut ZopfliOut,
 ) -> Result<bool, ZopfliError> {
 	// Safety: the split points are checked at creation.
-	debug_assert!(lstart < store.entries.len());
-	let instart = unsafe { store.entries.get_unchecked(lstart).pos };
-	let inend = instart + get_lz77_byte_range(store, lstart, lend);
+	let (instart, inend) = get_lz77_byte_range(store, lstart, lend)?;
 
 	// Run all the expensive fixed-cost checks.
 	state.init_lmc(inend - instart);
@@ -1306,7 +1307,7 @@ fn try_lz77_expensive_fixed(
 	// Pull the hasher.
 	fixed_store.clear();
 	state.optimal_run(
-		unsafe { std::slice::from_raw_parts(arr, inend) },
+		arr.get(..inend).ok_or(zopfli_error!())?,
 		instart,
 		None,
 		fixed_store,
diff --git a/src/image/zopflipng/cache.rs b/src/image/zopflipng/cache.rs
index 0cf337c..407e8c3 100644
--- a/src/image/zopflipng/cache.rs
+++ b/src/image/zopflipng/cache.rs
@@ -6,6 +6,7 @@ the thread-local LMC static.
 */
 
 use super::{
+	zopfli_error,
 	ZOPFLI_MAX_MATCH,
 	ZOPFLI_MIN_MATCH,
 	ZopfliError,
@@ -135,7 +136,7 @@ impl MatchCache {
 						usize::from(*length) >= ZOPFLI_MIN_MATCH &&
 						*distance != cache_dist
 					{
-						return Err(ZopfliError::LMCDistance);
+						return Err(zopfli_error!());
 					}
 				}
 
@@ -174,28 +175,20 @@ impl MatchCache {
 		sublen: &[u16],
 		distance: u16,
 		length: u16,
-	) {
-		let (cache_len, cache_dist) = self.ld(pos);
-		if cache_len == 0 || cache_dist != 0 { return; }
-		debug_assert_eq!(
-			(cache_len, cache_dist),
-			(1, 0),
-			"Length and/or distance are already cached!"
-		);
+	) -> Result<(), ZopfliError> {
+		let old_ld = self.ld(pos);
+		if old_ld.0 == 0 || old_ld.1 != 0 { return Ok(()); }
+		else if old_ld != (1, 0) { return Err(zopfli_error!()); }
 
 		// The sublength isn't cacheable, but that fact is itself worth
 		// caching!
 		if usize::from(length) < ZOPFLI_MIN_MATCH {
 			self.set_ld(pos, 0, 0);
-			return;
+			return Ok(());
 		}
 
 		// Save the length/distance bit.
-		debug_assert_ne!(
-			distance,
-			0,
-			"Distance cannot be zero when length > ZOPFLI_MIN_MATCH!"
-		);
+		if distance == 0 { return Err(zopfli_error!()); }
 		self.set_ld(pos, length, distance);
 
 		// The cache gets written three bytes at a time; this iterator will
@@ -208,7 +201,7 @@ impl MatchCache {
 		// Write all mismatched pairs.
 		for (i, pair) in sublen.windows(2).skip(ZOPFLI_MIN_MATCH).take(usize::from(length) - 3).enumerate() {
 			if pair[0] != pair[1] {
-				let Some(next) = dst.next() else { return; };
+				let Some(next) = dst.next() else { return Ok(()); };
 				next[0] = i as u8;
 				next[1..].copy_from_slice(pair[0].to_le_bytes().as_slice());
 			}
@@ -225,6 +218,8 @@ impl MatchCache {
 				*c1 = (length - 3) as u8;
 			}
 		}
+
+		Ok(())
 	}
 
 	/// # Write Sublength.
diff --git a/src/image/zopflipng/error.rs b/src/image/zopflipng/error.rs
index 3adce8d..5bb4ea4 100644
--- a/src/image/zopflipng/error.rs
+++ b/src/image/zopflipng/error.rs
@@ -5,48 +5,70 @@
 use std::fmt;
 
 #[derive(Debug, Clone, Copy)]
-pub(crate) enum ZopfliError {
-	HistogramRange,
-	LeafSize,
-	LitLen,
-	LitLenLiteral,
-	LMCDistance,
-	MatchRange,
-	NoDistance,
-	NoLength,
-	PathLength,
-	SplitRange,
-	SublenLength,
-	TreeSymbol,
-	Write,
+/// # Zopfli Error.
+///
+/// This struct is used for logical failings (bugs) in the ported zopfli
+/// functionality. This shouldn't ever be instantiated in practice…
+///
+/// When compiled with `debug-assertions = true`, an error will panic with the
+/// offending source file and line number details to aid investigation.
+///
+/// Otherwise it simply serves as a flag for lodepng, letting it know that
+/// something went wrong so it can abandon its compressive efforts for the
+/// given image.
+///
+/// The macro `zopfli_error!` is used internally to populate the appropriate
+/// details or not.
+pub(crate) struct ZopfliError {
+	#[cfg(debug_assertions)] file: &'static str,
+	#[cfg(debug_assertions)] line: u32,
+}
+
+impl ZopfliError {
+	#[cfg(debug_assertions)]
+	/// # New Error.
+	pub(crate) const fn new(file: &'static str, line: u32) -> Self {
+		Self { file, line }
+	}
 }
 
 impl fmt::Display for ZopfliError {
-	#[inline]
+	#[cfg(debug_assertions)]
+	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+		f.write_fmt(format_args!(
+			"Zopfli BUG!!! Sanity check failed at {}:{}",
+			self.file,
+			self.line,
+		))
+	}
+
+	#[cfg(not(debug_assertions))]
 	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-		f.write_str(self.as_str())
+		f.write_str("zopfli bug")
 	}
 }
 
 impl std::error::Error for ZopfliError {}
 
-impl ZopfliError {
-	/// # As String Slice.
-	pub(crate) const fn as_str(self) -> &'static str {
-		match self {
-			Self::HistogramRange => "invalid histogram range",
-			Self::LeafSize => "insufficient maxbits for leaves",
-			Self::LitLen => "invalid litlen",
-			Self::LitLenLiteral => "invalid litlen literal",
-			Self::LMCDistance => "LMC returned an unexpected distance",
-			Self::MatchRange => "invalid match range",
-			Self::NoDistance => "expected non-zero distance",
-			Self::NoLength => "expected non-zero length",
-			Self::PathLength => "invalid path length",
-			Self::SplitRange => "invalid split range",
-			Self::SublenLength => "incorrectly sized sublength array",
-			Self::TreeSymbol => "invalid tree symbol",
-			Self::Write => "failed to write output",
-		}
-	}
+
+
+#[cfg(debug_assertions)]
+/// # Error Macro.
+///
+/// Initialize a new error with the appropriate environmental argument(s)
+/// according to `debug-assertions`.
+macro_rules! zopfli_error {
+	() => (ZopfliError::new(file!(), line!()));
 }
+
+#[cfg(not(debug_assertions))]
+/// # Error Macro.
+///
+/// Initialize a new error with the appropriate environmental argument(s)
+/// according to `debug-assertions`.
+macro_rules! zopfli_error {
+	() => (ZopfliError {});
+}
+
+/// # Expose it to the rest of the module.
+pub(super) use zopfli_error;
diff --git a/src/image/zopflipng/hash.rs b/src/image/zopflipng/hash.rs
index c36a291..5484573 100644
--- a/src/image/zopflipng/hash.rs
+++ b/src/image/zopflipng/hash.rs
@@ -26,6 +26,7 @@ use super::{
 	MatchCache,
 	stats::SymbolStats,
 	SUBLEN_LEN,
+	zopfli_error,
 	ZOPFLI_MAX_MATCH,
 	ZOPFLI_MIN_MATCH,
 	ZopfliError,
@@ -674,7 +675,7 @@ impl ZopfliHash {
 				// This logic is so screwy; I hesitate to make this a debug
 				// assertion!
 				if test_length != length && length > 2 && test_length > 2 {
-					return Err(ZopfliError::PathLength);
+					return Err(zopfli_error!());
 				}
 
 				// Add it to the store.
@@ -729,11 +730,12 @@ impl ZopfliHash {
 				length,
 			)? {
 				if pos + usize::from(*length) <= arr.len() { return Ok(()); }
-				return Err(ZopfliError::MatchRange);
+				return Err(zopfli_error!());
 			}
 		}
 
-		// These are both hard-coded or asserted by the caller.
+		// This is explicitly checked by the caller and again by find_loop()
+		// below, so we can leave this as a redundant debug assertion.
 		debug_assert!((ZOPFLI_MIN_MATCH..=ZOPFLI_MAX_MATCH).contains(&limit));
 
 		// We'll need at least ZOPFLI_MIN_MATCH bytes for a search; if we don't
@@ -754,7 +756,7 @@ impl ZopfliHash {
 		// Cache the results for next time, maybe.
 		if let Some(blockstart) = cache {
 			if limit == ZOPFLI_MAX_MATCH && ! sublen.is_empty() {
-				lmc.set_sublen(pos - blockstart, sublen, bestdist, bestlength);
+				lmc.set_sublen(pos - blockstart, sublen, bestdist, bestlength)?;
 			}
 		}
 
@@ -762,9 +764,7 @@ impl ZopfliHash {
 		*distance = bestdist;
 		*length = bestlength;
 		if pos + usize::from(*length) <= arr.len() { Ok(()) }
-		else {
-			Err(ZopfliError::MatchRange)
-		}
+		else { Err(zopfli_error!()) }
 	}
 
 	#[allow(
@@ -786,12 +786,12 @@ impl ZopfliHash {
 		sublen: &mut [u16],
 	) -> Result<(u16, u16), ZopfliError> {
 		// This is asserted by find() too, but it's a good reminder.
-		debug_assert!(limit <= ZOPFLI_MAX_MATCH);
+		if ZOPFLI_MAX_MATCH < limit { return Err(zopfli_error!()); }
 
 		// Help the compiler understand sublen has a fixed size if provided.
 		// (We can't do an Option<Array> because it's too big for Copy.)
 		if ! sublen.is_empty() && sublen.len() != ZOPFLI_MAX_MATCH + 1 {
-			return Err(ZopfliError::SublenLength);
+			return Err(zopfli_error!());
 		}
 
 		let hpos = pos & ZOPFLI_WINDOW_MASK;
diff --git a/src/image/zopflipng/kat.rs b/src/image/zopflipng/kat.rs
index 787b0f0..4590aa6 100644
--- a/src/image/zopflipng/kat.rs
+++ b/src/image/zopflipng/kat.rs
@@ -14,7 +14,10 @@ use std::{
 	cmp::Ordering,
 	mem::MaybeUninit,
 };
-use super::ZopfliError;
+use super::{
+	zopfli_error,
+	ZopfliError,
+};
 
 
 
@@ -62,7 +65,7 @@ pub(crate) fn zopfli_length_limited_code_lengths<const MAXBITS: usize, const SIZ
 	// leaves, or 7 maxbits and 19 potential leaves; in either case, the max
 	// leaves are well within range.
 	if (1 << MAXBITS) < len_leaves {
-		return Err(ZopfliError::LeafSize);
+		return Err(zopfli_error!());
 	}
 
 	// Set up the pool!
diff --git a/src/image/zopflipng/lz77.rs b/src/image/zopflipng/lz77.rs
index 04de9f3..bd2919c 100644
--- a/src/image/zopflipng/lz77.rs
+++ b/src/image/zopflipng/lz77.rs
@@ -10,6 +10,7 @@ use super::{
 	LENGTH_SYMBOLS_BITS_VALUES,
 	LitLen,
 	Lsym,
+	zopfli_error,
 	ZOPFLI_NUM_D,
 	ZOPFLI_NUM_LL,
 	ZopfliError,
@@ -161,10 +162,10 @@ impl LZ77Store {
 		// Start by copying the counts directly from the nearest chunk.
 		let mut ll_counts: [usize; ZOPFLI_NUM_LL] = self.ll_counts.get(ll_start..ll_end)
 			.and_then(|c| c.try_into().ok())
-			.ok_or(ZopfliError::HistogramRange)?;
+			.ok_or(zopfli_error!())?;
 		let mut d_counts: [usize; ZOPFLI_NUM_D] = self.d_counts.get(d_start..d_end)
 			.and_then(|c| c.try_into().ok())
-			.ok_or(ZopfliError::HistogramRange)?;
+			.ok_or(zopfli_error!())?;
 
 		// Subtract the symbol occurences between (pos+1) and the end of the
 		// chunks.
@@ -238,29 +239,29 @@ impl LZ77StoreEntry {
 	)]
 	/// # New.
 	const fn new(litlen: u16, dist: u16, pos: usize) -> Result<Self, ZopfliError> {
-		if litlen >= 259 { return Err(ZopfliError::LitLen); }
-		debug_assert!(dist < 32_768);
-
-		// Using the signed type helps the compiler understand the upper
-		// range fits ZOPFLI_WINDOW_MAX.
-		let dist = dist as i16;
-		let (ll_symbol, d_symbol) =
-			if dist <= 0 {
-				// Safety: the maximum Lsym is 285.
-				(unsafe { std::mem::transmute(litlen) }, Dsym::D00)
-			}
-			else {(
-				LENGTH_SYMBOLS_BITS_VALUES[litlen as usize].0,
-				DISTANCE_SYMBOLS[dist as usize],
-			)};
-
-		Ok(Self {
-			pos,
-			litlen: unsafe { std::mem::transmute(litlen) },
-			dist,
-			ll_symbol,
-			d_symbol,
-		})
+		if litlen < 259 && dist < 32_768 {
+			// Using the signed type helps the compiler understand the upper
+			// range fits ZOPFLI_WINDOW_MAX.
+			let dist = dist as i16;
+			let (ll_symbol, d_symbol) =
+				if dist <= 0 {
+					// Safety: the maximum Lsym is 285.
+					(unsafe { std::mem::transmute(litlen) }, Dsym::D00)
+				}
+				else {(
+					LENGTH_SYMBOLS_BITS_VALUES[litlen as usize].0,
+					DISTANCE_SYMBOLS[dist as usize],
+				)};
+
+			Ok(Self {
+				pos,
+				litlen: unsafe { std::mem::transmute(litlen) },
+				dist,
+				ll_symbol,
+				d_symbol,
+			})
+		}
+		else { Err(zopfli_error!()) }
 	}
 
 	/// # Length.
diff --git a/src/image/zopflipng/mod.rs b/src/image/zopflipng/mod.rs
index 42f33fd..198f151 100644
--- a/src/image/zopflipng/mod.rs
+++ b/src/image/zopflipng/mod.rs
@@ -28,7 +28,10 @@ pub(crate) use blocks::{
 	SplitPoints,
 };
 use cache::MatchCache;
-use error::ZopfliError;
+use error::{
+	zopfli_error,
+	ZopfliError,
+};
 pub(crate) use hash::ZopfliState;
 use lz77::LZ77Store;
 use kat::zopfli_length_limited_code_lengths;

From 87871fe9c60d8221b64325df171e36114f5cfbde Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Thu, 18 Apr 2024 20:31:56 -0700
Subject: [PATCH 13/31] cleanup: refactor away some unsafe blocks

---
 build.rs                       | 14 ++++++++++--
 src/image/zopflipng/blocks.rs  | 40 ++++++++++++++++------------------
 src/image/zopflipng/mod.rs     |  1 +
 src/image/zopflipng/symbols.rs | 11 ++++++++++
 4 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/build.rs b/build.rs
index a103e7b..1ceb055 100644
--- a/build.rs
+++ b/build.rs
@@ -102,11 +102,21 @@ fn build_ffi() {
 fn build_symbols() {
 	use std::fmt::Write;
 
-	let mut out = r"#[allow(dead_code)]
+	let mut out = r"#[repr(usize)]
+#[derive(Clone, Copy)]
+/// # Whackadoodle Deflate Indices.
+pub(crate) enum DeflateSym {".to_owned();
+	for i in 0..19 {
+		write!(&mut out, "\n\tD{i:02} = {i}_usize,").unwrap();
+	}
+	out.push_str(r"
+}
+
+#[allow(dead_code)]
 #[repr(u16)]
 #[derive(Clone, Copy)]
 /// # Distance Symbols.
-pub(crate) enum Dsym {".to_owned();
+pub(crate) enum Dsym {");
 	for i in 0..32 {
 		write!(&mut out, "\n\tD{i:02} = {i}_u16,").unwrap();
 	}
diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index 246e6f6..f7d7ad1 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -8,6 +8,7 @@ and ends that didn't make it into other modules.
 use dactyl::NoHash;
 use std::collections::HashSet;
 use super::{
+	DEFLATE_ORDER,
 	DISTANCE_BITS,
 	DISTANCE_VALUES,
 	FIXED_TREE_D,
@@ -765,7 +766,6 @@ fn calculate_tree_size(
 }
 
 #[allow(
-	unsafe_code,
 	clippy::cast_possible_truncation,
 	clippy::cognitive_complexity, // Yeah, this is terrible!
 	clippy::similar_names,
@@ -784,9 +784,6 @@ fn encode_tree(
 	extra: u8,
 	out: Option<&mut ZopfliOut>,
 ) -> Result<usize, ZopfliError> {
-	// Discombobulated cl_length/cl_count indexes, because DEFLATE is hateful.
-	const ORDER: [usize; 19] = [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15];
-
 	// To use or not to use the extended part of the alphabet.
 	let use_16: bool = 0 != extra & 1;
 	let use_17: bool = 0 != extra & 2;
@@ -808,18 +805,19 @@ fn encode_tree(
 	let mut hdist = 29;
 	while hdist > 0 && d_lengths[hdist] == 0 { hdist -= 1; }
 
-	// We process length and (then) distance symbols in the same loop. This
-	// returns the symbol for one or the other depending on how far we've
-	// gotten.
-	let length_or_distance = |idx: usize| {
-		// The compiler is smart enough to know hlit2 is valid for ll_lengths.
-		if idx < hlit2 { ll_lengths[idx] }
-		else {
-			// Safety: the index will always be between 0..L+D; if we're
-			// past the Ls, we're onto the Ds.
-			unsafe { *d_lengths.get_unchecked(idx - hlit2) }
-		}
-	};
+	// The upcoming loop processes lengths and distances together (one after
+	// the other). This macro returns the symbol from the appropriate table
+	// based on the index.
+	macro_rules! length_or_distance {
+		($idx:ident) => (
+			if $idx < hlit2 { ll_lengths[$idx] }
+			else {
+				// This mask isn't necessary but the compiler doesn't
+				// understand what we're trying to do.
+				d_lengths[($idx - hlit2).min(29)]
+			}
+		);
+	}
 
 	// Run through all the length symbols, then the distance symbols, with the
 	// odd skip to keep us on our toes.
@@ -827,13 +825,13 @@ fn encode_tree(
 	let mut i = 0;
 	while i < lld_total {
 		let mut count = 1;
-		let symbol = length_or_distance(i);
+		let symbol = length_or_distance!(i);
 		if symbol >= 19 { return Err(zopfli_error!()); }
 
 		// Peek ahead; we may be able to do more in one go.
 		if use_16 || (symbol == 0 && (use_17 || use_18)) {
 			let mut j = i + 1;
-			while j < lld_total && symbol == length_or_distance(j) {
+			while j < lld_total && symbol == length_or_distance!(j) {
 				count += 1;
 				j += 1;
 			}
@@ -892,7 +890,7 @@ fn encode_tree(
 	// Find the last non-zero index of the counts table.
 	// Safety: all ORDER values are between 0..19.
 	let mut hclen = 15;
-	while hclen > 0 && unsafe { *cl_counts.get_unchecked(ORDER[hclen + 3]) } == 0 {
+	while hclen > 0 && cl_counts[DEFLATE_ORDER[hclen + 3] as usize] == 0 {
 		hclen -= 1;
 	}
 
@@ -908,9 +906,9 @@ fn encode_tree(
 		out.add_bits(hclen as u32, 4);
 
 		// Write each cl_length in the jumbled DEFLATE order.
-		for &o in &ORDER[..hclen + 4] {
+		for &o in &DEFLATE_ORDER[..hclen + 4] {
 			// Safety: all ORDER values are between 0..19.
-			out.add_bits(unsafe { *cl_lengths.get_unchecked(o) }, 3);
+			out.add_bits(cl_lengths[o as usize], 3);
 		}
 
 		// Write each symbol in order of appearance along with its extra bits,
diff --git a/src/image/zopflipng/mod.rs b/src/image/zopflipng/mod.rs
index 198f151..2194947 100644
--- a/src/image/zopflipng/mod.rs
+++ b/src/image/zopflipng/mod.rs
@@ -46,6 +46,7 @@ use super::{
 	},
 };
 use symbols::{
+	DEFLATE_ORDER,
 	DISTANCE_BITS,
 	DISTANCE_SYMBOLS,
 	DISTANCE_VALUES,
diff --git a/src/image/zopflipng/symbols.rs b/src/image/zopflipng/symbols.rs
index 37d833b..414cfbb 100644
--- a/src/image/zopflipng/symbols.rs
+++ b/src/image/zopflipng/symbols.rs
@@ -19,6 +19,17 @@ pub(crate) const DISTANCE_BITS: [u8; 32] = [
 	7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 0, 0,
 ];
 
+/// # Whackadoodle DEFLATE Ordering.
+///
+/// Tree encoding writes data out-of-order for some reason. This is that order.
+pub(crate) const DEFLATE_ORDER: [DeflateSym; 19] = [
+	DeflateSym::D16, DeflateSym::D17, DeflateSym::D18, DeflateSym::D00,
+	DeflateSym::D08, DeflateSym::D07, DeflateSym::D09, DeflateSym::D06,
+	DeflateSym::D10, DeflateSym::D05, DeflateSym::D11, DeflateSym::D04,
+	DeflateSym::D12, DeflateSym::D03, DeflateSym::D13, DeflateSym::D02,
+	DeflateSym::D14, DeflateSym::D01, DeflateSym::D15,
+];
+
 /// # Length Symbols, Extra Bits, and Bit Values.
 ///
 /// This contains all symbols, bits, and bit values indexed by their

From 32b1aa911a07985dffd5384e8a49810569551b0d Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Thu, 18 Apr 2024 21:31:29 -0700
Subject: [PATCH 14/31] misc: replace unreachable! w/ error

---
 src/image/zopflipng/blocks.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index f7d7ad1..b801d1a 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -101,11 +101,12 @@ impl SplitPoints {
 				pos += e.length() as usize;
 			}
 
-			unreachable!();
+			Err(zopfli_error!())
 		}
 		else { Ok(len) }
 	}
 
+	#[inline(never)]
 	/// # LZ77 Split Pass.
 	///
 	/// This sets the LZ77 split points according to convoluted cost
@@ -230,6 +231,7 @@ impl SplitPoints {
 
 
 
+#[inline]
 /// # Deflate a Part.
 ///
 /// Image compression is done in chunks of a million bytes. This does all the

From 33d5c322e05c80518ee9262a3a197f83a81e2f87 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Thu, 18 Apr 2024 21:31:55 -0700
Subject: [PATCH 15/31] perf: elide bound check w/ condition

---
 src/image/zopflipng/blocks.rs | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index b801d1a..79f481d 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -579,6 +579,7 @@ fn add_lz77_data(
 	else { Err(zopfli_error!()) }
 }
 
+#[inline(never)]
 /// # Calculate Block Size (in Bits).
 fn calculate_block_size(
 	store: &LZ77Store,
@@ -617,6 +618,7 @@ fn calculate_block_size(
 	}
 }
 
+#[inline]
 /// # Calculate Best Block Size (in Bits).
 fn calculate_block_size_auto_type(
 	store: &LZ77Store,
@@ -642,6 +644,7 @@ fn calculate_block_size_auto_type(
 	else { Ok(dynamic_cost) }
 }
 
+#[inline]
 /// # Calculate Block Symbol Size w/ Histogram.
 fn calculate_block_symbol_size(
 	ll_lengths: &[u32; ZOPFLI_NUM_LL],
@@ -673,6 +676,7 @@ fn calculate_block_symbol_size(
 	}
 }
 
+#[inline]
 /// # Calculate Block Symbol Size w/ Histogram and Counts.
 fn calculate_block_symbol_size_given_counts(
 	ll_counts: &[usize; ZOPFLI_NUM_LL],
@@ -715,6 +719,7 @@ fn calculate_block_symbol_size_given_counts(
 	result
 }
 
+#[inline]
 /// # Calculate Small Block Symbol Size.
 fn calculate_block_symbol_size_small(
 	ll_lengths: &[u32; ZOPFLI_NUM_LL],
@@ -727,9 +732,10 @@ fn calculate_block_symbol_size_small(
 	let mut result = ll_lengths[256] as usize;
 
 	// Loop the store if we have data to loop.
-	if lstart < lend {
+	let slice = store.entries.as_slice();
+	if lstart < lend && lend <= slice.len() {
 		// Make sure the end does not exceed the store!
-		for e in &store.entries[lstart..lend] {
+		for e in &slice[lstart..lend] {
 			if e.dist <= 0 {
 				result += ll_lengths[e.litlen as usize] as usize;
 			}

From 1247e55e84ee8c4eaeeeedf344d6ca2700838c09 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 20 Apr 2024 10:49:30 -0700
Subject: [PATCH 16/31] cleanup: remove unnecessary comparisons

---
 src/image/zopflipng/blocks.rs | 4 ++--
 src/image/zopflipng/hash.rs   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index 79f481d..cdcdbe5 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -758,13 +758,13 @@ fn calculate_tree_size(
 	ll_lengths: &[u32; ZOPFLI_NUM_LL],
 	d_lengths: &[u32; ZOPFLI_NUM_D],
 ) -> Result<(u8, usize), ZopfliError> {
-	let mut best_size = 0;
+	let mut best_size = usize::MAX;
 	let mut best_idx = 0;
 
 	for i in 0..8 {
 		let size = encode_tree(ll_lengths, d_lengths, i, None)?;
 
-		if best_size == 0 || size < best_size {
+		if size < best_size {
 			best_size = size;
 			best_idx = i;
 		}
diff --git a/src/image/zopflipng/hash.rs b/src/image/zopflipng/hash.rs
index 5484573..0e48818 100644
--- a/src/image/zopflipng/hash.rs
+++ b/src/image/zopflipng/hash.rs
@@ -674,7 +674,7 @@ impl ZopfliHash {
 
 				// This logic is so screwy; I hesitate to make this a debug
 				// assertion!
-				if test_length != length && length > 2 && test_length > 2 {
+				if test_length != length && test_length > 2 {
 					return Err(zopfli_error!());
 				}
 

From d8faee72b8a6ef1ebd68e94db4407db437752fff Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 20 Apr 2024 10:50:51 -0700
Subject: [PATCH 17/31] cleanup: inline/simplify calculate_block_symbol_size

---
 src/image/zopflipng/blocks.rs | 45 ++++++-----------------------------
 1 file changed, 7 insertions(+), 38 deletions(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index cdcdbe5..9dcad9c 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -596,14 +596,18 @@ fn calculate_block_size(
 			let blocks = blocksize.div_ceil(65_535);
 			Ok(blocks * 40 + blocksize * 8)
 		},
-		BlockType::Fixed =>
-			Ok(calculate_block_symbol_size(
+		BlockType::Fixed => {
+			let (ll_counts, d_counts) = store.histogram(lstart, lend)?;
+			Ok(calculate_block_symbol_size_given_counts(
+				&ll_counts,
+				&d_counts,
 				&FIXED_TREE_LL,
 				&FIXED_TREE_D,
 				store,
 				lstart,
 				lend,
-			)? + 3),
+			) + 3)
+		},
 		BlockType::Dynamic => {
 			let mut ll_lengths = [0_u32; ZOPFLI_NUM_LL];
 			let mut d_lengths = [0_u32; ZOPFLI_NUM_D];
@@ -644,38 +648,6 @@ fn calculate_block_size_auto_type(
 	else { Ok(dynamic_cost) }
 }
 
-#[inline]
-/// # Calculate Block Symbol Size w/ Histogram.
-fn calculate_block_symbol_size(
-	ll_lengths: &[u32; ZOPFLI_NUM_LL],
-	d_lengths: &[u32; ZOPFLI_NUM_D],
-	store: &LZ77Store,
-	lstart: usize,
-	lend: usize,
-) -> Result<usize, ZopfliError> {
-	if lstart + ZOPFLI_NUM_LL * 3 > lend {
-		Ok(calculate_block_symbol_size_small(
-			ll_lengths,
-			d_lengths,
-			store,
-			lstart,
-			lend,
-		))
-	}
-	else {
-		let (ll_counts, d_counts) = store.histogram(lstart, lend)?;
-		Ok(calculate_block_symbol_size_given_counts(
-			&ll_counts,
-			&d_counts,
-			ll_lengths,
-			d_lengths,
-			store,
-			lstart,
-			lend,
-		))
-	}
-}
-
 #[inline]
 /// # Calculate Block Symbol Size w/ Histogram and Counts.
 fn calculate_block_symbol_size_given_counts(
@@ -896,7 +868,6 @@ fn encode_tree(
 	zopfli_length_limited_code_lengths::<7, 19>(&cl_counts, &mut cl_lengths)?;
 
 	// Find the last non-zero index of the counts table.
-	// Safety: all ORDER values are between 0..19.
 	let mut hclen = 15;
 	while hclen > 0 && cl_counts[DEFLATE_ORDER[hclen + 3] as usize] == 0 {
 		hclen -= 1;
@@ -915,7 +886,6 @@ fn encode_tree(
 
 		// Write each cl_length in the jumbled DEFLATE order.
 		for &o in &DEFLATE_ORDER[..hclen + 4] {
-			// Safety: all ORDER values are between 0..19.
 			out.add_bits(cl_lengths[o as usize], 3);
 		}
 
@@ -1304,7 +1274,6 @@ fn try_lz77_expensive_fixed(
 	expected_data_size: usize,
 	out: &mut ZopfliOut,
 ) -> Result<bool, ZopfliError> {
-	// Safety: the split points are checked at creation.
 	let (instart, inend) = get_lz77_byte_range(store, lstart, lend)?;
 
 	// Run all the expensive fixed-cost checks.

From c7c45f460c67003fa8a28df289d866c0ee145b67 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 20 Apr 2024 10:58:09 -0700
Subject: [PATCH 18/31] cleanup: shrink variable

---
 src/image/zopflipng/blocks.rs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index 9dcad9c..b5c42cf 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -1234,18 +1234,18 @@ fn optimize_huffman_for_rle(mut counts: &mut [usize]) {
 /// Ensure there are at least two distance codes to avoid issues with buggy
 /// decoders.
 fn patch_distance_codes(d_lengths: &mut [u32; ZOPFLI_NUM_D]) {
-	let mut one: Option<usize> = None;
+	let mut one: Option<bool> = None;
 	for (i, dist) in d_lengths.iter().copied().enumerate().take(30) {
 		// We have (at least) two non-zero entries; no patching needed!
-		if 0 != dist && one.replace(i).is_some() { return; }
+		if 0 != dist && one.replace(i == 0).is_some() { return; }
 	}
 
 	match one {
-		// The first entry had a code, so patch the second to give us two.
-		Some(0) => { d_lengths[1] = 1; },
-		// Patch the first entry to give us two.
-		Some(_) => { d_lengths[0] = 1; },
-		// There were no codes, so let's just patch the first two.
+		// The first entry had a code, so patching the second gives us two.
+		Some(true) => { d_lengths[1] = 1; },
+		// The first entry didn't have a code, so patching it gives us two.
+		Some(false) => { d_lengths[0] = 1; },
+		// There were no codes, so we can just patch the first two.
 		None => {
 			d_lengths[0] = 1;
 			d_lengths[1] = 1;

From 8ff78ee8bff52f0afb542e74fa24849675918746 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 20 Apr 2024 18:39:22 -0700
Subject: [PATCH 19/31] misc: pass sublen as array instead of slice misc:
 return error instead of false for bad trace lengths

---
 src/image/zopflipng/cache.rs | 28 +++++++-------
 src/image/zopflipng/hash.rs  | 75 ++++++++++++++++++------------------
 2 files changed, 52 insertions(+), 51 deletions(-)

diff --git a/src/image/zopflipng/cache.rs b/src/image/zopflipng/cache.rs
index 407e8c3..3ba489c 100644
--- a/src/image/zopflipng/cache.rs
+++ b/src/image/zopflipng/cache.rs
@@ -6,6 +6,7 @@ the thread-local LMC static.
 */
 
 use super::{
+	SUBLEN_LEN,
 	zopfli_error,
 	ZOPFLI_MAX_MATCH,
 	ZOPFLI_MIN_MATCH,
@@ -91,7 +92,7 @@ impl MatchCache {
 		&self,
 		pos: usize,
 		limit: &mut usize,
-		sublen: &mut [u16],
+		sublen: &mut Option<&mut [u16; SUBLEN_LEN]>,
 		distance: &mut u16,
 		length: &mut u16,
 	) -> Result<bool, ZopfliError> {
@@ -101,33 +102,30 @@ impl MatchCache {
 
 		// Find the max sublength once, if ever.
 		let maxlength =
-			if sublen.is_empty() { 0 }
+			if sublen.is_none() { 0 }
 			else { max_sublen(&self.sublen[SUBLEN_CACHED_LEN * pos..]) };
 
 		// Proceed if our cached length or max sublength are under the limit.
 		if
 			*limit == ZOPFLI_MAX_MATCH ||
 			usize::from(cache_len) <= *limit ||
-			(! sublen.is_empty() && maxlength >= *limit)
+			(sublen.is_some() && maxlength >= *limit)
 		{
 			// Update length and distance if the sublength pointer is null or
 			// the cached sublength is bigger than the cached length.
-			if sublen.is_empty() || usize::from(cache_len) <= maxlength {
+			if sublen.is_none() || usize::from(cache_len) <= maxlength {
 				// Cap the length.
 				*length = cache_len;
 				if usize::from(*length) > *limit {
 					*length = *limit as u16;
 				}
 
-				// Use the cached distance directly.
-				if sublen.is_empty() {
-					*distance = cache_dist;
-				}
-				else {
+				// Set the distance from the sublength cache.
+				if let Some(s) = sublen {
 					// Pull the sublength from cache and pull the distance from
 					// that.
-					self.write_sublen(pos, usize::from(*length), sublen);
-					*distance = sublen[usize::from(*length)];
+					self.write_sublen(pos, usize::from(*length), s);
+					*distance = s[usize::from(*length)];
 
 					// Sanity check: make sure the sublength distance at length
 					// matches the redundantly-cached distance.
@@ -139,6 +137,10 @@ impl MatchCache {
 						return Err(zopfli_error!());
 					}
 				}
+				// Use the cached distance directly.
+				else {
+					*distance = cache_dist;
+				}
 
 				// We did stuff!
 				return Ok(true);
@@ -172,7 +174,7 @@ impl MatchCache {
 	pub(crate) fn set_sublen(
 		&mut self,
 		pos: usize,
-		sublen: &[u16],
+		sublen: &[u16; SUBLEN_LEN],
 		distance: u16,
 		length: u16,
 	) -> Result<(), ZopfliError> {
@@ -225,7 +227,7 @@ impl MatchCache {
 	/// # Write Sublength.
 	///
 	/// Fill the provided sublength slice with data from the cache.
-	fn write_sublen(&self, pos: usize, length: usize, sublen: &mut [u16]) {
+	fn write_sublen(&self, pos: usize, length: usize, sublen: &mut [u16; SUBLEN_LEN]) {
 		// Short circuit.
 		if length < ZOPFLI_MIN_MATCH { return; }
 
diff --git a/src/image/zopflipng/hash.rs b/src/image/zopflipng/hash.rs
index 0e48818..5359e42 100644
--- a/src/image/zopflipng/hash.rs
+++ b/src/image/zopflipng/hash.rs
@@ -117,7 +117,7 @@ impl ZopfliState {
 				arr,
 				i,
 				ZOPFLI_MAX_MATCH,
-				&mut sublen,
+				&mut Some(&mut sublen),
 				&mut distance,
 				&mut length,
 				&mut self.lmc,
@@ -227,7 +227,7 @@ impl ZopfliState {
 
 			// Forward and backward squeeze passes.
 			self.hash.get_best_lengths(arr, instart, stats, costs, &mut self.lmc)?;
-			if self.trace_paths() {
+			if self.trace_paths()? {
 				self.hash.follow_paths(
 					arr,
 					instart,
@@ -249,28 +249,31 @@ impl ZopfliState {
 	///
 	/// Calculate the optimal path of lz77 lengths to use, from the
 	/// lengths gathered during the `ZopfliHash::get_best_lengths` pass.
-	fn trace_paths(&mut self) -> bool {
+	fn trace_paths(&mut self) -> Result<bool, ZopfliError> {
 		let costs = self.costs.as_slice();
-		if costs.len() < 2 { return false; }
-
-		self.paths.truncate(0);
-		let mut idx = costs.len() - 1;
-		while 0 < idx && idx < costs.len() {
-			let v = costs[idx].1;
-			debug_assert!((1..=ZOPFLI_MAX_MATCH as u16).contains(&v));
-			if ! (1..=ZOPFLI_MAX_MATCH as u16).contains(&v) { return false; }
-
-			// Only lengths of at least ZOPFLI_MIN_MATCH count as lengths
-			// after tracing.
-			self.paths.push(
-				if v < ZOPFLI_MIN_MATCH as u16 { 1 } else { v }
-			);
+		if costs.len() < 2 { Ok(false) }
+		else {
+			self.paths.truncate(0);
+			let mut idx = costs.len() - 1;
+			while 0 < idx && idx < costs.len() {
+				let v = usize::from(costs[idx].1);
+				if 1 <= v && v <= idx && v <= ZOPFLI_MAX_MATCH {
+					// Only lengths of at least ZOPFLI_MIN_MATCH count as lengths
+					// after tracing.
+					self.paths.push(
+						if v < ZOPFLI_MIN_MATCH { 1 } else { v as u16 }
+					);
+
+					// Move onto the next length or finish.
+					idx -= v;
+				}
+				else {
+					return Err(zopfli_error!());
+				}
+			}
 
-			// Move onto the next length or finish.
-			idx = idx.saturating_sub(usize::from(v));
+			Ok(true)
 		}
-
-		true
 	}
 }
 
@@ -470,7 +473,7 @@ impl ZopfliHash {
 				arr,
 				i,
 				ZOPFLI_MAX_MATCH,
-				&mut sublen,
+				&mut Some(&mut sublen),
 				&mut distance,
 				&mut length,
 				lmc,
@@ -665,7 +668,7 @@ impl ZopfliHash {
 					arr,
 					i,
 					usize::from(length),
-					&mut [],
+					&mut None,
 					&mut dist,
 					&mut test_length,
 					lmc,
@@ -714,7 +717,7 @@ impl ZopfliHash {
 		arr: &[u8],
 		pos: usize,
 		mut limit: usize,
-		sublen: &mut [u16],
+		sublen: &mut Option<&mut [u16; SUBLEN_LEN]>,
 		distance: &mut u16,
 		length: &mut u16,
 		lmc: &mut MatchCache,
@@ -735,7 +738,7 @@ impl ZopfliHash {
 		}
 
 		// This is explicitly checked by the caller and again by find_loop()
-		// below, so we can leave this as a redundant debug assertion.
+		// below, so we can leave it as a redundant debug assertion.
 		debug_assert!((ZOPFLI_MIN_MATCH..=ZOPFLI_MAX_MATCH).contains(&limit));
 
 		// We'll need at least ZOPFLI_MIN_MATCH bytes for a search; if we don't
@@ -754,9 +757,11 @@ impl ZopfliHash {
 		let (bestdist, bestlength) = self.find_loop(arr, pos, limit, sublen)?;
 
 		// Cache the results for next time, maybe.
-		if let Some(blockstart) = cache {
-			if limit == ZOPFLI_MAX_MATCH && ! sublen.is_empty() {
-				lmc.set_sublen(pos - blockstart, sublen, bestdist, bestlength)?;
+		if limit == ZOPFLI_MAX_MATCH {
+			if let Some(blockstart) = cache {
+				if let Some(s) = sublen {
+					lmc.set_sublen(pos - blockstart, s, bestdist, bestlength)?;
+				}
 			}
 		}
 
@@ -783,17 +788,11 @@ impl ZopfliHash {
 		arr: &[u8],
 		pos: usize,
 		limit: usize,
-		sublen: &mut [u16],
+		sublen: &mut Option<&mut [u16; SUBLEN_LEN]>,
 	) -> Result<(u16, u16), ZopfliError> {
 		// This is asserted by find() too, but it's a good reminder.
 		if ZOPFLI_MAX_MATCH < limit { return Err(zopfli_error!()); }
 
-		// Help the compiler understand sublen has a fixed size if provided.
-		// (We can't do an Option<Array> because it's too big for Copy.)
-		if ! sublen.is_empty() && sublen.len() != ZOPFLI_MAX_MATCH + 1 {
-			return Err(zopfli_error!());
-		}
-
 		let hpos = pos & ZOPFLI_WINDOW_MASK;
 
 		// The default distance and length. We'll be wanting 16-bit values for
@@ -877,13 +876,13 @@ impl ZopfliHash {
 				}
 
 				// We've found a better length!
-				if bestlength < currentlength {
+				if bestlength < currentlength && currentlength < SUBLEN_LEN {
 					// Update the sublength slice, if provided. Note that
 					// sublengths are (ZOPFLI_MAX_MATCH+1) if provided, and
 					// ZOPFLI_MAX_MATCH is the largest possible value of
 					// currentlength.
-					if currentlength < sublen.len() {
-						sublen[bestlength + 1..=currentlength].fill(dist as u16);
+					if let Some(s) = sublen {
+						s[bestlength + 1..=currentlength].fill(dist as u16);
 					}
 
 					bestdist = dist;

From 47107fc88383de955add6c932bd0fada20fcdcaa Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 20 Apr 2024 19:12:21 -0700
Subject: [PATCH 20/31] cleanup: make conditionals smarter for push_entry,
 remove unsafe

---
 src/image/zopflipng/lz77.rs | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/src/image/zopflipng/lz77.rs b/src/image/zopflipng/lz77.rs
index bd2919c..1b2ea65 100644
--- a/src/image/zopflipng/lz77.rs
+++ b/src/image/zopflipng/lz77.rs
@@ -56,46 +56,41 @@ impl LZ77Store {
 		LZ77StoreEntry::new(litlen, dist, pos).map(|e| self.push_entry(e))
 	}
 
-	#[allow(unsafe_code)]
 	/// # Push Entry.
 	fn push_entry(&mut self, entry: LZ77StoreEntry) {
 		let old_len = self.entries.len();
-		let ll_start = ZOPFLI_NUM_LL * old_len.wrapping_div(ZOPFLI_NUM_LL);
-		let d_start = ZOPFLI_NUM_D * old_len.wrapping_div(ZOPFLI_NUM_D);
 
 		// The histograms are wrapping and cumulative, and need to be extended
-		// any time we reach a new ZOPFLI_NUM_* bucket level.
+		// any time we reach a new ZOPFLI_NUM_* bucket level. The first time
+		// around, we just need to zero-pad.
 		if old_len == 0 {
 			self.ll_counts.resize(ZOPFLI_NUM_LL, 0);
 			self.d_counts.resize(ZOPFLI_NUM_D, 0);
 		}
-		else {
+		// New chunks start with the previous chunk's totals.
+		else if old_len % ZOPFLI_NUM_D == 0 {
+			self.d_counts.extend_from_within((old_len - ZOPFLI_NUM_D)..old_len);
+
+			// 288 (LL) is evenly divisible by 32 (D), so would only ever wrap
+			// in cases where D wrapped too.
 			if old_len % ZOPFLI_NUM_LL == 0 {
 				self.ll_counts.extend_from_within((old_len - ZOPFLI_NUM_LL)..old_len);
 			}
-
-			if old_len % ZOPFLI_NUM_D == 0 {
-				self.d_counts.extend_from_within((old_len - ZOPFLI_NUM_D)..old_len);
-			}
 		}
 
 		// If the distance is zero, we just need to bump the litlen count.
 		if entry.dist <= 0 {
-			// Safety: the counts were pre-allocated a few lines back (if
-			// needed).
-			unsafe {
-				*self.ll_counts.get_unchecked_mut(ll_start + entry.litlen as usize) += 1;
-			}
+			let ll_counts = self.ll_counts.as_mut_slice();
+			ll_counts[ll_counts.len() - ZOPFLI_NUM_LL + entry.litlen as usize] += 1;
 		}
 		// If it is non-zero, we need to set the correct symbols and bump both
 		// counts.
 		else {
-			// Safety: the counts were pre-allocated a few lines back (if
-			// needed).
-			unsafe {
-				*self.ll_counts.get_unchecked_mut(ll_start + entry.ll_symbol as usize) += 1;
-				*self.d_counts.get_unchecked_mut(d_start + entry.d_symbol as usize) += 1;
-			}
+			let ll_counts = self.ll_counts.as_mut_slice();
+			ll_counts[ll_counts.len() - ZOPFLI_NUM_LL + entry.ll_symbol as usize] += 1;
+
+			let d_counts = self.d_counts.as_mut_slice();
+			d_counts[d_counts.len() - ZOPFLI_NUM_D + entry.d_symbol as usize] += 1;
 		}
 
 		// Don't forget to push the entry!

From d8a7b490f0739bedbfad68aef6edfb9864a58460 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 20 Apr 2024 20:52:53 -0700
Subject: [PATCH 21/31] cleanup: simplify histogram conditionals

---
 src/image/zopflipng/lz77.rs | 40 +++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/src/image/zopflipng/lz77.rs b/src/image/zopflipng/lz77.rs
index 1b2ea65..cf5bbdf 100644
--- a/src/image/zopflipng/lz77.rs
+++ b/src/image/zopflipng/lz77.rs
@@ -51,11 +51,13 @@ impl LZ77Store {
 		self.d_counts.truncate(0);
 	}
 
+	#[inline(never)]
 	/// # Push Values.
 	pub(crate) fn push(&mut self, litlen: u16, dist: u16, pos: usize) -> Result<(), ZopfliError> {
 		LZ77StoreEntry::new(litlen, dist, pos).map(|e| self.push_entry(e))
 	}
 
+	#[inline]
 	/// # Push Entry.
 	fn push_entry(&mut self, entry: LZ77StoreEntry) {
 		let old_len = self.entries.len();
@@ -79,14 +81,13 @@ impl LZ77Store {
 		}
 
 		// If the distance is zero, we just need to bump the litlen count.
+		let ll_counts = self.ll_counts.as_mut_slice();
 		if entry.dist <= 0 {
-			let ll_counts = self.ll_counts.as_mut_slice();
 			ll_counts[ll_counts.len() - ZOPFLI_NUM_LL + entry.litlen as usize] += 1;
 		}
 		// If it is non-zero, we need to set the correct symbols and bump both
 		// counts.
 		else {
-			let ll_counts = self.ll_counts.as_mut_slice();
 			ll_counts[ll_counts.len() - ZOPFLI_NUM_LL + entry.ll_symbol as usize] += 1;
 
 			let d_counts = self.d_counts.as_mut_slice();
@@ -163,11 +164,9 @@ impl LZ77Store {
 			.ok_or(zopfli_error!())?;
 
 		// Subtract the symbol occurences between (pos+1) and the end of the
-		// chunks.
-		for (i, e) in self.entries.iter().enumerate().take(ll_end.max(d_end)).skip(pos + 1) {
-			if i < ll_end {
-				ll_counts[e.ll_symbol as usize] -= 1;
-			}
+		// available data for the chunk.
+		for (i, e) in self.entries.iter().enumerate().take(ll_end).skip(pos + 1) {
+			ll_counts[e.ll_symbol as usize] -= 1;
 			if i < d_end && 0 < e.dist {
 				d_counts[e.d_symbol as usize] -= 1;
 			}
@@ -194,10 +193,8 @@ impl LZ77Store {
 		// the end_counts. We can avoid intermediate storage by rearranging
 		// the formula so that the start_symbols get _added_ to the end_counts
 		// directly.
-		for (i, e) in self.entries.iter().enumerate().take(ll_end.max(d_end)).skip(pos + 1) {
-			if i < ll_end {
-				ll_counts[e.ll_symbol as usize] += 1;
-			}
+		for (i, e) in self.entries.iter().enumerate().take(ll_end).skip(pos + 1) {
+			ll_counts[e.ll_symbol as usize] += 1;
 			if i < d_end && 0 < e.dist {
 				d_counts[e.d_symbol as usize] += 1;
 			}
@@ -267,3 +264,24 @@ impl LZ77StoreEntry {
 		else { self.litlen }
 	}
 }
+
+
+
+#[cfg(test)]
+mod tests {
+	use super::*;
+
+	#[test]
+	fn t_histogram_sub_take() {
+		// In _histogram_sub(), we assume d_end <= ll_end; let's verify that
+		// pattern seems to hold…
+		for i in 0..=usize::from(u16::MAX) {
+			let ll_start = ZOPFLI_NUM_LL * i.wrapping_div(ZOPFLI_NUM_LL);
+			let d_start = ZOPFLI_NUM_D * i.wrapping_div(ZOPFLI_NUM_D);
+			let ll_end = ll_start + ZOPFLI_NUM_LL;
+			let d_end = d_start + ZOPFLI_NUM_D;
+
+			assert!(d_end <= ll_end, "Failed with {i}!");
+		}
+	}
+}

From da9b9e813344ffdaf2ce5aab3c9d9d6dd351d3b5 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 20 Apr 2024 22:33:59 -0700
Subject: [PATCH 22/31] misc: convert bound panic to Err

---
 src/image/zopflipng/blocks.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index b5c42cf..bf7f6f6 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -535,7 +535,7 @@ fn add_lz77_data(
 	out: &mut ZopfliOut
 ) -> Result<(), ZopfliError> {
 	let mut test_size = 0;
-	for e in &store.entries[lstart..lend] {
+	for e in store.entries.get(lstart..lend).ok_or(zopfli_error!())? {
 		// Length only.
 		if e.dist <= 0 {
 			if (e.litlen as u16) >= 256 {

From 12213847466b192e319f8f93827a8cac3bd26fbf Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sat, 20 Apr 2024 23:58:08 -0700
Subject: [PATCH 23/31] cleanup: reset zopfli inlining hints

---
 src/image/zopflipng/blocks.rs | 18 ------------------
 src/image/zopflipng/cache.rs  |  1 -
 src/image/zopflipng/hash.rs   | 16 +++-------------
 src/image/zopflipng/kat.rs    |  5 -----
 src/image/zopflipng/lz77.rs   |  2 --
 5 files changed, 3 insertions(+), 39 deletions(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index bf7f6f6..eeb62ff 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -71,7 +71,6 @@ impl SplitPoints {
 }
 
 impl SplitPoints {
-	#[inline]
 	/// # Uncompressed Split Pass.
 	///
 	/// This sets the uncompressed split points, by way of first setting the
@@ -106,7 +105,6 @@ impl SplitPoints {
 		else { Ok(len) }
 	}
 
-	#[inline(never)]
 	/// # LZ77 Split Pass.
 	///
 	/// This sets the LZ77 split points according to convoluted cost
@@ -160,7 +158,6 @@ impl SplitPoints {
 	}
 
 	#[allow(unsafe_code)]
-	#[inline]
 	/// # Split Best.
 	///
 	/// Compare the optimal raw split points with a dedicated lz77 pass and
@@ -231,7 +228,6 @@ impl SplitPoints {
 
 
 
-#[inline]
 /// # Deflate a Part.
 ///
 /// Image compression is done in chunks of a million bytes. This does all the
@@ -446,7 +442,6 @@ fn add_lz77_block(
 	clippy::cast_sign_loss,
 	clippy::too_many_arguments,
 )]
-#[inline]
 /// # Add LZ77 Block (Automatic Type).
 ///
 /// This calculates the expected output sizes for all three block types, then
@@ -501,7 +496,6 @@ fn add_lz77_block_auto_type(
 	)
 }
 
-#[inline(never)]
 /// # Add Dynamic Tree.
 ///
 /// Determine the optimal tree index, then add it to the output.
@@ -579,7 +573,6 @@ fn add_lz77_data(
 	else { Err(zopfli_error!()) }
 }
 
-#[inline(never)]
 /// # Calculate Block Size (in Bits).
 fn calculate_block_size(
 	store: &LZ77Store,
@@ -622,7 +615,6 @@ fn calculate_block_size(
 	}
 }
 
-#[inline]
 /// # Calculate Best Block Size (in Bits).
 fn calculate_block_size_auto_type(
 	store: &LZ77Store,
@@ -648,7 +640,6 @@ fn calculate_block_size_auto_type(
 	else { Ok(dynamic_cost) }
 }
 
-#[inline]
 /// # Calculate Block Symbol Size w/ Histogram and Counts.
 fn calculate_block_symbol_size_given_counts(
 	ll_counts: &[usize; ZOPFLI_NUM_LL],
@@ -691,7 +682,6 @@ fn calculate_block_symbol_size_given_counts(
 	result
 }
 
-#[inline]
 /// # Calculate Small Block Symbol Size.
 fn calculate_block_symbol_size_small(
 	ll_lengths: &[u32; ZOPFLI_NUM_LL],
@@ -750,7 +740,6 @@ fn calculate_tree_size(
 	clippy::cognitive_complexity, // Yeah, this is terrible!
 	clippy::similar_names,
 )]
-#[inline(never)]
 /// # Encode Huffman Tree.
 ///
 /// Build a tree and optionally write it to the output file, returning the
@@ -1012,8 +1001,6 @@ fn find_minimum_cost(
 /// (This is not necessarily the optimal Huffman lengths.)
 ///
 /// The total size in bits (minus the 3-bit header) is returned.
-///
-/// This is a rewrite of the original `deflate.c` method.
 fn get_dynamic_lengths(
 	store: &LZ77Store,
 	lstart: usize,
@@ -1063,8 +1050,6 @@ fn get_lz77_byte_range(
 ///
 /// Note: this incorporates the functionality of `ZopfliLZ77OptimalRun`
 /// directly.
-///
-/// This is a rewrite of the original `squeeze.c` method.
 fn lz77_optimal(
 	arr: &[u8],
 	instart: usize,
@@ -1168,8 +1153,6 @@ fn split_cost(store: &LZ77Store, start: usize, mid: usize, end: usize) -> Result
 ///
 /// Change the population counts to improve Huffman tree compression,
 /// particularly its RLE part.
-///
-/// This is a rewrite of the original `deflate.c` method.
 fn optimize_huffman_for_rle(mut counts: &mut [usize]) {
 	// Convert counts to a proper slice with trailing zeroes trimmed.
 	let ptr = counts.as_mut_ptr();
@@ -1254,7 +1237,6 @@ fn patch_distance_codes(d_lengths: &mut [u32; ZOPFLI_NUM_D]) {
 }
 
 #[allow(clippy::too_many_arguments)]
-#[inline(never)]
 /// # (Maybe) Add LZ77 Expensive Fixed Block.
 ///
 /// This runs the full suite of fixed-tree tests on the data and writes it to
diff --git a/src/image/zopflipng/cache.rs b/src/image/zopflipng/cache.rs
index 3ba489c..90ff41c 100644
--- a/src/image/zopflipng/cache.rs
+++ b/src/image/zopflipng/cache.rs
@@ -80,7 +80,6 @@ impl MatchCache {
 		}
 	}
 
-	#[inline]
 	#[allow(clippy::cast_possible_truncation)]
 	/// # Find Match.
 	///
diff --git a/src/image/zopflipng/hash.rs b/src/image/zopflipng/hash.rs
index 5359e42..0d4c1fd 100644
--- a/src/image/zopflipng/hash.rs
+++ b/src/image/zopflipng/hash.rs
@@ -244,7 +244,6 @@ impl ZopfliState {
 
 impl ZopfliState {
 	#[allow(clippy::cast_possible_truncation)]
-	#[inline]
 	/// # Trace Paths.
 	///
 	/// Calculate the optimal path of lz77 lengths to use, from the
@@ -381,7 +380,6 @@ impl ZopfliHash {
 		clippy::cast_possible_wrap,
 		clippy::similar_names,
 	)]
-	#[inline]
 	/// # Update Hash.
 	///
 	/// This updates the hash tables using the data from `arr`. The `pos` value
@@ -421,18 +419,14 @@ impl ZopfliHash {
 
 impl ZopfliHash {
 	#[allow(clippy::cast_possible_truncation)]
-	#[inline]
 	/// # Get Best Lengths.
 	///
 	/// This method performs the forward pass for "squeeze", calculating the
 	/// optimal length to reach every byte from a previous byte. The resulting
 	/// cost is returned.
 	///
-	/// Note: the repeated float truncation looks like an oversight but is
-	/// intentional; trying to use only one or the other exclusively alters the
-	/// outcome, so whatever. Haha.
-	///
-	/// This is a rewrite of the original `squeeze.c` method.
+	/// Note: the costs really do need to be calculated in 64 bits, truncated
+	/// to 32 bits for storage, then widened back to 64 bits for comparison.
 	fn get_best_lengths(
 		&mut self,
 		arr: &[u8],
@@ -572,7 +566,6 @@ impl ZopfliHash {
 	}
 
 	#[allow(clippy::cast_possible_truncation)]
-	#[inline]
 	/// # Best Length Max Match.
 	///
 	/// This fast-forwards through long repetitions in the middle of a
@@ -675,8 +668,7 @@ impl ZopfliHash {
 					Some(instart),
 				)?;
 
-				// This logic is so screwy; I hesitate to make this a debug
-				// assertion!
+				// Make sure we were able to find what we were expecting.
 				if test_length != length && test_length > 2 {
 					return Err(zopfli_error!());
 				}
@@ -999,8 +991,6 @@ impl ZopfliHashChain {
 ///
 /// This is a simplistic cost model for the "greedy" LZ77 pass that helps it
 /// make a slightly better choice between two options during lazy matching.
-///
-/// This is a rewrite of the original `lz77.c` method.
 const fn get_length_score(length: u16, distance: u16) -> u16 {
 	if 1024 < distance { length - 1 }
 	else { length }
diff --git a/src/image/zopflipng/kat.rs b/src/image/zopflipng/kat.rs
index 4590aa6..1177dee 100644
--- a/src/image/zopflipng/kat.rs
+++ b/src/image/zopflipng/kat.rs
@@ -141,17 +141,14 @@ struct Leaf<'a> {
 impl<'a> Eq for Leaf<'a> {}
 
 impl<'a> Ord for Leaf<'a> {
-	#[inline]
 	fn cmp(&self, other: &Self) -> Ordering { self.frequency.cmp(&other.frequency) }
 }
 
 impl<'a> PartialEq for Leaf<'a> {
-	#[inline]
 	fn eq(&self, other: &Self) -> bool { self.frequency == other.frequency }
 }
 
 impl<'a> PartialOrd for Leaf<'a> {
-	#[inline]
 	fn partial_cmp(&self, other: &Self) -> Option<Ordering> { Some(self.cmp(other)) }
 }
 
@@ -165,11 +162,9 @@ struct List<'a> {
 }
 
 impl<'a> List<'a> {
-	#[inline]
 	/// # Rotate.
 	fn rotate(&mut self) { self.lookahead0 = self.lookahead1; }
 
-	#[inline]
 	/// # Weight Sum.
 	const fn weight_sum(&self) -> usize {
 		self.lookahead0.weight + self.lookahead1.weight
diff --git a/src/image/zopflipng/lz77.rs b/src/image/zopflipng/lz77.rs
index cf5bbdf..dfc0304 100644
--- a/src/image/zopflipng/lz77.rs
+++ b/src/image/zopflipng/lz77.rs
@@ -51,13 +51,11 @@ impl LZ77Store {
 		self.d_counts.truncate(0);
 	}
 
-	#[inline(never)]
 	/// # Push Values.
 	pub(crate) fn push(&mut self, litlen: u16, dist: u16, pos: usize) -> Result<(), ZopfliError> {
 		LZ77StoreEntry::new(litlen, dist, pos).map(|e| self.push_entry(e))
 	}
 
-	#[inline]
 	/// # Push Entry.
 	fn push_entry(&mut self, entry: LZ77StoreEntry) {
 		let old_len = self.entries.len();

From 45f8c1c406ac42d1bf5693c3a7942ecfc870ac4e Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sun, 21 Apr 2024 17:40:45 -0700
Subject: [PATCH 24/31] cleanup: use tuple for length/distance cache instead of
 merging values into a single u32

---
 src/image/zopflipng/cache.rs | 49 +++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/src/image/zopflipng/cache.rs b/src/image/zopflipng/cache.rs
index 90ff41c..620ccc6 100644
--- a/src/image/zopflipng/cache.rs
+++ b/src/image/zopflipng/cache.rs
@@ -19,7 +19,7 @@ use super::{
 ///
 /// Length and distance are always fetched/stored together, so are grouped into
 /// a single value to reduce indexing/bounds overhead.
-const DEFAULT_LD: u32 = u32::from_le_bytes([1, 0, 0, 0]);
+const DEFAULT_LD: (u16, u16) = (1, 0);
 
 /// # Sublength Cache Entries.
 const ZOPFLI_CACHE_LENGTH: usize = 8;
@@ -37,7 +37,7 @@ const SUBLEN_CACHED_LEN: usize = ZOPFLI_CACHE_LENGTH * 3;
 /// sublengths. Its memory usage is no joke, but the performance savings more
 /// than make up for it.
 pub(crate) struct MatchCache {
-	ld: Vec<u32>,
+	ld: Vec<(u16, u16)>,
 	sublen: Vec<u8>,
 }
 
@@ -95,8 +95,13 @@ impl MatchCache {
 		distance: &mut u16,
 		length: &mut u16,
 	) -> Result<bool, ZopfliError> {
+		// One sanity check to rule them all.
+		if pos >= self.ld.len() || SUBLEN_CACHED_LEN * pos >= self.sublen.len() {
+			return Err(zopfli_error!());
+		}
+
 		// If we have no distance, we have no cache.
-		let (cache_len, cache_dist) = self.ld(pos);
+		let (cache_len, cache_dist) = self.ld[pos];
 		if cache_len != 0 && cache_dist == 0 { return Ok(false); }
 
 		// Find the max sublength once, if ever.
@@ -153,19 +158,6 @@ impl MatchCache {
 		Ok(false)
 	}
 
-	/// # Get Length and Distance.
-	fn ld(&self, pos: usize) -> (u16, u16) {
-		let [l1, l2, d1, d2] = self.ld[pos].to_le_bytes();
-		(u16::from_le_bytes([l1, l2]), u16::from_le_bytes([d1, d2]))
-	}
-
-	/// # Set Length and Distance.
-	fn set_ld(&mut self, pos: usize, len: u16, dist: u16) {
-		let [l1, l2] = len.to_le_bytes();
-		let [d1, d2] = dist.to_le_bytes();
-		self.ld[pos] = u32::from_le_bytes([l1, l2, d1, d2]);
-	}
-
 	#[allow(clippy::cast_possible_truncation)]
 	/// # Set Sublength.
 	///
@@ -177,29 +169,33 @@ impl MatchCache {
 		distance: u16,
 		length: u16,
 	) -> Result<(), ZopfliError> {
-		let old_ld = self.ld(pos);
-		if old_ld.0 == 0 || old_ld.1 != 0 { return Ok(()); }
-		else if old_ld != (1, 0) { return Err(zopfli_error!()); }
+		match self.ld.get(pos).copied() {
+			// If the current value is the default, let's proceed!
+			Some(DEFAULT_LD) => {},
+			// If the current value is something else and legit, abort happy.
+			Some((l, d)) if l == 0 || d != 0 => return Ok(()),
+			// Otherwise abort sad!
+			_ => return Err(zopfli_error!()),
+		}
 
 		// The sublength isn't cacheable, but that fact is itself worth
 		// caching!
 		if usize::from(length) < ZOPFLI_MIN_MATCH {
-			self.set_ld(pos, 0, 0);
+			self.ld[pos] = (0, 0);
 			return Ok(());
 		}
 
 		// Save the length/distance bit.
 		if distance == 0 { return Err(zopfli_error!()); }
-		self.set_ld(pos, length, distance);
+		self.ld[pos] = (length, distance);
 
 		// The cache gets written three bytes at a time; this iterator will
 		// help us eliminate the bounds checks we'd otherwise run into.
-		// let mut dst = self.sublen[SUBLEN_CACHED_LEN * pos..SUBLEN_CACHED_LEN * pos + SUBLEN_CACHED_LEN].chunks_exact_mut(3);
 		let mut dst = self.sublen.chunks_exact_mut(3)
 			.skip(ZOPFLI_CACHE_LENGTH * pos)
 			.take(ZOPFLI_CACHE_LENGTH);
 
-		// Write all mismatched pairs.
+		// Start by writing all mismatched pairs, up to the limit.
 		for (i, pair) in sublen.windows(2).skip(ZOPFLI_MIN_MATCH).take(usize::from(length) - 3).enumerate() {
 			if pair[0] != pair[1] {
 				let Some(next) = dst.next() else { return Ok(()); };
@@ -208,13 +204,14 @@ impl MatchCache {
 			}
 		}
 
-		// Write the final length if we're still here.
+		// The final value is implicitly "mismatched"; if we haven't hit the
+		// limit we should write it too.
 		if let Some(next) = dst.next() {
 			next[0] = (length - 3) as u8;
 			next[1..].copy_from_slice(sublen[usize::from(length)].to_le_bytes().as_slice());
 
-			// And copy that value to the end of the cache if we still haven't
-			// hit the limit.
+			// If we're still below the limit, copy (just) the length to the
+			// last slot to simplify any subsequent max_length lookups.
 			if let Some([c1, _rest @ ..]) = dst.last() {
 				*c1 = (length - 3) as u8;
 			}

From e49e580dd601fbeeed41984a78da92dbdc0b7b03 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Sun, 21 Apr 2024 18:20:33 -0700
Subject: [PATCH 25/31] cleanup: convert some more bound panics to Err

---
 src/image/zopflipng/cache.rs | 90 ++++++++++++++++++------------------
 1 file changed, 44 insertions(+), 46 deletions(-)

diff --git a/src/image/zopflipng/cache.rs b/src/image/zopflipng/cache.rs
index 620ccc6..a50f0b0 100644
--- a/src/image/zopflipng/cache.rs
+++ b/src/image/zopflipng/cache.rs
@@ -96,9 +96,10 @@ impl MatchCache {
 		length: &mut u16,
 	) -> Result<bool, ZopfliError> {
 		// One sanity check to rule them all.
-		if pos >= self.ld.len() || SUBLEN_CACHED_LEN * pos >= self.sublen.len() {
-			return Err(zopfli_error!());
-		}
+		if pos >= self.ld.len() { return Err(zopfli_error!()); }
+		let cache_sublen: &[u8; SUBLEN_CACHED_LEN] = self.sublen.get(SUBLEN_CACHED_LEN * pos..SUBLEN_CACHED_LEN * pos + SUBLEN_CACHED_LEN)
+			.and_then(|s| s.try_into().ok())
+			.ok_or(zopfli_error!())?;
 
 		// If we have no distance, we have no cache.
 		let (cache_len, cache_dist) = self.ld[pos];
@@ -107,7 +108,7 @@ impl MatchCache {
 		// Find the max sublength once, if ever.
 		let maxlength =
 			if sublen.is_none() { 0 }
-			else { max_sublen(&self.sublen[SUBLEN_CACHED_LEN * pos..]) };
+			else { max_sublen(cache_sublen) };
 
 		// Proceed if our cached length or max sublength are under the limit.
 		if
@@ -120,15 +121,13 @@ impl MatchCache {
 			if sublen.is_none() || usize::from(cache_len) <= maxlength {
 				// Cap the length.
 				*length = cache_len;
-				if usize::from(*length) > *limit {
-					*length = *limit as u16;
-				}
+				if usize::from(*length) > *limit { *length = *limit as u16; }
 
 				// Set the distance from the sublength cache.
 				if let Some(s) = sublen {
 					// Pull the sublength from cache and pull the distance from
 					// that.
-					self.write_sublen(pos, usize::from(*length), s);
+					if 3 <= *length { write_sublen(cache_sublen, s); }
 					*distance = s[usize::from(*length)];
 
 					// Sanity check: make sure the sublength distance at length
@@ -142,9 +141,7 @@ impl MatchCache {
 					}
 				}
 				// Use the cached distance directly.
-				else {
-					*distance = cache_dist;
-				}
+				else { *distance = cache_dist; }
 
 				// We did stuff!
 				return Ok(true);
@@ -185,6 +182,12 @@ impl MatchCache {
 			return Ok(());
 		}
 
+		// Reslice it to the (inclusive) length, ignoring the first 3 entries
+		// since they're below the minimum give-a-shittable limit. Note that
+		// without them, each index can be represented (and stored) as a u8.
+		let slice = sublen.get(ZOPFLI_MIN_MATCH..=usize::from(length))
+			.ok_or(zopfli_error!())?;
+
 		// Save the length/distance bit.
 		if distance == 0 { return Err(zopfli_error!()); }
 		self.ld[pos] = (length, distance);
@@ -196,51 +199,27 @@ impl MatchCache {
 			.take(ZOPFLI_CACHE_LENGTH);
 
 		// Start by writing all mismatched pairs, up to the limit.
-		for (i, pair) in sublen.windows(2).skip(ZOPFLI_MIN_MATCH).take(usize::from(length) - 3).enumerate() {
+		for (i, pair) in (0_u8..).zip(slice.windows(2)) {
 			if pair[0] != pair[1] {
-				let Some(next) = dst.next() else { return Ok(()); };
-				next[0] = i as u8;
-				next[1..].copy_from_slice(pair[0].to_le_bytes().as_slice());
+				let Some([d0, d1, d2]) = dst.next() else { return Ok(()); };
+				*d0 = i;
+				[*d1, *d2] = pair[0].to_le_bytes();
 			}
 		}
 
 		// The final value is implicitly "mismatched"; if we haven't hit the
 		// limit we should write it too.
-		if let Some(next) = dst.next() {
-			next[0] = (length - 3) as u8;
-			next[1..].copy_from_slice(sublen[usize::from(length)].to_le_bytes().as_slice());
+		if let Some([d0, d1, d2]) = dst.next() {
+			*d0 = (length - 3) as u8;
+			[*d1, *d2] = slice[slice.len() - 1].to_le_bytes();
 
-			// If we're still below the limit, copy (just) the length to the
+			// If we're still below the limit, copy (only) the length to the
 			// last slot to simplify any subsequent max_length lookups.
-			if let Some([c1, _rest @ ..]) = dst.last() {
-				*c1 = (length - 3) as u8;
-			}
+			if let Some([d0, _, _]) = dst.last() { *d0 = (length - 3) as u8; }
 		}
 
 		Ok(())
 	}
-
-	/// # Write Sublength.
-	///
-	/// Fill the provided sublength slice with data from the cache.
-	fn write_sublen(&self, pos: usize, length: usize, sublen: &mut [u16; SUBLEN_LEN]) {
-		// Short circuit.
-		if length < ZOPFLI_MIN_MATCH { return; }
-
-		let slice = &self.sublen[SUBLEN_CACHED_LEN * pos..SUBLEN_CACHED_LEN * pos + SUBLEN_CACHED_LEN];
-		let maxlength = max_sublen(slice);
-		let mut prevlength = 0;
-
-		for chunk in slice.chunks_exact(3) {
-			let length = usize::from(chunk[0]) + ZOPFLI_MIN_MATCH;
-			if prevlength <= length {
-				let dist = u16::from_le_bytes([chunk[1], chunk[2]]);
-				sublen[prevlength..=length].fill(dist);
-			}
-			if length == maxlength { return; }
-			prevlength = length + 1;
-		}
-	}
 }
 
 
@@ -248,7 +227,26 @@ impl MatchCache {
 /// # Max Sublength.
 ///
 /// Return the maximum sublength length for a given chunk.
-const fn max_sublen(slice: &[u8]) -> usize {
-	if slice.len() < SUBLEN_CACHED_LEN || (slice[1] == 0 && slice[2] == 0) { 0 }
+const fn max_sublen(slice: &[u8; SUBLEN_CACHED_LEN]) -> usize {
+	// If the first chunk has no distance, assume a zero length.
+	if slice[1] == 0 && slice[2] == 0 { 0 }
+	// Otherwise the "max" is stored as the first value of the last chunk.
 	else { slice[SUBLEN_CACHED_LEN - 3] as usize + 3 }
 }
+
+/// # Write Sublength.
+///
+/// Fill the provided sublength slice with data from the cache.
+fn write_sublen(src: &[u8; SUBLEN_CACHED_LEN], dst: &mut [u16; SUBLEN_LEN]) {
+	let maxlength = max_sublen(src);
+	let mut old = 0;
+	for chunk in src.chunks_exact(3) {
+		let length = usize::from(chunk[0]) + ZOPFLI_MIN_MATCH;
+		if old <= length {
+			let value = u16::from_le_bytes([chunk[1], chunk[2]]);
+			dst[old..=length].fill(value);
+		}
+		if length == maxlength { return; }
+		old = length + 1;
+	}
+}

From 02571b3bd2d4dcb4d7f839226d1c2d64ebb154f2 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Mon, 22 Apr 2024 10:02:17 -0700
Subject: [PATCH 26/31] cleanup: move/refactor lengths_to_symbols

---
 src/image/zopflipng/blocks.rs | 44 +++++++++++++++++++++++++++++------
 src/image/zopflipng/mod.rs    | 37 -----------------------------
 2 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index eeb62ff..f72a950 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -21,7 +21,6 @@ use super::{
 	},
 	zopfli_error,
 	zopfli_length_limited_code_lengths,
-	zopfli_lengths_to_symbols,
 	ZOPFLI_NUM_D,
 	ZOPFLI_NUM_LL,
 	ZopfliError,
@@ -420,10 +419,8 @@ fn add_lz77_block(
 		};
 
 	// Now sort out the symbols.
-	let mut ll_symbols = [0_u32; ZOPFLI_NUM_LL];
-	let mut d_symbols = [0_u32; ZOPFLI_NUM_D];
-	zopfli_lengths_to_symbols::<16, ZOPFLI_NUM_LL>(&ll_lengths, &mut ll_symbols);
-	zopfli_lengths_to_symbols::<16, ZOPFLI_NUM_D>(&d_lengths, &mut d_symbols);
+	let ll_symbols = lengths_to_symbols::<16, ZOPFLI_NUM_LL>(&ll_lengths)?;
+	let d_symbols = lengths_to_symbols::<16, ZOPFLI_NUM_D>(&d_lengths)?;
 
 	// Write all the data!
 	add_lz77_data(
@@ -865,8 +862,7 @@ fn encode_tree(
 	// Write the tree!
 	if let Some(out) = out {
 		// Convert the lengths to symbols.
-		let mut cl_symbols = [0_u32; 19];
-		zopfli_lengths_to_symbols::<8, 19>(&cl_lengths, &mut cl_symbols);
+		let cl_symbols = lengths_to_symbols::<8, 19>(&cl_lengths)?;
 
 		// Write the main lengths.
 		out.add_bits(hlit as u32, 5);
@@ -1044,6 +1040,40 @@ fn get_lz77_byte_range(
 	else { Err(zopfli_error!()) }
 }
 
+/// # Zopfli Lengths to Symbols.
+///
+/// This updates the symbol array given the corresponding lengths.
+fn lengths_to_symbols<const MAXBITS: usize, const SIZE: usize>(lengths: &[u32; SIZE])
+-> Result<[u32; SIZE], ZopfliError> {
+	// Count up the codes by code length.
+	let mut counts: [u32; MAXBITS] = [0; MAXBITS];
+	for l in lengths.iter().copied() {
+		if (l as usize) < MAXBITS { counts[l as usize] += 1; }
+		else { return Err(zopfli_error!()); }
+	}
+
+	// Find the numerical value of the smallest code for each code length.
+	counts[0] = 0;
+	let mut code = 0;
+	let mut next_code: [u32; MAXBITS] = [0; MAXBITS];
+	for i in 1..MAXBITS {
+		code = (code + counts[i - 1]) << 1;
+		next_code[i] = code;
+	}
+
+	// Update the symbols accordingly.
+	let mut symbols = [0; SIZE];
+	for (s, l) in symbols.iter_mut().zip(lengths.iter().copied()) {
+		// The MAXBITS comparison is only for the compiler; it will never not
+		// be true.
+		if l != 0 && (l as usize) < MAXBITS {
+			*s = next_code[l as usize];
+			next_code[l as usize] += 1;
+		}
+	}
+	Ok(symbols)
+}
+
 /// # Optimal LZ77.
 ///
 /// Calculate lit/len and dist pairs for the dataset.
diff --git a/src/image/zopflipng/mod.rs b/src/image/zopflipng/mod.rs
index 2194947..a077f63 100644
--- a/src/image/zopflipng/mod.rs
+++ b/src/image/zopflipng/mod.rs
@@ -162,40 +162,3 @@ fn encode(
 
 	Some(out)
 }
-
-#[allow(unsafe_code)]
-/// # Zopfli Lengths to Symbols.
-///
-/// This updates the symbol array given the corresponding lengths.
-fn zopfli_lengths_to_symbols<const MAXBITS: usize, const SIZE: usize>(
-	lengths: &[u32; SIZE],
-	symbols: &mut [u32; SIZE],
-) {
-	// Count up the codes by code length.
-	let mut counts: [u32; MAXBITS] = [0; MAXBITS];
-	for l in lengths {
-		let l = *l as usize;
-		if l < MAXBITS { counts[l] += 1; }
-		else { return; }
-	}
-
-	// Find the numerical value of the smallest code for each code length.
-	counts[0] = 0;
-	let mut code = 0;
-	let mut next_code: [u32; MAXBITS] = [0; MAXBITS];
-	for i in 1..MAXBITS {
-		code = (code + counts[i - 1]) << 1;
-		next_code[i] = code;
-	}
-
-	// Update the symbols accordingly.
-	for (s, l) in symbols.iter_mut().zip(lengths.iter().copied()) {
-		if l == 0 { *s = 0; }
-		else {
-			// Safety: all lengths were tested to be < MAXBITS a few lines up.
-			debug_assert!((l as usize) < MAXBITS);
-			*s = unsafe { *next_code.get_unchecked(l as usize) };
-			next_code[l as usize] += 1;
-		}
-	}
-}

From 73f53cc7e7a334a390e8c55992c25c9909f1fdad Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Mon, 22 Apr 2024 11:24:10 -0700
Subject: [PATCH 27/31] cleanup: refactor optimize_huffman_for_rle to avoid
 unsafe, elide some bound checks

---
 src/image/zopflipng/blocks.rs | 75 ++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 37 deletions(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index f72a950..56af743 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -6,7 +6,10 @@ and ends that didn't make it into other modules.
 */
 
 use dactyl::NoHash;
-use std::collections::HashSet;
+use std::{
+	cell::Cell,
+	collections::HashSet,
+};
 use super::{
 	DEFLATE_ORDER,
 	DISTANCE_BITS,
@@ -299,14 +302,14 @@ enum BlockType {
 /// It moots the need to collect such values into a vector in advance,
 /// reducing the number of passes required to optimize Huffman codes.
 struct GoodForRle<'a> {
-	counts: &'a [usize],
+	counts: &'a [Cell<usize>],
 	good: usize,
 	bad: usize,
 }
 
 impl<'a> GoodForRle<'a> {
 	/// # New Instance.
-	const fn new(counts: &'a [usize]) -> Self {
+	const fn new(counts: &'a [Cell<usize>]) -> Self {
 		Self { counts, good: 0, bad: 0 }
 	}
 }
@@ -330,13 +333,13 @@ impl<'a> Iterator for GoodForRle<'a> {
 
 		// See how many times the next entry is repeated, if at all, shortening
 		// the slice accordingly.
-		let scratch = self.counts[0];
+		let scratch = self.counts[0].get();
 		let mut stride = 0;
 		while let [count, rest @ ..] = self.counts {
 			// Note the reptition and circle back around. This will always
 			// trigger on the first pass, so stride will always be at least
 			// one.
-			if *count == scratch {
+			if count.get() == scratch {
 				stride += 1;
 				self.counts = rest;
 			}
@@ -1178,34 +1181,35 @@ fn split_cost(store: &LZ77Store, start: usize, mid: usize, end: usize) -> Result
 	)
 }
 
-#[allow(unsafe_code, clippy::integer_division)]
+#[allow(clippy::integer_division)]
 /// # Optimize Huffman RLE Compression.
 ///
 /// Change the population counts to improve Huffman tree compression,
 /// particularly its RLE part.
 fn optimize_huffman_for_rle(mut counts: &mut [usize]) {
 	// Convert counts to a proper slice with trailing zeroes trimmed.
-	let ptr = counts.as_mut_ptr();
 	while let [ rest @ .., 0 ] = counts { counts = rest; }
 	if counts.is_empty() { return; }
 
+	// We need to read and write simultaneously; once again the Cell trick can
+	// keep us safe!
+	let counts = Cell::from_mut(counts).as_slice_of_cells();
+
 	// Find collapseable ranges!
 	let mut stride = 0;
-	let mut scratch = counts[0];
+	let mut scratch = counts[0].get();
 	let mut sum = 0;
-	let four = counts.len().saturating_sub(3);
-	for (i, (&count, good)) in counts.iter().zip(GoodForRle::new(counts)).enumerate() {
+	for (i, (count, good)) in counts.iter().map(Cell::get).zip(GoodForRle::new(counts)).enumerate() {
 		// Time to reset (and maybe collapse).
 		if good || count.abs_diff(scratch) >= 4 {
 			// Collapse the stride if it is as least four and contained
 			// something non-zero.
 			if sum != 0 && stride >= 4 {
 				let v = ((sum + stride / 2) / stride).max(1);
-				// Safety: this is a very un-Rust thing to do, but we're only
-				// modifying values after-the-fact; the current and future
-				// data remains as it was.
-				unsafe {
-					for j in i - stride..i { ptr.add(j).write(v); }
+				// This condition just helps the compiler understand the range
+				// won't overflow; it can't, but it doesn't know that.
+				if let Some(from) = i.checked_sub(stride) {
+					for c in &counts[from..i] { c.set(v); }
 				}
 			}
 
@@ -1213,21 +1217,13 @@ fn optimize_huffman_for_rle(mut counts: &mut [usize]) {
 			stride = 0;
 			sum = 0;
 
-			// If we have at least four remaining values (including the
-			// current), take a sort of weighted average of them.
-			if i < four {
-				scratch = (
-					// Safety: the compiler doesn't understand (i < four) means
-					// we have at least i+3 entries left.
-					unsafe { *counts.get_unchecked(i + 3) } +
-					counts[i + 2] +
-					counts[i + 1] +
-					count +
-					2
-				) / 4;
-			}
-			// Otherwise just use the current value.
-			else { scratch = count; }
+			// If there are at least three future counts, we can set scratch
+			// to a sorted weighted average, otherwise the current value will
+			// do.
+			scratch = counts.get(i..i + 4).map_or(
+				count,
+				|c| c.iter().fold(2, |a, c| a + c.get()) / 4
+			);
 		}
 
 		stride += 1;
@@ -1237,8 +1233,11 @@ fn optimize_huffman_for_rle(mut counts: &mut [usize]) {
 	// Collapse the trailing stride, if any.
 	if sum != 0 && stride >= 4 {
 		let v = ((sum + stride / 2) / stride).max(1);
-		let len = counts.len();
-		counts[len - stride..].fill(v);
+		// This condition just helps the compiler understand the range won't
+		// overflow; it can't, but it doesn't know that.
+		if let Some(from) = counts.len().checked_sub(stride) {
+			for c in &counts[from..] { c.set(v); }
+		}
 	}
 }
 
@@ -1390,11 +1389,13 @@ mod test {
 
 	#[test]
 	fn t_good_for_rle() {
-		const COUNTS1: &[usize] = &[196, 23, 10, 12, 5, 4, 1, 23, 8, 2, 6, 5, 0, 0, 0, 29, 5, 0, 0, 4, 4, 1, 0, 5, 2, 0, 0, 1, 4, 0, 1, 34, 10, 5, 7, 2, 1, 2, 0, 0, 3, 2, 5, 0, 1, 0, 0, 4, 2, 1, 0, 0, 1, 1, 0, 1, 1, 2, 0, 1, 4, 1, 5, 47, 13, 0, 5, 3, 1, 2, 0, 4, 0, 1, 6, 3, 0, 0, 0, 1, 3, 2, 2, 1, 4, 6, 0, 5, 0, 0, 1, 0, 0, 0, 1, 10, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 3, 1, 0, 0, 4, 0, 5, 47, 28, 3, 2, 5, 3, 0, 0, 1, 7, 0, 8, 1, 1, 1, 0, 4, 7, 2, 0, 1, 10, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 7, 11, 4, 1, 1, 0, 3, 0, 1, 1, 1, 5, 1, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 0, 0, 2, 13, 27, 4, 1, 4, 1, 1, 0, 2, 2, 0, 0, 0, 3, 0, 0, 3, 8, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 4, 24, 1, 4, 4, 2, 2, 0, 5, 6, 1, 1, 1, 1, 1, 0, 0, 42, 6, 3, 3, 3, 6, 0, 6, 30, 9, 10, 8, 33, 9, 44, 284, 1, 15, 21, 0, 55, 0, 19, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 13, 320, 12, 0, 0, 17, 3, 0, 3, 2];
-		const COUNTS2: &[usize] = &[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 122, 0, 288, 11, 41, 6, 5, 2, 0, 0, 0, 1];
-		const COUNTS3: &[usize] = &[201, 24, 10, 12, 5, 4, 1, 24, 8, 2, 6, 4, 0, 0, 0, 29, 5, 0, 0, 4, 4, 1, 0, 5, 2, 0, 0, 1, 4, 0, 1, 34, 10, 5, 7, 2, 1, 2, 0, 0, 3, 2, 5, 0, 1, 0, 0, 4, 2, 1, 0, 0, 1, 1, 0, 1, 1, 2, 0, 1, 4, 1, 5, 47, 13, 0, 5, 3, 1, 2, 0, 4, 0, 1, 6, 3, 0, 0, 0, 1, 3, 2, 2, 1, 4, 6, 0, 5, 0, 0, 1, 0, 0, 0, 1, 10, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 3, 1, 0, 0, 4, 0, 5, 49, 28, 3, 2, 5, 3, 0, 0, 1, 7, 0, 9, 1, 1, 1, 0, 4, 6, 2, 0, 1, 8, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 7, 11, 4, 1, 1, 0, 3, 0, 1, 1, 1, 5, 1, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 0, 0, 2, 13, 27, 4, 1, 4, 1, 1, 0, 2, 2, 0, 0, 0, 3, 0, 0, 3, 8, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 4, 24, 1, 4, 4, 2, 2, 0, 5, 6, 1, 1, 1, 1, 1, 0, 0, 44, 6, 3, 3, 3, 6, 0, 6, 30, 9, 10, 8, 33, 9, 46, 281, 1, 20, 3, 10, 59, 0, 4, 12, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 13, 318, 12, 0, 0, 21, 0, 0, 3, 2];
+		for c in [
+			[196, 23, 10, 12, 5, 4, 1, 23, 8, 2, 6, 5, 0, 0, 0, 29, 5, 0, 0, 4, 4, 1, 0, 5, 2, 0, 0, 1, 4, 0, 1, 34, 10, 5, 7, 2, 1, 2, 0, 0, 3, 2, 5, 0, 1, 0, 0, 4, 2, 1, 0, 0, 1, 1, 0, 1, 1, 2, 0, 1, 4, 1, 5, 47, 13, 0, 5, 3, 1, 2, 0, 4, 0, 1, 6, 3, 0, 0, 0, 1, 3, 2, 2, 1, 4, 6, 0, 5, 0, 0, 1, 0, 0, 0, 1, 10, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 3, 1, 0, 0, 4, 0, 5, 47, 28, 3, 2, 5, 3, 0, 0, 1, 7, 0, 8, 1, 1, 1, 0, 4, 7, 2, 0, 1, 10, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 7, 11, 4, 1, 1, 0, 3, 0, 1, 1, 1, 5, 1, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 0, 0, 2, 13, 27, 4, 1, 4, 1, 1, 0, 2, 2, 0, 0, 0, 3, 0, 0, 3, 8, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 4, 24, 1, 4, 4, 2, 2, 0, 5, 6, 1, 1, 1, 1, 1, 0, 0, 42, 6, 3, 3, 3, 6, 0, 6, 30, 9, 10, 8, 33, 9, 44, 284, 1, 15, 21, 0, 55, 0, 19, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 13, 320, 12, 0, 0, 17, 3, 0, 3, 2].as_mut_slice(),
+			[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 122, 0, 288, 11, 41, 6, 5, 2, 0, 0, 0, 1].as_mut_slice(),
+			[201, 24, 10, 12, 5, 4, 1, 24, 8, 2, 6, 4, 0, 0, 0, 29, 5, 0, 0, 4, 4, 1, 0, 5, 2, 0, 0, 1, 4, 0, 1, 34, 10, 5, 7, 2, 1, 2, 0, 0, 3, 2, 5, 0, 1, 0, 0, 4, 2, 1, 0, 0, 1, 1, 0, 1, 1, 2, 0, 1, 4, 1, 5, 47, 13, 0, 5, 3, 1, 2, 0, 4, 0, 1, 6, 3, 0, 0, 0, 1, 3, 2, 2, 1, 4, 6, 0, 5, 0, 0, 1, 0, 0, 0, 1, 10, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 3, 1, 0, 0, 4, 0, 5, 49, 28, 3, 2, 5, 3, 0, 0, 1, 7, 0, 9, 1, 1, 1, 0, 4, 6, 2, 0, 1, 8, 0, 0, 2, 1, 0, 0, 1, 0, 0, 0, 7, 11, 4, 1, 1, 0, 3, 0, 1, 1, 1, 5, 1, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 0, 0, 2, 13, 27, 4, 1, 4, 1, 1, 0, 2, 2, 0, 0, 0, 3, 0, 0, 3, 8, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 1, 1, 1, 4, 24, 1, 4, 4, 2, 2, 0, 5, 6, 1, 1, 1, 1, 1, 0, 0, 44, 6, 3, 3, 3, 6, 0, 6, 30, 9, 10, 8, 33, 9, 46, 281, 1, 20, 3, 10, 59, 0, 4, 12, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 13, 318, 12, 0, 0, 21, 0, 0, 3, 2].as_mut_slice(),
+		] {
+			let c = Cell::from_mut(c).as_slice_of_cells();
 
-		for c in [COUNTS1, COUNTS2, COUNTS3] {
 			// Make sure our ExactSizeness is working.
 			let good = GoodForRle::new(c);
 			assert_eq!(
@@ -1403,7 +1404,7 @@ mod test {
 				"GoodForRle iterator count does not match source.",
 			);
 
-			// And make sure that is the count we actually end up with.
+			// And make sure we actually collect that count!
 			let good = good.collect::<Vec<bool>>();
 			assert_eq!(
 				good.len(),

From 6907730835a8a85331aba230fe388b6f7385cf44 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Mon, 22 Apr 2024 12:36:40 -0700
Subject: [PATCH 28/31] cleanup: refactor length_limited_code_lengths

---
 src/image/zopflipng/blocks.rs |  12 +-
 src/image/zopflipng/kat.rs    | 383 ++++++++++++++++++----------------
 src/image/zopflipng/mod.rs    |   2 +-
 3 files changed, 206 insertions(+), 191 deletions(-)

diff --git a/src/image/zopflipng/blocks.rs b/src/image/zopflipng/blocks.rs
index 56af743..3f5631f 100644
--- a/src/image/zopflipng/blocks.rs
+++ b/src/image/zopflipng/blocks.rs
@@ -16,6 +16,7 @@ use super::{
 	DISTANCE_VALUES,
 	FIXED_TREE_D,
 	FIXED_TREE_LL,
+	length_limited_code_lengths,
 	LENGTH_SYMBOLS_BITS_VALUES,
 	LZ77Store,
 	stats::{
@@ -23,7 +24,6 @@ use super::{
 		SymbolStats,
 	},
 	zopfli_error,
-	zopfli_length_limited_code_lengths,
 	ZOPFLI_NUM_D,
 	ZOPFLI_NUM_LL,
 	ZopfliError,
@@ -854,7 +854,7 @@ fn encode_tree(
 	}
 
 	// Update the lengths and symbols given the counts.
-	zopfli_length_limited_code_lengths::<7, 19>(&cl_counts, &mut cl_lengths)?;
+	length_limited_code_lengths::<7, 19>(&cl_counts, &mut cl_lengths)?;
 
 	// Find the last non-zero index of the counts table.
 	let mut hclen = 15;
@@ -1011,8 +1011,8 @@ fn get_dynamic_lengths(
 	let (mut ll_counts, d_counts) = store.histogram(lstart, lend)?;
 	ll_counts[256] = 1;
 
-	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_LL>(&ll_counts, ll_lengths)?;
-	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_D>(&d_counts, d_lengths)?;
+	length_limited_code_lengths::<15, ZOPFLI_NUM_LL>(&ll_counts, ll_lengths)?;
+	length_limited_code_lengths::<15, ZOPFLI_NUM_D>(&d_counts, d_lengths)?;
 
 	patch_distance_codes(d_lengths);
 	try_optimize_huffman_for_rle(
@@ -1354,8 +1354,8 @@ fn try_optimize_huffman_for_rle(
 	let mut d_counts2 = *d_counts;
 	optimize_huffman_for_rle(&mut ll_counts2);
 	optimize_huffman_for_rle(&mut d_counts2);
-	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_LL>(&ll_counts2, &mut ll_lengths2)?;
-	zopfli_length_limited_code_lengths::<15, ZOPFLI_NUM_D>(&d_counts2, &mut d_lengths2)?;
+	length_limited_code_lengths::<15, ZOPFLI_NUM_LL>(&ll_counts2, &mut ll_lengths2)?;
+	length_limited_code_lengths::<15, ZOPFLI_NUM_D>(&d_counts2, &mut d_lengths2)?;
 	patch_distance_codes(&mut d_lengths2);
 
 	// Calculate the optimized tree and data sizes.
diff --git a/src/image/zopflipng/kat.rs b/src/image/zopflipng/kat.rs
index 1177dee..a1d41fc 100644
--- a/src/image/zopflipng/kat.rs
+++ b/src/image/zopflipng/kat.rs
@@ -12,7 +12,7 @@ use std::{
 		RefCell,
 	},
 	cmp::Ordering,
-	mem::MaybeUninit,
+	num::NonZeroUsize,
 };
 use super::{
 	zopfli_error,
@@ -24,7 +24,7 @@ use super::{
 thread_local!(
 	/// # Shared Arena.
 	///
-	/// Each call to `zopfli_length_limited_code_lengths` generates a hefty
+	/// Each call to `length_limited_code_lengths` generates a hefty
 	/// list of recursive node chains. This helps mitigate the costs of
 	/// reallocation.
 	static BUMP: RefCell<Bump> = RefCell::new(Bump::with_capacity(32_768))
@@ -32,110 +32,72 @@ thread_local!(
 
 
 
-#[allow(unsafe_code)]
 /// # Length Limited Code Lengths.
 ///
 /// This writes minimum-redundancy length-limited code bitlengths for symbols
-/// with the given counts, limited by `MAXBITS`.
-pub(crate) fn zopfli_length_limited_code_lengths<const MAXBITS: usize, const SIZE: usize>(
+/// with the given counts, limited by `MAXBITS` (either 7 or 15 in practice).
+pub(crate) fn length_limited_code_lengths<const MAXBITS: usize, const SIZE: usize>(
 	frequencies: &[usize; SIZE],
 	bitlengths: &mut [u32; SIZE],
 ) -> Result<(), ZopfliError> {
-	// Convert (used) frequencies to leaves. There will never be more than
-	// ZOPFLI_NUM_LL of them, but often there will be less, so we'll leverage
-	// MaybeUninit to save unnecessary writes.
-	let mut leaves: [MaybeUninit<Leaf>; SIZE] = unsafe {
-		MaybeUninit::uninit().assume_init()
-	};
+	// SIZE is only ever going to be 19, 32, or 288 at runtime, but the
+	// compiler won't necessarily bother to confirm that in advance. Our tests
+	// go as low as 6, so let's just run with that.
+	if SIZE < 6 { return Err(zopfli_error!()); }
+
+	// For performance reasons the bitlengths are passed by reference, but
+	// they should always be zero-filled by this point.
+	debug_assert!(bitlengths.iter().all(|b| *b == 0));
+
+	// Convert bitlengths to a slice-of-cells so we can chop it up willynilly
+	// without losing writeability.
+	let bitlengths = Cell::from_mut(bitlengths.as_mut_slice()).as_slice_of_cells();
+
+	// Build up a collection of "leaves" by joining each non-zero frequency
+	// with its corresponding bitlength.
+	let mut raw_leaves = [Leaf { frequency: NonZeroUsize::MIN, bitlength: &bitlengths[0] }; SIZE];
 	let mut len_leaves = 0;
-	for (&frequency, bitlength) in frequencies.iter().zip(bitlengths.iter_mut()) {
-		// Zero out the bitlength regardless.
-		*bitlength = 0;
-
-		if frequency != 0 {
-			leaves[len_leaves].write(Leaf { frequency, bitlength });
-			len_leaves += 1;
-		}
+	for (v, leaf) in frequencies.iter()
+		.copied()
+		.zip(bitlengths)
+		.filter_map(|(frequency, bitlength)| NonZeroUsize::new(frequency).map(
+			|frequency| Leaf { frequency, bitlength }
+		))
+		.zip(raw_leaves.iter_mut())
+	{
+		*leaf = v;
+		len_leaves += 1;
 	}
 
-	// Nothing to do!
-	if len_leaves == 0 { return Ok(()); }
-
-	// This method is either called with 15 maxbits and 32 or 288 potential
-	// leaves, or 7 maxbits and 19 potential leaves; in either case, the max
-	// leaves are well within range.
-	if (1 << MAXBITS) < len_leaves {
-		return Err(zopfli_error!());
-	}
+	// Shortcut: nothing to do!
+	if len_leaves == 0 || SIZE < len_leaves { return Ok(()); }
 
-	// Set up the pool!
-	BUMP.with_borrow_mut(|nodes| {
-		// The leaves we can actually use:
-		let final_leaves: &mut [Leaf] = unsafe {
-			&mut *(std::ptr::addr_of_mut!(leaves[..len_leaves]) as *mut [Leaf])
-		};
-
-		// Sortcut: weighting only applies when there are more than two leaves;
-		// otherwise we can just record their values as one and call it a day.
-		if len_leaves <= 2 {
-			for leaf in final_leaves { *leaf.bitlength = 1; }
-			return Ok(());
-		}
+	// Reslice to the leaves we're actually using.
+	let leaves = &mut raw_leaves[..len_leaves];
 
-		// Sort the leaves.
-		final_leaves.sort();
+	// Sortcut: weighting only applies when there are more than two leaves.
+	if leaves.len() <= 2 {
+		for leaf in leaves { leaf.bitlength.set(1); }
+		return Ok(());
+	}
 
-		// Shrink maxbits if we have fewer leaves. Note that "maxbits" is an
-		// inclusive value.
-		let maxbits =
-			if len_leaves - 1 < MAXBITS { len_leaves - 1 }
-			else { MAXBITS };
+	// Sort the leaves by frequency.
+	leaves.sort();
 
-		let lookahead0 = nodes.alloc(Node {
-			weight: final_leaves[0].frequency,
-			count: 1,
-			tail: Cell::new(None),
-		});
-		let lookahead1 = nodes.alloc(Node {
-			weight: final_leaves[1].frequency,
-			count: 2,
-			tail: Cell::new(None),
-		});
-		let mut pool = Pool {
-			nodes,
-			leaves: final_leaves,
-		};
-
-		// We won't have more than 15 lists, but we might have fewer.
-		let mut lists = [List { lookahead0, lookahead1 }; 15];
-		let final_lists = &mut lists[..maxbits];
-
-		// In the last list, (2 * len_leaves - 2) active chains need to be
-		// created. We have two already from initialization; each boundary_pm run
-		// will give us another.
-		let num_boundary_pm_runs = 2 * len_leaves - 4;
-		for _ in 0..num_boundary_pm_runs - 1 { pool.boundary_pm(final_lists); }
-
-		// Final touchups!
-		pool.boundary_pm_final(final_lists);
-
-		// Write the results!
-		pool.write_bit_lengths(final_lists[maxbits - 1].lookahead1);
-
-		// Please be kind, rewind!
-		nodes.reset();
-		Ok(())
-	})
+	// Crunch!
+	BUMP.with_borrow_mut(|nodes| llcl::<MAXBITS>(leaves, nodes))
 }
 
 
 
+#[derive(Clone, Copy)]
 /// # Leaf.
 ///
-/// This joins a frequency with its matching bitlength from the raw C slices.
+/// This is a simple tuple containing a non-zero frequency and its companion
+/// bitlength.
 struct Leaf<'a> {
-	frequency: usize,
-	bitlength: &'a mut u32,
+	frequency: NonZeroUsize,
+	bitlength: &'a Cell<u32>,
 }
 
 impl<'a> Eq for Leaf<'a> {}
@@ -156,6 +118,8 @@ impl<'a> PartialOrd for Leaf<'a> {
 
 #[derive(Clone, Copy)]
 /// # List.
+///
+/// This struct holds a pair of recursive node chains.
 struct List<'a> {
 	lookahead0: &'a Node<'a>,
 	lookahead1: &'a Node<'a>,
@@ -183,112 +147,161 @@ struct Node<'a> {
 
 
 
-/// # Node Pool.
-struct Pool<'a> {
-	nodes: &'a Bump,
-	leaves: &'a mut [Leaf<'a>],
+/// # Crunch the Code Lengths.
+///
+/// This method serves as the closure for the exported method's
+/// `BUMP.with_borrow_mut()` call, abstracted here mainly just to improve
+/// readability.
+fn llcl<'a, const MAXBITS: usize>(
+	leaves: &'a [Leaf<'a>],
+	nodes: &'a mut Bump,
+) -> Result<(), ZopfliError> {
+	// This can't happen; it is just a reminder for the compiler.
+	if leaves.len() < 3 || (1 << MAXBITS) < leaves.len() {
+		return Err(zopfli_error!());
+	}
+
+	// Shrink maxbits if we have fewer (leaves - 1).
+	let maxbits =
+		if leaves.len() - 1 < MAXBITS { leaves.len() - 1 }
+		else { MAXBITS };
+
+	let lookahead0 = nodes.alloc(Node {
+		weight: leaves[0].frequency.get(),
+		count: 1,
+		tail: Cell::new(None),
+	});
+
+	let lookahead1 = nodes.alloc(Node {
+		weight: leaves[1].frequency.get(),
+		count: 2,
+		tail: Cell::new(None),
+	});
+
+	// The max MAXBITS is only 15, so we might as well just (slightly)
+	// over-allocate an array to hold our lists.
+	let mut raw_lists = [List { lookahead0, lookahead1 }; MAXBITS];
+	let lists = &mut raw_lists[..maxbits];
+
+	// In the last list, (2 * len_leaves - 2) active chains need to be
+	// created. We have two already from initialization; each boundary_pm run
+	// will give us another.
+	for _ in 0..2 * leaves.len() - 5 {
+		llcl_boundary_pm(leaves, lists, nodes)?;
+	}
+
+	llcl_finish(leaves, lists, nodes)?;
+
+	// Please be kind, rewind!
+	nodes.reset();
+
+	Ok(())
 }
 
-impl<'a> Pool<'a> {
-	/// # Last Count Frequency.
-	const fn last_count_frequency(&self, last_count: usize) -> Option<usize> {
-		if last_count < self.leaves.len() {
-			Some(self.leaves[last_count].frequency)
+/// # Boundary Package-Merge Step.
+///
+/// Add a new chain to the list, using either a leaf or combination of two
+/// chains from the previous list.
+fn llcl_boundary_pm<'a>(leaves: &'a [Leaf<'a>], lists: &mut [List<'a>], nodes: &'a Bump)
+-> Result<(), ZopfliError> {
+	// This method should never be called with an empty list.
+	let [rest @ .., current] = lists else { return Err(zopfli_error!()); };
+	let last_count = current.lookahead1.count;
+
+	// We're at the beginning, which is the end since we're iterating in
+	// reverse.
+	if rest.is_empty() {
+		if last_count < leaves.len() {
+			// Shift the lookahead and add a new node.
+			current.rotate();
+			current.lookahead1 = nodes.alloc(Node {
+				weight: leaves[last_count].frequency.get(),
+				count: last_count + 1,
+				tail: current.lookahead0.tail.clone(),
+			});
 		}
-		else { None }
+		return Ok(());
 	}
 
-	/// # Boundary Package-Merge Step.
-	///
-	/// Add a new chain to the list, using either a leaf or combination of two
-	/// chains from the previous list.
-	fn boundary_pm(&mut self, lists: &mut [List<'a>]) {
-		let [rest @ .., current] = lists else { unreachable!(); };
-		let last_count = current.lookahead1.count;
-
-		// We're at the beginning, which is the end since we're iterating in
-		// reverse.
-		if rest.is_empty() {
-			if let Some(weight) = self.last_count_frequency(last_count) {
-				// Shift the lookahead and add a new node.
-				current.rotate();
-				current.lookahead1 = self.nodes.alloc(Node {
-					weight,
-					count: last_count + 1,
-					tail: current.lookahead0.tail.clone(),
-				});
-			}
-			return;
-		}
+	// Shift the lookahead.
+	current.rotate();
 
-		// Shift the lookahead.
-		current.rotate();
-
-		let previous = rest[rest.len() - 1];
-		let weight_sum = previous.weight_sum();
-
-		// Add a leaf and increment the count.
-		if let Some(weight) = self.last_count_frequency(last_count) {
-			if weight < weight_sum {
-				current.lookahead1 = self.nodes.alloc(Node {
-					weight,
-					count: last_count + 1,
-					tail: current.lookahead0.tail.clone(),
-				});
-				return;
-			}
-		}
+	let previous = rest[rest.len() - 1];
+	let weight_sum = previous.weight_sum();
 
-		// Update the tail.
-		current.lookahead1 = self.nodes.alloc(Node {
-			weight: weight_sum,
-			count: last_count,
-			tail: Cell::new(Some(previous.lookahead1)),
+	// Add a leaf and increment the count.
+	if last_count < leaves.len() && leaves[last_count].frequency.get() < weight_sum {
+		current.lookahead1 = nodes.alloc(Node {
+			weight: leaves[last_count].frequency.get(),
+			count: last_count + 1,
+			tail: current.lookahead0.tail.clone(),
 		});
-
-		// Replace the used-up lookahead chains.
-		self.boundary_pm(rest);
-		self.boundary_pm(rest);
+		return Ok(());
 	}
 
-	/// # Final Package-Merge Step.
-	fn boundary_pm_final(&mut self, lists: &mut [List<'a>]) {
-		let [_rest @ .., previous, current] = lists else { unreachable!(); };
+	// Update the tail.
+	current.lookahead1 = nodes.alloc(Node {
+		weight: weight_sum,
+		count: last_count,
+		tail: Cell::new(Some(previous.lookahead1)),
+	});
 
-		let last_count = current.lookahead1.count;
-		let weight_sum = previous.weight_sum();
-		if last_count < self.leaves.len() && weight_sum > self.leaves[last_count].frequency {
-			current.lookahead1 = self.nodes.alloc(Node {
-				weight: 0,
-				count: last_count + 1,
-				tail: current.lookahead1.tail.clone(),
-			});
-		}
-		else {
-			current.lookahead1.tail.set(Some(previous.lookahead1));
-		}
+	// Replace the used-up lookahead chains by recursing twice.
+	llcl_boundary_pm(leaves, rest, nodes)?;
+	llcl_boundary_pm(leaves, rest, nodes)
+}
+
+/// # Finish and Write Code Lengths!
+///
+/// Add the final chain to the list, then write the weighted counts to the
+/// bitlengths.
+fn llcl_finish<'a>(
+	leaves: &'a [Leaf<'a>],
+	lists: &'a mut [List<'a>],
+	nodes: &'a Bump,
+) -> Result<(), ZopfliError> {
+	// This won't fail; we'll always have at least two lists.
+	let [_rest @ .., list_y, list_z] = lists else { return Err(zopfli_error!()); };
+
+	// Add one more chain or update the tail.
+	let last_count = list_z.lookahead1.count;
+	let weight_sum = list_y.weight_sum();
+	if last_count < leaves.len() && leaves[last_count].frequency.get() < weight_sum {
+		list_z.lookahead1 = nodes.alloc(Node {
+			weight: 0,
+			count: last_count + 1,
+			tail: list_z.lookahead1.tail.clone(),
+		});
+	}
+	else {
+		list_z.lookahead1.tail.set(Some(list_y.lookahead1));
 	}
 
-	/// # Extract/Write Bit Lengths.
-	///
-	/// Copy the bit lengths from the last chain of the last list.
-	fn write_bit_lengths(&mut self, mut node: &'a Node<'a>) {
-		let mut val = node.count;
-		let mut value = 1;
-		while let Some(tail) = node.tail.get() {
-			if val > tail.count {
-				for leaf in &mut self.leaves[tail.count..val] {
-					*leaf.bitlength = value;
-				}
-				val = tail.count;
+	// Write the changes!
+	let mut node = list_z.lookahead1;
+	let mut last_count = node.count;
+
+	// But make sure we counted correctly first!
+	if leaves.len() < last_count { return Err(zopfli_error!()); }
+
+	// Okay, now we can write them!
+	let mut writer = leaves.iter().take(last_count).rev();
+	let mut value = 1;
+	while let Some(tail) = node.tail.get() {
+		// Wait for a change in counts to write the values.
+		if tail.count < last_count {
+			for leaf in writer.by_ref().take(last_count - tail.count) {
+				leaf.bitlength.set(value);
 			}
-			value += 1;
-			node = tail;
-		}
-		for leaf in &mut self.leaves[..val] {
-			*leaf.bitlength = value;
+			last_count = tail.count;
 		}
+		value += 1;
+		node = tail;
 	}
+
+	// Write the final value to any remaining leaves.
+	for leaf in writer { leaf.bitlength.set(value); }
+	Ok(())
 }
 
 
@@ -306,7 +319,7 @@ mod tests {
 	fn t_kat3() {
 		let f = [1, 1, 5, 7, 10, 14];
 		let mut b = [0; 6];
-		assert!(zopfli_length_limited_code_lengths::<3, 6>(&f, &mut b).is_ok());
+		assert!(length_limited_code_lengths::<3, 6>(&f, &mut b).is_ok());
 		assert_eq!(b, [3, 3, 3, 3, 2, 2]);
 	}
 
@@ -314,7 +327,7 @@ mod tests {
 	fn t_kat4() {
 		let f = [1, 1, 5, 7, 10, 14];
 		let mut b = [0; 6];
-		assert!(zopfli_length_limited_code_lengths::<4, 6>(&f, &mut b).is_ok());
+		assert!(length_limited_code_lengths::<4, 6>(&f, &mut b).is_ok());
 		assert_eq!(b, [4, 4, 3, 2, 2, 2]);
 	}
 
@@ -322,7 +335,7 @@ mod tests {
 	fn t_kat7() {
 		let f = [252, 0, 1, 6, 9, 10, 6, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
 		let mut b = [0; 19];
-		assert!(zopfli_length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
+		assert!(length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
 		assert_eq!(b, [1, 0, 6, 4, 3, 3, 3, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
 	}
 
@@ -333,7 +346,7 @@ mod tests {
 			23, 15, 17, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 		];
 		let mut b = [0; 32];
-		assert!(zopfli_length_limited_code_lengths::<15, 32>(&f, &mut b).is_ok());
+		assert!(length_limited_code_lengths::<15, 32>(&f, &mut b).is_ok());
 		assert_eq!(
 			b,
 			[
@@ -347,18 +360,20 @@ mod tests {
 	fn t_kat_limited() {
 		// No frequencies.
 		let mut f = [0; 19];
-		let mut b = [5; 19]; // Also test the bits get reset correctly.
-		assert!(zopfli_length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
+		let mut b = [0; 19];
+		assert!(length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
 		assert_eq!(b, [0; 19]);
 
 		// One frequency.
 		f[2] = 10;
-		assert!(zopfli_length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
+		b.fill(0);
+		assert!(length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
 		assert_eq!(b, [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
 
 		// Two frequencies.
 		f[0] = 248;
-		assert!(zopfli_length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
+		b.fill(0);
+		assert!(length_limited_code_lengths::<7, 19>(&f, &mut b).is_ok());
 		assert_eq!(b, [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
 	}
 }
diff --git a/src/image/zopflipng/mod.rs b/src/image/zopflipng/mod.rs
index a077f63..02f2b7d 100644
--- a/src/image/zopflipng/mod.rs
+++ b/src/image/zopflipng/mod.rs
@@ -34,7 +34,7 @@ use error::{
 };
 pub(crate) use hash::ZopfliState;
 use lz77::LZ77Store;
-use kat::zopfli_length_limited_code_lengths;
+use kat::length_limited_code_lengths;
 use super::{
 	ffi::EncodedImage,
 	lodepng::{

From ffd0536c145729863b19cb7c036460d00a368976 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Tue, 23 Apr 2024 20:01:19 -0700
Subject: [PATCH 29/31] bump: oxipng 9.1.1

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index f391913..2e61cb6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -124,7 +124,7 @@ version = "=2.1.0"
 features = [ "jpegtran" ]
 
 [dependencies.oxipng]
-version = "=9.0.0"
+version = "=9.1.1"
 default-features = false
 
 [build-dependencies]

From 6e3981ea4157a233cf52229230ec08a1ffad13c2 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Tue, 23 Apr 2024 22:50:12 -0700
Subject: [PATCH 30/31] bump: 3.0.1

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 2e61cb6..e8e2ef4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "flaca"
-version = "3.0.0"
+version = "3.0.1"
 license = "WTFPL"
 authors = ["Josh Stoik <josh@blobfolio.com>"]
 edition = "2021"

From 6fa7971431845e4163cee6cf19447a4c52bd6594 Mon Sep 17 00:00:00 2001
From: Josh Stoik <josh@blobfolio.com>
Date: Tue, 23 Apr 2024 22:51:09 -0700
Subject: [PATCH 31/31] build: 3.0.1

---
 CREDITS.md                     | 6 +++---
 release/completions/flaca.bash | 1 +
 release/man/flaca.1            | 7 +++++--
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/CREDITS.md b/CREDITS.md
index 7ed1606..a487f90 100644
--- a/CREDITS.md
+++ b/CREDITS.md
@@ -1,7 +1,7 @@
 # Project Dependencies
     Package:   flaca
-    Version:   3.0.0
-    Generated: 2024-04-13 07:52:07 UTC
+    Version:   3.0.1
+    Generated: 2024-04-24 05:50:28 UTC
 
 | Package | Version | Author(s) | License |
 | ---- | ---- | ---- | ---- |
@@ -30,7 +30,7 @@
 | [libdeflater](https://github.com/adamkewley/libdeflater) | 1.20.0 | [Adam Kewley](mailto:contact@adamkewley.com) | Apache-2.0 |
 | [log](https://github.com/rust-lang/log) | 0.4.21 | The Rust Project Developers | Apache-2.0 or MIT |
 | [mozjpeg-sys](https://github.com/kornelski/mozjpeg-sys.git) | 2.1.0 | [Kornel](mailto:kornel@geekhood.net) | IJG AND Zlib AND BSD-3-Clause |
-| [oxipng](https://github.com/shssoichiro/oxipng) | 9.0.0 | [Joshua Holmer](mailto:jholmer.in@gmail.com) | MIT |
+| [oxipng](https://github.com/shssoichiro/oxipng) | 9.1.1 | [Joshua Holmer](mailto:jholmer.in@gmail.com) | MIT |
 | [radium](https://github.com/bitvecto-rs/radium) | 0.7.0 | [Nika Layzell](mailto:nika@thelayzells.com) and [myrrlyn](mailto:self@myrrlyn.dev) | MIT |
 | [rayon](https://github.com/rayon-rs/rayon) | 1.10.0 | [Niko Matsakis](mailto:niko@alum.mit.edu) and [Josh Stone](mailto:cuviper@gmail.com) | Apache-2.0 or MIT |
 | [rayon-core](https://github.com/rayon-rs/rayon) | 1.12.1 | [Niko Matsakis](mailto:niko@alum.mit.edu) and [Josh Stone](mailto:cuviper@gmail.com) | Apache-2.0 or MIT |
diff --git a/release/completions/flaca.bash b/release/completions/flaca.bash
index 541e14f..7917b63 100644
--- a/release/completions/flaca.bash
+++ b/release/completions/flaca.bash
@@ -18,6 +18,7 @@ _basher___flaca() {
 		opts+=("-V")
 		opts+=("--version")
 	fi
+	[[ " ${COMP_LINE} " =~ " -j " ]] || opts+=("-j")
 	if [[ ! " ${COMP_LINE} " =~ " -l " ]] && [[ ! " ${COMP_LINE} " =~ " --list " ]]; then
 		opts+=("-l")
 		opts+=("--list")
diff --git a/release/man/flaca.1 b/release/man/flaca.1
index fb36f26..d443b63 100644
--- a/release/man/flaca.1
+++ b/release/man/flaca.1
@@ -1,6 +1,6 @@
-.TH "FLACA" "1" "April 2024" "Flaca v3.0.0" "User Commands"
+.TH "FLACA" "1" "April 2024" "Flaca v3.0.1" "User Commands"
 .SH NAME
-Flaca \- Manual page for flaca v3.0.0.
+Flaca \- Manual page for flaca v3.0.1.
 .SH DESCRIPTION
 Brute\-force, lossless JPEG and PNG compression.
 .SS USAGE:
@@ -24,6 +24,9 @@ Show progress bar while minifying.
 Print version information and exit.
 .SS OPTIONS:
 .TP
+\fB\-j\fR <NUM>
+Limit parallelization to this many threads (instead of giving each logical core its own image to work on). If negative, the value will be subtracted from the total number of logical cores.
+.TP
 \fB\-l\fR, \fB\-\-list\fR <FILE>
 Read (absolute) image and/or directory paths from this text file — or STDIN if '\-' — one entry per line, instead of or in addition to (actually trailing) <PATH(S)>.
 .TP