astcenc: Update to 4.4.0

> The 4.4.0 release is a minor release with image quality improvements, a small performance boost, a few new quality-of-life features, and a few minor fixes for uncommon build configurations.

https://github.com/ARM-software/astc-encoder/releases/tag/4.4.0
godotengine · May 11, 2023 · 5a3f955 · 5a3f955
1 parent fd4a06c
commit 5a3f955
Show file tree

Hide file tree

Showing 15 changed files with 659 additions and 953 deletions.
diff --git a/modules/astcenc/SCsub b/modules/astcenc/SCsub
@@ -29,7 +29,6 @@ thirdparty_sources = [
     "astcenc_partition_tables.cpp",
     "astcenc_percentile_tables.cpp",
     "astcenc_pick_best_endpoint_format.cpp",
-    "astcenc_platform_isa_detection.cpp",
     "astcenc_quantization.cpp",
     "astcenc_symbolic_physical.cpp",
     "astcenc_weight_align.cpp",

diff --git a/thirdparty/README.md b/thirdparty/README.md
@@ -20,7 +20,7 @@ Files extracted from upstream source:
 ## astcenc
 
 - Upstream: https://github.com/ARM-software/astc-encoder
-- Version: 4.3.0 (ec83dda79fcefe07f69cdae7ed980d169bf2c4d4, 2023)
+- Version: 4.4.0 (5a5b5a1ef60dd47c27c28c66c118d22c40e3197e, 2023)
 - License: Apache 2.0
 
 Files extracted from upstream source:

diff --git a/thirdparty/astcenc/astcenc.h b/thirdparty/astcenc/astcenc.h
@@ -43,6 +43,14 @@
  *       for faster processing. The caller is responsible for creating the worker threads, and
  *       synchronizing between images.
  *
+ * Extended instruction set support
+ * ================================
+ *
+ * This library supports use of extended instruction sets, such as SSE4.1 and AVX2. These are
+ * enabled at compile time when building the library. There is no runtime checking in the core
+ * library that the instruction sets used are actually available. Checking compatibility is the
+ * responsibility of the calling code.
+ *
  * Threading
  * =========
  *
@@ -191,8 +199,6 @@ enum astcenc_error {
 	ASTCENC_ERR_OUT_OF_MEM,
 	/** @brief The call failed due to the build using fast math. */
 	ASTCENC_ERR_BAD_CPU_FLOAT,
-	/** @brief The call failed due to the build using an unsupported ISA. */
-	ASTCENC_ERR_BAD_CPU_ISA,
 	/** @brief The call failed due to an out-of-spec parameter. */
 	ASTCENC_ERR_BAD_PARAM,
 	/** @brief The call failed due to an out-of-spec block size. */
@@ -472,7 +478,7 @@ struct astcenc_config
 	/**
 	 * @brief The number of trial candidates per mode search (-candidatelimit).
 	 *
-	 * Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES (default 4).
+	 * Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES.
 	 */
 	unsigned int tune_candidate_limit;
 
@@ -520,21 +526,21 @@ struct astcenc_config
 	 *
 	 * This option is further scaled for normal maps, so it skips less often.
 	 */
-	float tune_2_partition_early_out_limit_factor;
+	float tune_2partition_early_out_limit_factor;
 
 	/**
 	 * @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
 	 *
 	 * This option is further scaled for normal maps, so it skips less often.
 	 */
-	float tune_3_partition_early_out_limit_factor;
+	float tune_3partition_early_out_limit_factor;
 
 	/**
 	 * @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
 	 *
 	 * This option is ineffective for normal maps.
 	 */
-	float tune_2_plane_early_out_limit_correlation;
+	float tune_2plane_early_out_limit_correlation;
 
 #if defined(ASTCENC_DIAGNOSTICS)
 	/**

diff --git a/thirdparty/astcenc/astcenc_averages_and_directions.cpp b/thirdparty/astcenc/astcenc_averages_and_directions.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2023 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -725,8 +725,7 @@ void compute_error_squared_rgba(
 	const image_block& blk,
 	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
 	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
-	float uncor_lengths[BLOCK_MAX_PARTITIONS],
-	float samec_lengths[BLOCK_MAX_PARTITIONS],
+	float line_lengths[BLOCK_MAX_PARTITIONS],
 	float& uncor_error,
 	float& samec_error
 ) {
@@ -740,12 +739,6 @@ void compute_error_squared_rgba(
 	{
 		const uint8_t *texel_indexes = pi.texels_of_partition[partition];
 
-		float uncor_loparam = 1e10f;
-		float uncor_hiparam = -1e10f;
-
-		float samec_loparam = 1e10f;
-		float samec_hiparam = -1e10f;
-
 		processed_line4 l_uncor = uncor_plines[partition];
 		processed_line4 l_samec = samec_plines[partition];
 
@@ -773,9 +766,6 @@ void compute_error_squared_rgba(
 		vfloat uncor_loparamv(1e10f);
 		vfloat uncor_hiparamv(-1e10f);
 
-		vfloat samec_loparamv(1e10f);
-		vfloat samec_hiparamv(-1e10f);
-
 		vfloat ew_r(blk.channel_weight.lane<0>());
 		vfloat ew_g(blk.channel_weight.lane<1>());
 		vfloat ew_b(blk.channel_weight.lane<2>());
@@ -825,9 +815,6 @@ void compute_error_squared_rgba(
 			                   + (data_b * l_samec_bs2)
 			                   + (data_a * l_samec_bs3);
 
-			samec_loparamv = min(samec_param, samec_loparamv);
-			samec_hiparamv = max(samec_param, samec_hiparamv);
-
 			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
 			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
 			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
@@ -843,18 +830,9 @@ void compute_error_squared_rgba(
 			lane_ids += vint(ASTCENC_SIMD_WIDTH);
 		}
 
-		uncor_loparam = hmin_s(uncor_loparamv);
-		uncor_hiparam = hmax_s(uncor_hiparamv);
-
-		samec_loparam = hmin_s(samec_loparamv);
-		samec_hiparam = hmax_s(samec_hiparamv);
-
-		float uncor_linelen = uncor_hiparam - uncor_loparam;
-		float samec_linelen = samec_hiparam - samec_loparam;
-
 		// Turn very small numbers and NaNs into a small number
-		uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
-		samec_lengths[partition] = astc::max(samec_linelen, 1e-7f);
+		float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
+		line_lengths[partition] = astc::max(uncor_linelen, 1e-7f);
 	}
 
 	uncor_error = hadd_s(uncor_errorsumv);
@@ -882,19 +860,9 @@ void compute_error_squared_rgb(
 		unsigned int texel_count = pi.partition_texel_count[partition];
 		promise(texel_count > 0);
 
-		float uncor_loparam = 1e10f;
-		float uncor_hiparam = -1e10f;
-
-		float samec_loparam = 1e10f;
-		float samec_hiparam = -1e10f;
-
 		processed_line3 l_uncor = pl.uncor_pline;
 		processed_line3 l_samec = pl.samec_pline;
 
-		// This implementation is an example vectorization of this function.
-		// It works for - the codec is a 2-4% faster than not vectorizing - but
-		// the benefit is limited by the use of gathers and register pressure
-
 		// Vectorize some useful scalar inputs
 		vfloat l_uncor_bs0(l_uncor.bs.lane<0>());
 		vfloat l_uncor_bs1(l_uncor.bs.lane<1>());
@@ -913,9 +881,6 @@ void compute_error_squared_rgb(
 		vfloat uncor_loparamv(1e10f);
 		vfloat uncor_hiparamv(-1e10f);
 
-		vfloat samec_loparamv(1e10f);
-		vfloat samec_hiparamv(-1e10f);
-
 		vfloat ew_r(blk.channel_weight.lane<0>());
 		vfloat ew_g(blk.channel_weight.lane<1>());
 		vfloat ew_b(blk.channel_weight.lane<2>());
@@ -958,9 +923,6 @@ void compute_error_squared_rgb(
 			                   + (data_g * l_samec_bs1)
 			                   + (data_b * l_samec_bs2);
 
-			samec_loparamv = min(samec_param, samec_loparamv);
-			samec_hiparamv = max(samec_param, samec_hiparamv);
-
 			vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r;
 			vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g;
 			vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b;
@@ -974,18 +936,9 @@ void compute_error_squared_rgb(
 			lane_ids += vint(ASTCENC_SIMD_WIDTH);
 		}
 
-		uncor_loparam = hmin_s(uncor_loparamv);
-		uncor_hiparam = hmax_s(uncor_hiparamv);
-
-		samec_loparam = hmin_s(samec_loparamv);
-		samec_hiparam = hmax_s(samec_hiparamv);
-
-		float uncor_linelen = uncor_hiparam - uncor_loparam;
-		float samec_linelen = samec_hiparam - samec_loparam;
-
 		// Turn very small numbers and NaNs into a small number
-		pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f);
-		pl.samec_line_len = astc::max(samec_linelen, 1e-7f);
+		float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv);
+		pl.line_length = astc::max(uncor_linelen, 1e-7f);
 	}
 
 	uncor_error = hadd_s(uncor_errorsumv);

diff --git a/thirdparty/astcenc/astcenc_block_sizes.cpp b/thirdparty/astcenc/astcenc_block_sizes.cpp
@@ -776,8 +776,8 @@ static void construct_dt_entry_2d(
 	assert(maxprec_1plane >= 0 || maxprec_2planes >= 0);
 	bsd.decimation_modes[index].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
 	bsd.decimation_modes[index].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
-	bsd.decimation_modes[index].refprec_1_plane = 0;
-	bsd.decimation_modes[index].refprec_2_planes = 0;
+	bsd.decimation_modes[index].refprec_1plane = 0;
+	bsd.decimation_modes[index].refprec_2planes = 0;
 }
 
 /**
@@ -934,11 +934,11 @@ static void construct_block_size_descriptor_2d(
 
 			if (is_dual_plane)
 			{
-				dm.set_ref_2_plane(bm.get_weight_quant_mode());
+				dm.set_ref_2plane(bm.get_weight_quant_mode());
 			}
 			else
 			{
-				dm.set_ref_1_plane(bm.get_weight_quant_mode());
+				dm.set_ref_1plane(bm.get_weight_quant_mode());
 			}
 
 			bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_bm_idx);
@@ -969,8 +969,8 @@ static void construct_block_size_descriptor_2d(
 	{
 		bsd.decimation_modes[i].maxprec_1plane = -1;
 		bsd.decimation_modes[i].maxprec_2planes = -1;
-		bsd.decimation_modes[i].refprec_1_plane = 0;
-		bsd.decimation_modes[i].refprec_2_planes = 0;
+		bsd.decimation_modes[i].refprec_1plane = 0;
+		bsd.decimation_modes[i].refprec_2planes = 0;
 	}
 
 	// Determine the texels to use for kmeans clustering.
@@ -1055,8 +1055,8 @@ static void construct_block_size_descriptor_3d(
 
 				bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast<int8_t>(maxprec_1plane);
 				bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast<int8_t>(maxprec_2planes);
-				bsd.decimation_modes[decimation_mode_count].refprec_1_plane = maxprec_1plane == -1 ? 0 : 0xFFFF;
-				bsd.decimation_modes[decimation_mode_count].refprec_2_planes = maxprec_2planes == -1 ? 0 : 0xFFFF;
+				bsd.decimation_modes[decimation_mode_count].refprec_1plane = maxprec_1plane == -1 ? 0 : 0xFFFF;
+				bsd.decimation_modes[decimation_mode_count].refprec_2planes = maxprec_2planes == -1 ? 0 : 0xFFFF;
 				decimation_mode_count++;
 			}
 		}
@@ -1067,8 +1067,8 @@ static void construct_block_size_descriptor_3d(
 	{
 		bsd.decimation_modes[i].maxprec_1plane = -1;
 		bsd.decimation_modes[i].maxprec_2planes = -1;
-		bsd.decimation_modes[i].refprec_1_plane = 0;
-		bsd.decimation_modes[i].refprec_2_planes = 0;
+		bsd.decimation_modes[i].refprec_1plane = 0;
+		bsd.decimation_modes[i].refprec_2planes = 0;
 	}
 
 	bsd.decimation_mode_count_always = 0; // Skipped for 3D modes