From 4133894ec138ce2bd26d0fb7cc53f78aa20c0e97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B4=E7=B4=A0=E8=B4=9D=E5=8F=B6=E6=96=AF?= Date: Tue, 27 Aug 2024 07:28:33 +0000 Subject: [PATCH 1/3] astc-encoder optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 朴素贝叶斯 --- Source/astcenc_compress_symbolic.cpp | 213 +++++++++++++++++- Source/astcenc_find_best_partitioning.cpp | 14 ++ .../astcenc_ideal_endpoints_and_weights.cpp | 75 ++++-- Source/astcenc_internal.h | 4 + Source/astcenc_pick_best_endpoint_format.cpp | 129 +++++++++-- Source/astcenc_vecmathlib_common_4.h | 16 ++ Source/astcenc_vecmathlib_neon_4.h | 51 ++++- Source/astcenc_weight_align.cpp | 105 ++++++++- 8 files changed, 568 insertions(+), 39 deletions(-) diff --git a/Source/astcenc_compress_symbolic.cpp b/Source/astcenc_compress_symbolic.cpp index df55d05..6636109 100644 --- a/Source/astcenc_compress_symbolic.cpp +++ b/Source/astcenc_compress_symbolic.cpp @@ -69,6 +69,214 @@ static void merge_endpoints( * @param blk The image block color data to compress. * @param[out] scb The symbolic compressed block output. 
*/ +#if ASTCENC_NEON != 0 +static bool realign_weights_undecimated( + astcenc_profile decode_mode, + const block_size_descriptor& bsd, + const image_block& blk, + symbolic_compressed_block& scb +) { + // Get the partition descriptor + unsigned int partition_count = scb.partition_count; + const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); + + // Get the quantization table + const block_mode& bm = bsd.get_block_mode(scb.block_mode); + unsigned int weight_quant_level = bm.quant_mode; + const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; + + unsigned int max_plane = bm.is_dual_plane; + int plane2_component = scb.plane2_component; + vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); + + // Decode the color endpoints + bool rgb_hdr; + bool alpha_hdr; + vint4 endpnt0[BLOCK_MAX_PARTITIONS]; + vint4 endpnt1[BLOCK_MAX_PARTITIONS]; + vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; + vfloat4 offset[BLOCK_MAX_PARTITIONS]; + + promise(partition_count > 0); + + for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) + { + unpack_color_endpoints(decode_mode, + scb.color_formats[pa_idx], + scb.color_values[pa_idx], + rgb_hdr, alpha_hdr, + endpnt0[pa_idx], + endpnt1[pa_idx]); + } + + uint8_t* dec_weights_uquant = scb.weights; + bool adjustments = false; + + // For each plane and partition ... 
+ for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) + { + for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) + { + // Compute the endpoint delta for all components in current plane + vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; + epd = select(epd, vint4::zero(), plane_mask); + + endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); + offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); + } + + // For each weight compute previous, current, and next errors + promise(bsd.texel_count > 0); + + unsigned int texel = 0; + for (; texel + ASTCENC_SIMD_WIDTH <= bsd.texel_count; texel += ASTCENC_SIMD_WIDTH) + { + int uqw0 = dec_weights_uquant[texel]; + int uqw1 = dec_weights_uquant[texel + 1]; + int uqw2 = dec_weights_uquant[texel + 2]; + int uqw3 = dec_weights_uquant[texel + 3]; + + vint4 uqw_vec = vint4(uqw0, uqw1, uqw2, uqw3); + vint4 prev_and_next_vec = vint4(qat.prev_next_values[uqw0], qat.prev_next_values[uqw1], + qat.prev_next_values[uqw2], qat.prev_next_values[uqw3]); + + vint4 mask = vint4(0xFF, 0xFF, 0xFF, 0xFF); + vint4 uqw_down_vec = prev_and_next_vec & mask; + vint4 uqw_up_vec = vint4(vshrq_n_s32(prev_and_next_vec.m, 8)) & mask; + + vfloat4 weight_base_vec = int_to_float(uqw_vec); + vfloat4 weight_down_vec = int_to_float(uqw_down_vec) - weight_base_vec; + vfloat4 weight_up_vec = int_to_float(uqw_up_vec) - weight_base_vec; + + unsigned int partition0 = pi.partition_of_texel[texel]; + unsigned int partition1 = pi.partition_of_texel[texel + 1]; + unsigned int partition2 = pi.partition_of_texel[texel + 2]; + unsigned int partition3 = pi.partition_of_texel[texel + 3]; + + vfloat4 color_offset0 = offset[partition0]; + vfloat4 color_offset1 = offset[partition1]; + vfloat4 color_offset2 = offset[partition2]; + vfloat4 color_offset3 = offset[partition3]; + + vfloat4 color_base0 = endpnt0f[partition0]; + vfloat4 color_base1 = endpnt0f[partition1]; + vfloat4 color_base2 = endpnt0f[partition2]; + vfloat4 color_base3 = endpnt0f[partition3]; + + vfloat4 
color0 = color_base0 + color_offset0 * weight_base_vec.lane<0>(); + vfloat4 color1 = color_base1 + color_offset1 * weight_base_vec.lane<1>(); + vfloat4 color2 = color_base2 + color_offset2 * weight_base_vec.lane<2>(); + vfloat4 color3 = color_base3 + color_offset3 * weight_base_vec.lane<3>(); + + vfloat4 orig_color0 = blk.texel(texel); + vfloat4 orig_color1 = blk.texel(texel + 1); + vfloat4 orig_color2 = blk.texel(texel + 2); + vfloat4 orig_color3 = blk.texel(texel + 3); + + vfloat4 error_weight = blk.channel_weight; + + vfloat4 color_diff0 = color0 - orig_color0; + vfloat4 color_diff1 = color1 - orig_color1; + vfloat4 color_diff2 = color2 - orig_color2; + vfloat4 color_diff3 = color3 - orig_color3; + + vfloat4 color_diff_down0 = color_diff0 + color_offset0 * weight_down_vec.lane<0>(); + vfloat4 color_diff_down1 = color_diff1 + color_offset1 * weight_down_vec.lane<1>(); + vfloat4 color_diff_down2 = color_diff2 + color_offset2 * weight_down_vec.lane<2>(); + vfloat4 color_diff_down3 = color_diff3 + color_offset3 * weight_down_vec.lane<3>(); + + vfloat4 color_diff_up0 = color_diff0 + color_offset0 * weight_up_vec.lane<0>(); + vfloat4 color_diff_up1 = color_diff1 + color_offset1 * weight_up_vec.lane<1>(); + vfloat4 color_diff_up2 = color_diff2 + color_offset2 * weight_up_vec.lane<2>(); + vfloat4 color_diff_up3 = color_diff3 + color_offset3 * weight_up_vec.lane<3>(); + + float error_base0 = dot_s(color_diff0 * color_diff0, error_weight); + float error_base1 = dot_s(color_diff1 * color_diff1, error_weight); + float error_base2 = dot_s(color_diff2 * color_diff2, error_weight); + float error_base3 = dot_s(color_diff3 * color_diff3, error_weight); + + float error_down0 = dot_s(color_diff_down0 * color_diff_down0, error_weight); + float error_down1 = dot_s(color_diff_down1 * color_diff_down1, error_weight); + float error_down2 = dot_s(color_diff_down2 * color_diff_down2, error_weight); + float error_down3 = dot_s(color_diff_down3 * color_diff_down3, error_weight); + + float 
error_up0 = dot_s(color_diff_up0 * color_diff_up0, error_weight); + float error_up1 = dot_s(color_diff_up1 * color_diff_up1, error_weight); + float error_up2 = dot_s(color_diff_up2 * color_diff_up2, error_weight); + float error_up3 = dot_s(color_diff_up3 * color_diff_up3, error_weight); + + vfloat4 error_base_vec = vfloat4(error_base0, error_base1, error_base2, error_base3); + vfloat4 error_down_vec = vfloat4(error_down0, error_down1, error_down2, error_down3); + vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3); + + vmask4 check_result_up = (error_up_vec < error_base_vec) & + (error_up_vec < error_down_vec) & (uqw_vec < vin4(64)); + + vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero()); + check_result_down = check_result_down & (~check_result_up); + + if (popcount(check_result_up | check_result_down) != 0) + { + uqw_vec = select(uqw_vec, uqw_up_vec, check_result_up); + uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down); + + dec_weights_uquant[texel] = uqw.lane<0>(); + dec_weights_uquant[texel + 1] = uqw.lane<1>(); + dec_weights_uquant[texel + 2] = uqw.lane<2>(); + dec_weights_uquant[texel + 3] = uqw.lane<3>(); + adjustments = true; + } + }; + + for (; texel < bsd.texel_count; texel++) + { + int uqw = dec_weights_uquant[texel]; + + uint32_t prev_and_next = qat.prev_next_values[uqw]; + int uqw_down = prev_and_next & 0xFF; + int uqw_up = (prev_and_next >> 8) & 0xFF; + + // Interpolate the colors to create the diffs + float weight_base = static_cast(uqw); + float weight_down = static_cast(uqw_down - uqw); + float weight_up = static_cast(uqw_up - uqw); + + unsigned int partition = pi.partition_of_texel[texel]; + vfloat4 color_offset = offset[partition]; + vfloat4 color_base = endpnt0f[partition]; + + vfloat4 color = color_base + color_offset * weight_base; + vfloat4 orig_color = blk.texel(texel); + vfloat4 error_weight = blk.channel_weight; + + vfloat4 color_diff = color - orig_color; + vfloat4 
color_diff_down = color_diff + color_offset * weight_down; + vfloat4 color_diff_up = color_diff + color_offset * weight_up; + + float error_base = dot_s(color_diff * color_diff, error_weight); + float error_down = dot_s(color_diff_down * color_diff_down, error_weight); + float error_up = dot_s(color_diff_up * color_diff_up, error_weight); + + // Check if the prev or next error is better, and if so use it + if ((error_up < error_base) && (error_up < error_down) && (uqw < 64)) + { + dec_weights_uquant[texel] = static_cast(uqw_up); + adjustments = true; + } + else if ((error_down < error_base) && (uqw > 0)) + { + dec_weights_uquant[texel] = static_cast(uqw_down); + adjustments = true; + } + } + + // Prepare iteration for plane 2 + dec_weights_uquant += WEIGHTS_PLANE2_OFFSET; + plane_mask = ~plane_mask; + } + return adjustments; +} +#else static bool realign_weights_undecimated( astcenc_profile decode_mode, const block_size_descriptor& bsd, @@ -175,6 +383,7 @@ static bool realign_weights_undecimated( return adjustments; } +#endif /** * @brief Attempt to improve weights given a chosen configuration. 
@@ -423,7 +632,7 @@ static float compress_symbolic_block_for_partition_1plane( // For each mode, use the angular method to compute a shift compute_angular_endpoints_1plane( - only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); + privateProfile, only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); float* weight_low_value = tmpbuf.weight_low_value1; float* weight_high_value = tmpbuf.weight_high_value1; @@ -799,7 +1008,7 @@ static float compress_symbolic_block_for_partition_2planes( float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask)); compute_angular_endpoints_2planes( - bsd, dec_weights_ideal, max_weight_quant, tmpbuf); + privateProfile, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); // For each mode (which specifies a decimation and a quantization): // * Compute number of bits needed for the quantized weights diff --git a/Source/astcenc_find_best_partitioning.cpp b/Source/astcenc_find_best_partitioning.cpp index bfbcc35..5d0682e 100644 --- a/Source/astcenc_find_best_partitioning.cpp +++ b/Source/astcenc_find_best_partitioning.cpp @@ -250,6 +250,19 @@ static void kmeans_update( * * @return The number of bit mismatches. */ +#if ASTCENC_NEON != 0 +static inline uint8_t partition_mismatch2( + const uint64_t a[2], + const uint64_t b[2] +) { + uint64x2_t a01 = vld1q_u64(a); + uint64x2_t b01 = vld1q_u64(b); + uint64x2_t b10 = vextq_u64(b01, b01, 1); + uint8_t c1 = popcount(veorq_u64(a01, b01)); + uint8_t c2 = popcount(veorq_u64(a01, b10)); + return static_cast(astc::min(c1, c2) / 2); +} +#else static inline uint8_t partition_mismatch2( const uint64_t a[2], const uint64_t b[2] @@ -261,6 +274,7 @@ static inline uint8_t partition_mismatch2( // in the expected position, and again when present in the wrong partition return static_cast(astc::min(v1, v2) / 2); } +#endif /** * @brief Compute bit-mismatch for partitioning in 3-partition mode. 
diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index 051782f..2a17899 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -364,7 +364,7 @@ static void compute_ideal_colors_and_weights_3_comp( unsigned int texel_count = blk.texel_count; promise(texel_count > 0); - partition_metrics pms[BLOCK_MAX_PARTITIONS]; + partition_metrics *pms = (partition_metrics *)&blk.pms[0]; float error_weight; const float* data_vr = nullptr; @@ -372,7 +372,7 @@ static void compute_ideal_colors_and_weights_3_comp( const float* data_vb = nullptr; if (omitted_component == 0) { - error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); + error_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>()); data_vr = blk.data_g; data_vg = blk.data_b; data_vb = blk.data_a; @@ -428,7 +428,52 @@ static void compute_ideal_colors_and_weights_3_comp( float highparam { -1e10f }; unsigned int partition_texel_count = pi.partition_texel_count[i]; - for (unsigned int j = 0; j < partition_texel_count; j++) + + vfloat4 lowparam_vec = float4(1e10f, 1e10f, 1e10f, 1e10f); + vfloat4 highparam_vec = float4(-1e10f, -1e10f, -1e10f, -1e10f); + + unsigned int j = 0; + for (; j + ASTCENC_SIMD_WIDTH <= partition_texel_count; j += ASTCENC_SIMD_WIDTH) + { + unsigned int tix0 = pi.texels_of_partition[i][j]; + unsigned int tix1 = pi.texels_of_partition[i][j + 1]; + unsigned int tix2 = pi.texels_of_partition[i][j + 2]; + unsigned int tix3 = pi.texels_of_partition[i][j + 3]; + + vfloat4 points0 = vfloat4(data_vr[tix0], data_vg[tix0], data_vb[tix0], 0.0f); + vfloat4 points1 = vfloat4(data_vr[tix1], data_vg[tix1], data_vb[tix1], 0.0f); + vfloat4 points2 = vfloat4(data_vr[tix2], data_vg[tix2], data_vb[tix2], 0.0f); + vfloat4 points3 = vfloat4(data_vr[tix3], data_vg[tix3], data_vb[tix3], 0.0f); + + vfloat4 sub_v0 = points0 - line.a; + vfloat4 sub_v1 = points1 - line.a; + vfloat4 sub_v2 = points2 - line.a; + 
vfloat4 sub_v3 = points3 - line.a; + + vfloat4 params0 = sub_v0 * line.b; + vfloat4 params1 = sub_v1 * line.b; + vfloat4 params2 = sub_v2 * line.b; + vfloat4 params3 = sub_v3 * line.b; + + float param0 = hadd_rgba_s(params0); + float param1 = hadd_rgba_s(params1); + float param2 = hadd_rgba_s(params2); + float param3 = hadd_rgba_s(params3); + + ei.weights[tix0] = param0; + ei.weights[tix1] = param1; + ei.weights[tix2] = param2; + ei.weights[tix3] = param3; + + vfloat4 params_vec = vfloat4(param0, param1, param2, param3); + lowparam_vec = min(params_vec, lowparam_vec); + highparam_vec = max(params_vec, highparam_vec); + } + + lowparam = hmin_s(vfloat4(lowparam_vec)); + highparam = hmax_s(vfloat4(highparam_vec)); + + for (; j < partition_texel_count; j++) { unsigned int tix = pi.texels_of_partition[i][j]; vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]); @@ -460,7 +505,7 @@ static void compute_ideal_colors_and_weights_3_comp( is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; } - for (unsigned int j = 0; j < partition_texel_count; j++) + for (j = 0; j < partition_texel_count; j++) { unsigned int tix = pi.texels_of_partition[i][j]; float idx = (ei.weights[tix] - lowparam) * scale; @@ -894,7 +939,11 @@ void compute_ideal_weights_for_decimation( for (unsigned int j = 0; j < max_texel_count; j++) { +#ifdef ASTCENC_USE_COMMON_GATHERF + const uint8_t* texel = di.weight_texels_tr[j] + i; +#else vint texel(di.weight_texels_tr[j] + i); +#endif vfloat weight = loada(di.weights_texel_contribs_tr[j] + i); if (!constant_wes) @@ -952,7 +1001,11 @@ void compute_ideal_weights_for_decimation( for (unsigned int j = 0; j < max_texel_count; j++) { +#ifdef ASTCENC_USE_COMMON_GATHERF + const uint8_t* texel = di.weight_texels_tr[j] + i; +#else vint texel(di.weight_texels_tr[j] + i); +#endif vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i); if (!constant_wes) @@ -1041,12 +1094,9 @@ void compute_quantized_weights_for_decimation( vint 
ixli = vtable_8bt_32bi(tab0p, weightl); vint ixhi = vtable_8bt_32bi(tab0p, weighth); - vfloat ixl = int_to_float(ixli); - vfloat ixh = int_to_float(ixhi); - - vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix); + vmask mask = int_to_float(ixli + ixhi) < (vfloat(128.0f) * ix); vint weight = select(ixli, ixhi, mask); - ixl = select(ixl, ixh, mask); + vfloat ixl = int_to_float(weight); // Invert the weight-scaling that was done initially storea(ixl * rscalev + low_boundv, weight_set_out + i); @@ -1075,12 +1125,9 @@ void compute_quantized_weights_for_decimation( vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); - vfloat ixl = int_to_float(ixli); - vfloat ixh = int_to_float(ixhi); - - vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix); + vmask mask = int_to_float(ixli + ixhi) < (vfloat(128.0f) * ix); vint weight = select(ixli, ixhi, mask); - ixl = select(ixl, ixh, mask); + vfloat ixl = int_to_float(weight); // Invert the weight-scaling that was done initially storea(ixl * rscalev + low_boundv, weight_set_out + i); diff --git a/Source/astcenc_internal.h b/Source/astcenc_internal.h index bad1247..ba6a4b9 100644 --- a/Source/astcenc_internal.h +++ b/Source/astcenc_internal.h @@ -763,6 +763,8 @@ struct image_block /** @brief The input (compress) or output (decompress) data for the alpha color component. */ ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS]; + partition_metrics pms[BLOCK_MAX_PARTITIONS]; + /** @brief The number of texels in the block. */ uint8_t texel_count; @@ -2040,6 +2042,7 @@ void prepare_angular_tables(); * @param[out] tmpbuf Preallocated scratch buffers for the compressor. */ void compute_angular_endpoints_1plane( + QualityProfile privateProfile, bool only_always, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, @@ -2055,6 +2058,7 @@ void compute_angular_endpoints_1plane( * @param[out] tmpbuf Preallocated scratch buffers for the compressor. 
*/ void compute_angular_endpoints_2planes( + QualityProfile privateProfile, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, unsigned int max_weight_quant, diff --git a/Source/astcenc_pick_best_endpoint_format.cpp b/Source/astcenc_pick_best_endpoint_format.cpp index dbc12ec..ff4e445 100644 --- a/Source/astcenc_pick_best_endpoint_format.cpp +++ b/Source/astcenc_pick_best_endpoint_format.cpp @@ -120,9 +120,99 @@ static void compute_error_squared_rgb_single_partition( vfloat l_bs1(l_pline.bs.lane<1>()); vfloat l_bs2(l_pline.bs.lane<2>()); - vint lane_ids = vint::lane_id(); - for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + vfloat one_third(1/3.0f, 1/3.0f, 1/3.0f, 1/3.0f); + vfloat uncor_errv0 = vfloat::zero(); + vfloat uncor_errv1 = vfloat::zero(); + vfloat uncor_errv2 = vfloat::zero(); + vfloat samec_errv0 = vfloat::zero(); + vfloat samec_errv1 = vfloat::zero(); + vfloat samec_errv2 = vfloat::zero(); + vfloat rgbl_errv0 = vfloat::zero(); + vfloat rgbl_errv1 = vfloat::zero(); + vfloat rgbl_errv2 = vfloat::zero(); + vfloat l_errv0 = vfloat::zero(); + vfloat l_errv1 = vfloat::zero(); + vfloat l_errv2 = vfloat::zero(); + + unsigned int i = 0; + for (; i + ASTCENC_SIMD_WIDTH <= texel_count; i += ASTCENC_SIMD_WIDTH) { +#ifdef ASTCENC_USE_COMMON_GATHERF + const uint8_t* tix = texel_indexes + i; +#else + vint tix(texel_indexes + i); +#endif + + // Compute the error that arises from just ditching alpha + vfloat data_a = gatherf(blk.data_a, tix); + vfloat alpha_diff = data_a - default_a; + alpha_diff = alpha_diff * alpha_diff; + + haccumulate(a_drop_errv, alpha_diff); + + vfloat data_r = gatherf(blk.data_r, tix); + vfloat data_g = gatherf(blk.data_g, tix); + vfloat data_b = gatherf(blk.data_b, tix); + + vfloat data_rgb_avg = (data_r + data_g + data_b) * one_third; + vfloat data_rgb_0 = data_rgb_avg - data_r; + vfloat data_rgb_1 = data_rgb_avg - data_g; + vfloat data_rgb_2 = data_rgb_avg - data_b; + + // Compute uncorrelated error + 
vfloat param = data_r * uncor_bs0 + + data_g * uncor_bs1 + + data_b * uncor_bs2; + + vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r; + vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g; + vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b; + + haccumulate(uncor_errv0, dist0 * dist0); + haccumulate(uncor_errv1, dist1 * dist1); + haccumulate(uncor_errv2, dist2 * dist2); + + // Compute same chroma error - no "amod", its always zero + param = data_r * samec_bs0 + + data_g * samec_bs1 + + data_b * samec_bs2; + + dist0 = (param * samec_bs0) - data_r; + dist1 = (param * samec_bs1) - data_g; + dist2 = (param * samec_bs2) - data_b; + + haccumulate(samec_errv0, dist0 * dist0); + haccumulate(samec_errv1, dist1 * dist1); + haccumulate(samec_errv2, dist2 * dist2); + + // Compute rgbl error + dist0 = rgbl_amod0 + data_rgb_0; + dist1 = rgbl_amod1 + data_rgb_1; + dist2 = rgbl_amod2 + data_rgb_2; + + haccumulate(rgbl_errv0, dist0 * dist0); + haccumulate(rgbl_errv1, dist1 * dist1); + haccumulate(rgbl_errv2, dist2 * dist2); + + // Compute luma error - no "amod", its always zero + dist0 = data_rgb_0; + dist1 = data_rgb_1; + dist2 = data_rgb_2; + + haccumulate(l_errv0, dist0 * dist0); + haccumulate(l_errv1, dist1 * dist1); + haccumulate(l_errv2, dist2 * dist2); + } + + uncor_errv = uncor_errv0 * ews.lane<0>() + uncor_errv1 * ews.lane<1>() + uncor_errv2 * ews.lane<2>(); + samec_errv = samec_errv0 * ews.lane<0>() + samec_errv1 * ews.lane<1>() + samec_errv2 * ews.lane<2>(); + rgbl_errv = rgbl_errv0 * ews.lane<0>() + rgbl_errv1 * ews.lane<1>() + rgbl_errv2 * ews.lane<2>(); + l_errv = l_errv0 * ews.lane<0>() + l_errv1 * ews.lane<1>() + l_errv2 * ews.lane<2>(); + + + if (i < texel_count) + { + vint lane_ids = vint::lane_id() + i; vint tix(texel_indexes + i); vmask mask = lane_ids < vint(texel_count); @@ -139,6 +229,11 @@ static void compute_error_squared_rgb_single_partition( vfloat data_g = gatherf(blk.data_g, tix); vfloat data_b = gatherf(blk.data_b, tix); + 
vfloat data_rgb_avg = (data_r + data_g + data_b) * one_third; + vfloat data_rgb_0 = data_rgb_avg - data_r; + vfloat data_rgb_1 = data_rgb_avg - data_g; + vfloat data_rgb_2 = data_rgb_avg - data_b; + // Compute uncorrelated error vfloat param = data_r * uncor_bs0 + data_g * uncor_bs1 @@ -170,13 +265,9 @@ static void compute_error_squared_rgb_single_partition( haccumulate(samec_errv, error, mask); // Compute rgbl error - param = data_r * rgbl_bs0 - + data_g * rgbl_bs1 - + data_b * rgbl_bs2; - - dist0 = (rgbl_amod0 + param * rgbl_bs0) - data_r; - dist1 = (rgbl_amod1 + param * rgbl_bs1) - data_g; - dist2 = (rgbl_amod2 + param * rgbl_bs2) - data_b; + dist0 = rgbl_amod0 + data_rgb_0; + dist1 = rgbl_amod1 + data_rgb_1; + dist2 = rgbl_amod2 + data_rgb_2; error = dist0 * dist0 * ews.lane<0>() + dist1 * dist1 * ews.lane<1>() @@ -185,13 +276,9 @@ static void compute_error_squared_rgb_single_partition( haccumulate(rgbl_errv, error, mask); // Compute luma error - no "amod", its always zero - param = data_r * l_bs0 - + data_g * l_bs1 - + data_b * l_bs2; - - dist0 = (param * l_bs0) - data_r; - dist1 = (param * l_bs1) - data_g; - dist2 = (param * l_bs2) - data_b; + dist0 = data_rgb_0; + dist1 = data_rgb_1; + dist2 = data_rgb_2; error = dist0 * dist0 * ews.lane<0>() + dist1 * dist1 * ews.lane<1>() @@ -220,6 +307,7 @@ static void compute_error_squared_rgb_single_partition( * @param[out] eci The resulting encoding choice error metrics. 
*/ static void compute_encoding_choice_errors( + QualityProfile privateProfile, const image_block& blk, const partition_info& pi, const endpoints& ep, @@ -228,9 +316,12 @@ static void compute_encoding_choice_errors( int partition_count = pi.partition_count; promise(partition_count > 0); - partition_metrics pms[BLOCK_MAX_PARTITIONS]; + partition_metrics *pms = (partition_metrics *)&blk.pms[0]; - compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); + if (!blk.is_constant_channel(3) || (partition_count != 1 && privateProfile == HIGH_QUALITY_PROFILE)) + { + compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); + } for (int i = 0; i < partition_count; i++) { @@ -1133,7 +1224,7 @@ unsigned int compute_ideal_endpoint_formats( // Compute the errors that result from various encoding choices (such as using luminance instead // of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on) encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]; - compute_encoding_choice_errors(blk, pi, ep, eci); + compute_encoding_choice_errors(privateProfile, blk, pi, ep, eci); float best_error[BLOCK_MAX_PARTITIONS][21][4]; uint8_t format_of_choice[BLOCK_MAX_PARTITIONS][21][4]; diff --git a/Source/astcenc_vecmathlib_common_4.h b/Source/astcenc_vecmathlib_common_4.h index 1e04367..00d7263 100644 --- a/Source/astcenc_vecmathlib_common_4.h +++ b/Source/astcenc_vecmathlib_common_4.h @@ -287,6 +287,12 @@ ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m) haccumulate(accum, a); } +#define ASTCENC_USE_COMMON_GATHERF +ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, const uint8_t* idx) +{ + return vfloat4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]); +} + /** * @brief Return the horizontal sum of RGB vector lanes as a scalar. */ @@ -295,6 +301,16 @@ ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a) return a.lane<0>() + a.lane<1>() + a.lane<2>(); } +#if !define(ASTCENC_USE_NATIVE_ADDV) +/** + * @brief Return the horizontal sum of a vector. 
+ */ +ASTCENC_SIMD_INLINE float hadd_rgba_s(vfloat4 a) +{ + return a.lane<0>() + a.lane<1>() + a.lane<2>() + a.lane<3>(); +} +#endif + #if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT) /** diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index 42545e7..73efcbc 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -579,6 +579,25 @@ ASTCENC_SIMD_INLINE int hadd_s(vint4 a) return vget_lane_s32(vpadd_s32(t, t), 0); } +/** + * @brief Return the horizontal sum of a vector. + */ +ASTCENC_SIMD_INLINE uint32_t hadd_s(vmask4 a) +{ + // Use add with SIMD versions + return vaddvq_u32(a.m); +} + +#define ASTCENC_USE_NATIVE_ADDV +/** + * @brief Return the horizontal sum of a vector. + */ +ASTCENC_SIMD_INLINE float hadd_rgba_s(vfloat4 a) +{ + // Use add with SIMD versions + return vaddvq_f32(a.m); +} + /** * @brief Store a vector to a 16B aligned memory address. */ @@ -631,10 +650,12 @@ ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices) */ ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) { - alignas(16) uint8_t shuf[16] { - 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + uint8x16_t idx = { + 0, 4, 8, 12, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; - uint8x16_t idx = vld1q_u8(shuf); int8x16_t av = vreinterpretq_s8_s32(a.m); return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx))); } @@ -1096,4 +1117,28 @@ ASTCENC_SIMD_INLINE int popcount(uint64_t v) return static_cast(vaddlv_u8(vcnt_u8(vcreate_u8(v)))); } +/** + * @brief Population bit count. + * + * @param v The value to population count. + * + * @return The number of 1 bits. + */ +ASTCENC_SIMD_INLINE int popcount(uint64x2_t v) +{ + return static_cast(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u64(v)))); +} + +/** + * @brief Population bit count. + * + * @param v The value to population count. + + * @return The number of 1 bits. 
+ */ +ASTCENC_SIMD_INLINE int popcount(vmask4 v) +{ + return static_cast(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u32(v.m)))); +} + #endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED diff --git a/Source/astcenc_weight_align.cpp b/Source/astcenc_weight_align.cpp index 4e993e7..e1c79ba 100644 --- a/Source/astcenc_weight_align.cpp +++ b/Source/astcenc_weight_align.cpp @@ -149,7 +149,9 @@ static void compute_angular_offsets( * @param[out] cut_low_weight_error Per angular step, the low weight cut error. * @param[out] cut_high_weight_error Per angular step, the high weight cut error. */ +#if ASTCENC_NEON != 0 static void compute_lowest_and_highest_weight( + QualityProfile privateProfile, unsigned int weight_count, const float* dec_weight_ideal_value, unsigned int max_angular_steps, @@ -166,6 +168,100 @@ static void compute_lowest_and_highest_weight( vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f); + float max_weight = 1.0f; + float min_weight = 0.0f; + if (privateProfile == HIGH_QUALITY_PROFILE) + { + max_weight = dec_weight_ideal_value[0]; + min_weight = dec_weight_ideal_value[0]; + for (unsigned int j = 1; j < weight_count; j++) + { + float weight = dec_weight_ideal_value[j]; + __asm__ volatile("fmax %s0, %s0, %s1" : "+w"(max_weight) : "w"(weight)); + __asm__ volatile("fmin %s0, %s0, %s1" : "+w"(min_weight) : "w"(weight)); + } + } + + // Arrays are ANGULAR_STEPS long, so always safe to run full vectors + for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH) + { + vfloat errval = vfloat::zero(); + vfloat cut_low_weight_err = vfloat::zero(); + vfloat cut_high_weight_err = vfloat::zero(); + vfloat offset = loada(offsets + sp); + + offset = (vfloat)vnegq_f32(offset.m); + vfloat maxidx = vfloat::zero(); + vfloat minidx = vfloat::zero(); + + if (privateProfile == HIGH_SPEED_PROFILE) + { + maxidx = round((vfloat)vaddq_f32(rcp_stepsize.m, offset.m)); + minidx = round(offset); + } + else + { + maxidx = round((vfloat)vfmaq_n_f32(offset.m, rcp_stepsize.m, 
max_weight)); + minidx = round((vfloat)vfmaq_n_f32(offset.m, rcp_stepsize.m, min_weight)); + } + + for (unsigned int j = 0; j < weight_count; j++) + { + vfloat sval = (vfloat)vfmaq_n_f32(offset.m, rcp_stepsize.m, *(dec_weight_ideal_value + j)); + vfloat svalrte = round(sval); + vfloat diff = sval - svalrte; + errval += diff * diff; + + // Accumulate on min hit + vmask mask = svalrte == minidx; + vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff; + cut_low_weight_err = select(cut_low_weight_err, accum, mask); + + // Accumulate on max hit + mask = svalrte == maxidx; + accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff; + cut_high_weight_err = select(cut_high_weight_err, accum, mask); + } + + // Write out min weight and weight span; clamp span to a usable range + vint span = float_to_int(maxidx - minidx + vfloat(1)); + span = min(span, vint(max_quant_steps + 3)); + span = max(span, vint(2)); + storea(minidx, lowest_weight + sp); + storea(span, weight_span + sp); + + // The cut_(lowest/highest)_weight_error indicate the error that results from forcing + // samples that should have had the weight value one step (up/down). 
+ vfloat ssize = 1.0f / rcp_stepsize; + vfloat errscale = ssize * ssize; + storea(errval * errscale, error + sp); + storea(cut_low_weight_err * errscale, cut_low_weight_error + sp); + storea(cut_high_weight_err * errscale, cut_high_weight_error + sp); + + rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH); + } + +} +#else +static void compute_lowest_and_highest_weight( + QualityProfile privateProfile, + unsigned int weight_count, + const float* dec_weight_ideal_value, + unsigned int max_angular_steps, + unsigned int max_quant_steps, + const float* offsets, + float* lowest_weight, + int* weight_span, + float* error, + float* cut_low_weight_error, + float* cut_high_weight_error +) { + (void) privateProfile; + promise(weight_count > 0); + promise(max_angular_steps > 0); + + vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f); + // Arrays are ANGULAR_STEPS long, so always safe to run full vectors for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH) { @@ -222,6 +318,7 @@ static void compute_lowest_and_highest_weight( rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH); } } +#endif /** * @brief The main function for the angular algorithm. @@ -233,6 +330,7 @@ static void compute_lowest_and_highest_weight( * @param[out] high_value Per angular step, the highest weight value. 
*/ static void compute_angular_endpoints_for_quant_levels( + QualityProfile privateProfile, unsigned int weight_count, const float* dec_weight_ideal_value, unsigned int max_quant_level, @@ -253,7 +351,7 @@ static void compute_angular_endpoints_for_quant_levels( ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS]; ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS]; - compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value, + compute_lowest_and_highest_weight(privateProfile, weight_count, dec_weight_ideal_value, max_angular_steps, max_quant_steps, angular_offsets, lowest_weight, weight_span, error, cut_low_weight_error, cut_high_weight_error); @@ -335,6 +433,7 @@ static void compute_angular_endpoints_for_quant_levels( /* See header for documentation. */ void compute_angular_endpoints_1plane( + QualityProfile privateProfile, bool only_always, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, @@ -372,6 +471,7 @@ void compute_angular_endpoints_1plane( } compute_angular_endpoints_for_quant_levels( + privateProfile, weight_count, dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, max_precision, low_values[i], high_values[i]); @@ -403,6 +503,7 @@ void compute_angular_endpoints_1plane( /* See header for documentation. 
*/ void compute_angular_endpoints_2planes( + QualityProfile privateProfile, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, unsigned int max_weight_quant, @@ -441,11 +542,13 @@ void compute_angular_endpoints_2planes( } compute_angular_endpoints_for_quant_levels( + privateProfile, weight_count, dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, max_precision, low_values1[i], high_values1[i]); compute_angular_endpoints_for_quant_levels( + privateProfile, weight_count, dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, max_precision, low_values2[i], high_values2[i]); -- Gitee From 972e4b4506ce2a07398cdc02d32b656573ac9874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B4=E7=B4=A0=E8=B4=9D=E5=8F=B6=E6=96=AF?= Date: Tue, 27 Aug 2024 09:09:53 +0000 Subject: [PATCH 2/3] astcenc-encoder optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 朴素贝叶斯 --- Source/astcenc_compress_symbolic.cpp | 10 +++++----- Source/astcenc_ideal_endpoints_and_weights.cpp | 4 ++-- Source/astcenc_vecmathlib_common_4.h | 2 +- Source/astcenc_vecmathlib_neon_4.h | 4 ++-- Source/astcenc_weight_align.cpp | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Source/astcenc_compress_symbolic.cpp b/Source/astcenc_compress_symbolic.cpp index 6636109..ca1e981 100644 --- a/Source/astcenc_compress_symbolic.cpp +++ b/Source/astcenc_compress_symbolic.cpp @@ -210,7 +210,7 @@ static bool realign_weights_undecimated( vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3); vmask4 check_result_up = (error_up_vec < error_base_vec) & - (error_up_vec < error_down_vec) & (uqw_vec < vin4(64)); + (error_up_vec < error_down_vec) & (uqw_vec < vint4(64)); vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero()); check_result_down = check_result_down & (~check_result_up); @@ -220,10 +220,10 @@ static bool realign_weights_undecimated( uqw_vec = 
select(uqw_vec, uqw_up_vec, check_result_up); uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down); - dec_weights_uquant[texel] = uqw.lane<0>(); - dec_weights_uquant[texel + 1] = uqw.lane<1>(); - dec_weights_uquant[texel + 2] = uqw.lane<2>(); - dec_weights_uquant[texel + 3] = uqw.lane<3>(); + dec_weights_uquant[texel] = uqw_vec.lane<0>(); + dec_weights_uquant[texel + 1] = uqw_vec.lane<1>(); + dec_weights_uquant[texel + 2] = uqw_vec.lane<2>(); + dec_weights_uquant[texel + 3] = uqw_vec.lane<3>(); adjustments = true; } }; diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index 2a17899..29f5745 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -429,8 +429,8 @@ static void compute_ideal_colors_and_weights_3_comp( unsigned int partition_texel_count = pi.partition_texel_count[i]; - vfloat4 lowparam_vec = float4(1e10f, 1e10f, 1e10f, 1e10f); - vfloat4 highparam_vec = float4(-1e10f, -1e10f, -1e10f, -1e10f); + vfloat4 lowparam_vec = vfloat4(1e10f, 1e10f, 1e10f, 1e10f); + vfloat4 highparam_vec = vfloat4(-1e10f, -1e10f, -1e10f, -1e10f); unsigned int j = 0; for (; j + ASTCENC_SIMD_WIDTH <= partition_texel_count; j += ASTCENC_SIMD_WIDTH) diff --git a/Source/astcenc_vecmathlib_common_4.h b/Source/astcenc_vecmathlib_common_4.h index 00d7263..a19b954 100644 --- a/Source/astcenc_vecmathlib_common_4.h +++ b/Source/astcenc_vecmathlib_common_4.h @@ -301,7 +301,7 @@ ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a) return a.lane<0>() + a.lane<1>() + a.lane<2>(); } -#if !define(ASTCENC_USE_NATIVE_ADDV) +#if !defined(ASTCENC_USE_NATIVE_ADDV) /** * @brief Return the horizontal sum of a vector. 
*/ diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index 73efcbc..f6f8de1 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -1126,7 +1126,7 @@ ASTCENC_SIMD_INLINE int popcount(uint64_t v) */ ASTCENC_SIMD_INLINE int popcount(uint64x2_t v) { - return static_cast(vaddvq_u8(vcntq_u8(vreinterpret_u8_u64(v)))); + return static_cast(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u64(v)))); } /** @@ -1138,7 +1138,7 @@ ASTCENC_SIMD_INLINE int popcount(uint64x2_t v) */ ASTCENC_SIMD_INLINE int popcount(vmask4 v) { - return static_cast(vaddvq_u8(vcntq_u8(vreinterpret_u8_u32(v.m)))); + return static_cast(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u32(v.m)))); } #endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED diff --git a/Source/astcenc_weight_align.cpp b/Source/astcenc_weight_align.cpp index e1c79ba..84cec1f 100644 --- a/Source/astcenc_weight_align.cpp +++ b/Source/astcenc_weight_align.cpp @@ -177,8 +177,8 @@ static void compute_lowest_and_highest_weight( for (unsigned int j = 1; j < weight_count; j++) { float weight = dec_weight_ideal_value[j]; - __asm__ volatile("fmax %s0, %s0, %s1" : "+w"(max_weight) : "W"(weight)); - __asm__ volatile("fmin %s0, %s0, %s1" : "+w"(min_weight) : "W"(weight)); + __asm__ volatile("fmax %s0, %s0, %s1" : "+w"(max_weight) : "w"(weight)); + __asm__ volatile("fmin %s0, %s0, %s1" : "+w"(min_weight) : "w"(weight)); } } -- Gitee From 127d8d462e26d266283d219ce1b1c4ac8371464b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B4=E7=B4=A0=E8=B4=9D=E5=8F=B6=E6=96=AF?= Date: Thu, 5 Sep 2024 01:37:46 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=A3=80=E8=A7=86?= =?UTF-8?q?=E6=84=8F=E8=A7=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 朴素贝叶斯 --- Source/astcenc_weight_align.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Source/astcenc_weight_align.cpp b/Source/astcenc_weight_align.cpp index 
84cec1f..703a0c6 100644 --- a/Source/astcenc_weight_align.cpp +++ b/Source/astcenc_weight_align.cpp @@ -170,7 +170,8 @@ static void compute_lowest_and_highest_weight( float max_weight = 1.0f; float min_weight = 0.0f; - if (privateProfile == HIGH_QUALITY_PROFILE) + // in HIGH_SPEED_PROFILE, max_weight is always equal to 1.0, and min_weight is always equal to 0 + if (privateProfile != HIGH_SPEED_PROFILE) { max_weight = dec_weight_ideal_value[0]; min_weight = dec_weight_ideal_value[0]; @@ -542,13 +543,13 @@ void compute_angular_endpoints_2planes( } compute_angular_endpoints_for_quant_levels( - privateProfile, + privateProfile, weight_count, dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, max_precision, low_values1[i], high_values1[i]); compute_angular_endpoints_for_quant_levels( - privateProfile, + privateProfile, weight_count, dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, max_precision, low_values2[i], high_values2[i]); -- Gitee