From 4133894ec138ce2bd26d0fb7cc53f78aa20c0e97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B4=E7=B4=A0=E8=B4=9D=E5=8F=B6=E6=96=AF?= Date: Tue, 27 Aug 2024 07:28:33 +0000 Subject: [PATCH 1/3] astc-encoder optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 朴素贝叶斯 --- Source/astcenc_compress_symbolic.cpp | 213 +++++++++++++++++- Source/astcenc_find_best_partitioning.cpp | 14 ++ .../astcenc_ideal_endpoints_and_weights.cpp | 75 ++++-- Source/astcenc_internal.h | 4 + Source/astcenc_pick_best_endpoint_format.cpp | 129 +++++++++-- Source/astcenc_vecmathlib_common_4.h | 16 ++ Source/astcenc_vecmathlib_neon_4.h | 51 ++++- Source/astcenc_weight_align.cpp | 105 ++++++++- 8 files changed, 568 insertions(+), 39 deletions(-) diff --git a/Source/astcenc_compress_symbolic.cpp b/Source/astcenc_compress_symbolic.cpp index df55d05..6636109 100644 --- a/Source/astcenc_compress_symbolic.cpp +++ b/Source/astcenc_compress_symbolic.cpp @@ -69,6 +69,214 @@ static void merge_endpoints( * @param blk The image block color data to compress. * @param[out] scb The symbolic compressed block output. 
*/ +#if ASTCENC_NEON != 0 +static bool realign_weights_undecimated( + astcenc_profile decode_mode, + const block_size_descriptor& bsd, + const image_block& blk, + symbolic_compressed_block& scb +) { + // Get the partition descriptor + unsigned int partition_count = scb.partition_count; + const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); + + // Get the quantization table + const block_mode& bm = bsd.get_block_mode(scb.block_mode); + unsigned int weight_quant_level = bm.quant_mode; + const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; + + unsigned int max_plane = bm.is_dual_plane; + int plane2_component = scb.plane2_component; + vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); + + // Decode the color endpoints + bool rgb_hdr; + bool alpha_hdr; + vint4 endpnt0[BLOCK_MAX_PARTITIONS]; + vint4 endpnt1[BLOCK_MAX_PARTITIONS]; + vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; + vfloat4 offset[BLOCK_MAX_PARTITIONS]; + + promise(partition_count > 0); + + for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) + { + unpack_color_endpoints(decode_mode, + scb.color_formats[pa_idx], + scb.color_values[pa_idx], + rgb_hdr, alpha_hdr, + endpnt0[pa_idx], + endpnt1[pa_idx]); + } + + uint8_t* dec_weights_uquant = scb.weights; + bool adjustments = false; + + // For each plane and partition ... 
+ for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) + { + for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) + { + // Compute the endpoint delta for all components in current plane + vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; + epd = select(epd, vint4::zero(), plane_mask); + + endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); + offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); + } + + // For each weight compute previous, current, and next errors + promise(bsd.texel_count > 0); + + unsigned int texel = 0; + for (; texel + ASTCENC_SIMD_WIDTH <= bsd.texel_count; texel += ASTCENC_SIMD_WIDTH) + { + int uqw0 = dec_weights_uquant[texel]; + int uqw1 = dec_weights_uquant[texel + 1]; + int uqw2 = dec_weights_uquant[texel + 2]; + int uqw3 = dec_weights_uquant[texel + 3]; + + vint4 uqw_vec = vint4(uqw0, uqw1, uqw2, uqw3); + vint4 prev_and_next_vec = vint4(qat.prev_next_values[uqw0], qat.prev_next_values[uqw1], + qat.prev_next_values[uqw2], qat.prev_next_values[uqw3]); + + vint4 mask = vint4(0xFF, 0xFF, 0xFF, 0xFF); + vint4 uqw_down_vec = prev_and_next_vec & mask; + vint4 uqw_up_vec = vint4(vshrq_n_s32(prev_and_next_vec.m, 8)) & mask; + + vfloat4 weight_base_vec = int_to_float(uqw_vec); + vfloat4 weight_down_vec = int_to_float(uqw_down_vec) - weight_base_vec; + vfloat4 weight_up_vec = int_to_float(uqw_up_vec) - weight_base_vec; + + unsigned int partition0 = pi.partition_of_texel[texel]; + unsigned int partition1 = pi.partition_of_texel[texel + 1]; + unsigned int partition2 = pi.partition_of_texel[texel + 2]; + unsigned int partition3 = pi.partition_of_texel[texel + 3]; + + vfloat4 color_offset0 = offset[partition0]; + vfloat4 color_offset1 = offset[partition1]; + vfloat4 color_offset2 = offset[partition2]; + vfloat4 color_offset3 = offset[partition3]; + + vfloat4 color_base0 = endpnt0f[partition0]; + vfloat4 color_base1 = endpnt0f[partition1]; + vfloat4 color_base2 = endpnt0f[partition2]; + vfloat4 color_base3 = endpnt0f[partition3]; + + vfloat4 
color0 = color_base0 + color_offset0 * weight_base_vec.lane<0>(); + vfloat4 color1 = color_base1 + color_offset1 * weight_base_vec.lane<1>(); + vfloat4 color2 = color_base2 + color_offset2 * weight_base_vec.lane<2>(); + vfloat4 color3 = color_base3 + color_offset3 * weight_base_vec.lane<3>(); + + vfloat4 orig_color0 = blk.texel(texel); + vfloat4 orig_color1 = blk.texel(texel + 1); + vfloat4 orig_color2 = blk.texel(texel + 2); + vfloat4 orig_color3 = blk.texel(texel + 3); + + vfloat4 error_weight = blk.channel_weight; + + vfloat4 color_diff0 = color0 - orig_color0; + vfloat4 color_diff1 = color1 - orig_color1; + vfloat4 color_diff2 = color2 - orig_color2; + vfloat4 color_diff3 = color3 - orig_color3; + + vfloat4 color_diff_down0 = color_diff0 + color_offset0 * weight_down_vec.lane<0>(); + vfloat4 color_diff_down1 = color_diff1 + color_offset1 * weight_down_vec.lane<1>(); + vfloat4 color_diff_down2 = color_diff2 + color_offset2 * weight_down_vec.lane<2>(); + vfloat4 color_diff_down3 = color_diff3 + color_offset3 * weight_down_vec.lane<3>(); + + vfloat4 color_diff_up0 = color_diff0 + color_offset0 * weight_up_vec.lane<0>(); + vfloat4 color_diff_up1 = color_diff1 + color_offset1 * weight_up_vec.lane<1>(); + vfloat4 color_diff_up2 = color_diff2 + color_offset2 * weight_up_vec.lane<2>(); + vfloat4 color_diff_up3 = color_diff3 + color_offset3 * weight_up_vec.lane<3>(); + + float error_base0 = dot_s(color_diff0 * color_diff0, error_weight); + float error_base1 = dot_s(color_diff1 * color_diff1, error_weight); + float error_base2 = dot_s(color_diff2 * color_diff2, error_weight); + float error_base3 = dot_s(color_diff3 * color_diff3, error_weight); + + float error_down0 = dot_s(color_diff_down0 * color_diff_down0, error_weight); + float error_down1 = dot_s(color_diff_down1 * color_diff_down1, error_weight); + float error_down2 = dot_s(color_diff_down2 * color_diff_down2, error_weight); + float error_down3 = dot_s(color_diff_down3 * color_diff_down3, error_weight); + + float 
error_up0 = dot_s(color_diff_up0 * color_diff_up0, error_weight); + float error_up1 = dot_s(color_diff_up1 * color_diff_up1, error_weight); + float error_up2 = dot_s(color_diff_up2 * color_diff_up2, error_weight); + float error_up3 = dot_s(color_diff_up3 * color_diff_up3, error_weight); + + vfloat4 error_base_vec = vfloat4(error_base0, error_base1, error_base2, error_base3); + vfloat4 error_down_vec = vfloat4(error_down0, error_down1, error_down2, error_down3); + vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3); + + vmask4 check_result_up = (error_up_vec < error_base_vec) & + (error_up_vec < error_down_vec) & (uqw_vec < vin4(64)); + + vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero()); + check_result_down = check_result_down & (~check_result_up); + + if (popcount(check_result_up | check_result_down) != 0) + { + uqw_vec = select(uqw_vec, uqw_up_vec, check_result_up); + uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down); + + dec_weights_uquant[texel] = uqw.lane<0>(); + dec_weights_uquant[texel + 1] = uqw.lane<1>(); + dec_weights_uquant[texel + 2] = uqw.lane<2>(); + dec_weights_uquant[texel + 3] = uqw.lane<3>(); + adjustments = true; + } + }; + + for (; texel < bsd.texel_count; texel++) + { + int uqw = dec_weights_uquant[texel]; + + uint32_t prev_and_next = qat.prev_next_values[uqw]; + int uqw_down = prev_and_next & 0xFF; + int uqw_up = (prev_and_next >> 8) & 0xFF; + + // Interpolate the colors to create the diffs + float weight_base = static_cast(uqw); + float weight_down = static_cast(uqw_down - uqw); + float weight_up = static_cast(uqw_up - uqw); + + unsigned int partition = pi.partition_of_texel[texel]; + vfloat4 color_offset = offset[partition]; + vfloat4 color_base = endpnt0f[partition]; + + vfloat4 color = color_base + color_offset * weight_base; + vfloat4 orig_color = blk.texel(texel); + vfloat4 error_weight = blk.channel_weight; + + vfloat4 color_diff = color - orig_color; + vfloat4 
color_diff_down = color_diff + color_offset * weight_down; + vfloat4 color_diff_up = color_diff + color_offset * weight_up; + + float error_base = dot_s(color_diff * color_diff, error_weight); + float error_down = dot_s(color_diff_down * color_diff_down, error_weight); + float error_up = dot_s(color_diff_up * color_diff_up, error_weight); + + // Check if the prev or next error is better, and if so use it + if ((error_up < error_base) && (error_up < error_down) && (uqw < 64)) + { + dec_weights_uquant[texel] = static_cast(uqw_up); + adjustments = true; + } + else if ((error_down < error_base) && (uqw > 0)) + { + dec_weights_uquant[texel] = static_cast(uqw_down); + adjustments = true; + } + } + + // Prepare iteration for plane 2 + dec_weights_uquant += WEIGHTS_PLANE2_OFFSET; + plane_mask = ~plane_mask; + } + return adjustments; +} +#else static bool realign_weights_undecimated( astcenc_profile decode_mode, const block_size_descriptor& bsd, @@ -175,6 +383,7 @@ static bool realign_weights_undecimated( return adjustments; } +#endif /** * @brief Attempt to improve weights given a chosen configuration. 
@@ -423,7 +632,7 @@ static float compress_symbolic_block_for_partition_1plane( // For each mode, use the angular method to compute a shift compute_angular_endpoints_1plane( - only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); + privateProfile, only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); float* weight_low_value = tmpbuf.weight_low_value1; float* weight_high_value = tmpbuf.weight_high_value1; @@ -799,7 +1008,7 @@ static float compress_symbolic_block_for_partition_2planes( float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask)); compute_angular_endpoints_2planes( - bsd, dec_weights_ideal, max_weight_quant, tmpbuf); + privateProfile, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); // For each mode (which specifies a decimation and a quantization): // * Compute number of bits needed for the quantized weights diff --git a/Source/astcenc_find_best_partitioning.cpp b/Source/astcenc_find_best_partitioning.cpp index bfbcc35..5d0682e 100644 --- a/Source/astcenc_find_best_partitioning.cpp +++ b/Source/astcenc_find_best_partitioning.cpp @@ -250,6 +250,19 @@ static void kmeans_update( * * @return The number of bit mismatches. */ +#if ASTCENC_NEON != 0 +static inline uint8_t partition_mismatch2( + const uint64_t a[2], + const uint64_t b[2] +) { + uint64x2_t a01 = vld1q_u64(a); + uint64x2_t b01 = vld1q_u64(b); + uint64x2_t b10 = vextq_u64(b01, b01, 1); + uint8_t c1 = popcount(veorq_u64(a01, b01)); + uint8_t c2 = popcount(veorq_u64(a01, b10)); + return static_cast(astc::min(c1, c2) / 2); +} +#else static inline uint8_t partition_mismatch2( const uint64_t a[2], const uint64_t b[2] @@ -261,6 +274,7 @@ static inline uint8_t partition_mismatch2( // in the expected position, and again when present in the wrong partition return static_cast(astc::min(v1, v2) / 2); } +#endif /** * @brief Compute bit-mismatch for partitioning in 3-partition mode. 
diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index 051782f..2a17899 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -364,7 +364,7 @@ static void compute_ideal_colors_and_weights_3_comp( unsigned int texel_count = blk.texel_count; promise(texel_count > 0); - partition_metrics pms[BLOCK_MAX_PARTITIONS]; + partition_metrics *pms = (partition_metrics *)&blk.pms[0]; float error_weight; const float* data_vr = nullptr; @@ -372,7 +372,7 @@ static void compute_ideal_colors_and_weights_3_comp( const float* data_vb = nullptr; if (omitted_component == 0) { - error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); + error_weight = hadd_s(blk.channel_weight.swz<1, 2, 3>()); data_vr = blk.data_g; data_vg = blk.data_b; data_vb = blk.data_a; @@ -428,7 +428,52 @@ static void compute_ideal_colors_and_weights_3_comp( float highparam { -1e10f }; unsigned int partition_texel_count = pi.partition_texel_count[i]; - for (unsigned int j = 0; j < partition_texel_count; j++) + + vfloat4 lowparam_vec = float4(1e10f, 1e10f, 1e10f, 1e10f); + vfloat4 highparam_vec = float4(-1e10f, -1e10f, -1e10f, -1e10f); + + unsigned int j = 0; + for (; j + ASTCENC_SIMD_WIDTH <= partition_texel_count; j += ASTCENC_SIMD_WIDTH) + { + unsigned int tix0 = pi.texels_of_partition[i][j]; + unsigned int tix1 = pi.texels_of_partition[i][j + 1]; + unsigned int tix2 = pi.texels_of_partition[i][j + 2]; + unsigned int tix3 = pi.texels_of_partition[i][j + 3]; + + vfloat4 points0 = vfloat4(data_vr[tix0], data_vg[tix0], data_vb[tix0], 0.0f); + vfloat4 points1 = vfloat4(data_vr[tix1], data_vg[tix1], data_vb[tix1], 0.0f); + vfloat4 points2 = vfloat4(data_vr[tix2], data_vg[tix2], data_vb[tix2], 0.0f); + vfloat4 points3 = vfloat4(data_vr[tix3], data_vg[tix3], data_vb[tix3], 0.0f); + + vfloat4 sub_v0 = points0 - line.a; + vfloat4 sub_v1 = points1 - line.a; + vfloat4 sub_v2 = points2 - line.a; + 
vfloat4 sub_v3 = points3 - line.a; + + vfloat4 params0 = sub_v0 * line.b; + vfloat4 params1 = sub_v1 * line.b; + vfloat4 params2 = sub_v2 * line.b; + vfloat4 params3 = sub_v3 * line.b; + + float param0 = hadd_rgba_s(params0); + float param1 = hadd_rgba_s(params1); + float param2 = hadd_rgba_s(params2); + float param3 = hadd_rgba_s(params3); + + ei.weights[tix0] = param0; + ei.weights[tix1] = param1; + ei.weights[tix2] = param2; + ei.weights[tix3] = param3; + + vfloat4 params_vec = vfloat4(param0, param1, param2, param3); + lowparam_vec = min(params_vec, lowparam_vec); + highparam_vec = max(params_vec, highparam_vec); + } + + lowparam = hmin_s(vfloat4(lowparam_vec)); + highparam = hmax_s(vfloat4(highparam_vec)); + + for (; j < partition_texel_count; j++) { unsigned int tix = pi.texels_of_partition[i][j]; vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]); @@ -460,7 +505,7 @@ static void compute_ideal_colors_and_weights_3_comp( is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; } - for (unsigned int j = 0; j < partition_texel_count; j++) + for (j = 0; j < partition_texel_count; j++) { unsigned int tix = pi.texels_of_partition[i][j]; float idx = (ei.weights[tix] - lowparam) * scale; @@ -894,7 +939,11 @@ void compute_ideal_weights_for_decimation( for (unsigned int j = 0; j < max_texel_count; j++) { +#ifdef ASTCENC_USE_COMMON_GATHERF + const uint8_t* texel = di.weight_texels_tr[j] + i; +#else vint texel(di.weight_texels_tr[j] + i); +#endif vfloat weight = loada(di.weights_texel_contribs_tr[j] + i); if (!constant_wes) @@ -952,7 +1001,11 @@ void compute_ideal_weights_for_decimation( for (unsigned int j = 0; j < max_texel_count; j++) { +#ifdef ASTCENC_USE_COMMON_GATHERF + const uint8_t* texel = di.weight_texels_tr[j] + i; +#else vint texel(di.weight_texels_tr[j] + i); +#endif vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i); if (!constant_wes) @@ -1041,12 +1094,9 @@ void compute_quantized_weights_for_decimation( vint 
ixli = vtable_8bt_32bi(tab0p, weightl); vint ixhi = vtable_8bt_32bi(tab0p, weighth); - vfloat ixl = int_to_float(ixli); - vfloat ixh = int_to_float(ixhi); - - vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix); + vmask mask = int_to_float(ixli + ixhi) < (vfloat(128.0f) * ix); vint weight = select(ixli, ixhi, mask); - ixl = select(ixl, ixh, mask); + vfloat ixl = int_to_float(weight); // Invert the weight-scaling that was done initially storea(ixl * rscalev + low_boundv, weight_set_out + i); @@ -1075,12 +1125,9 @@ void compute_quantized_weights_for_decimation( vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); - vfloat ixl = int_to_float(ixli); - vfloat ixh = int_to_float(ixhi); - - vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix); + vmask mask = int_to_float(ixli + ixhi) < (vfloat(128.0f) * ix); vint weight = select(ixli, ixhi, mask); - ixl = select(ixl, ixh, mask); + vfloat ixl = int_to_float(weight); // Invert the weight-scaling that was done initially storea(ixl * rscalev + low_boundv, weight_set_out + i); diff --git a/Source/astcenc_internal.h b/Source/astcenc_internal.h index bad1247..ba6a4b9 100644 --- a/Source/astcenc_internal.h +++ b/Source/astcenc_internal.h @@ -763,6 +763,8 @@ struct image_block /** @brief The input (compress) or output (decompress) data for the alpha color component. */ ASTCENC_ALIGNAS float data_a[BLOCK_MAX_TEXELS]; + partition_metrics pms[BLOCK_MAX_PARTITIONS]; + /** @brief The number of texels in the block. */ uint8_t texel_count; @@ -2040,6 +2042,7 @@ void prepare_angular_tables(); * @param[out] tmpbuf Preallocated scratch buffers for the compressor. */ void compute_angular_endpoints_1plane( + QualityProfile privateProfile, bool only_always, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, @@ -2055,6 +2058,7 @@ void compute_angular_endpoints_1plane( * @param[out] tmpbuf Preallocated scratch buffers for the compressor. 
*/ void compute_angular_endpoints_2planes( + QualityProfile privateProfile, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, unsigned int max_weight_quant, diff --git a/Source/astcenc_pick_best_endpoint_format.cpp b/Source/astcenc_pick_best_endpoint_format.cpp index dbc12ec..ff4e445 100644 --- a/Source/astcenc_pick_best_endpoint_format.cpp +++ b/Source/astcenc_pick_best_endpoint_format.cpp @@ -120,9 +120,99 @@ static void compute_error_squared_rgb_single_partition( vfloat l_bs1(l_pline.bs.lane<1>()); vfloat l_bs2(l_pline.bs.lane<2>()); - vint lane_ids = vint::lane_id(); - for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + vfloat one_third(1/3.0f, 1/3.0f, 1/3.0f, 1/3.0f); + vfloat uncor_errv0 = vfloat::zero(); + vfloat uncor_errv1 = vfloat::zero(); + vfloat uncor_errv2 = vfloat::zero(); + vfloat samec_errv0 = vfloat::zero(); + vfloat samec_errv1 = vfloat::zero(); + vfloat samec_errv2 = vfloat::zero(); + vfloat rgbl_errv0 = vfloat::zero(); + vfloat rgbl_errv1 = vfloat::zero(); + vfloat rgbl_errv2 = vfloat::zero(); + vfloat l_errv0 = vfloat::zero(); + vfloat l_errv1 = vfloat::zero(); + vfloat l_errv2 = vfloat::zero(); + + unsigned int i = 0; + for (; i + ASTCENC_SIMD_WIDTH <= texel_count; i += ASTCENC_SIMD_WIDTH) { +#ifdef ASTCENC_USE_COMMON_GATHERF + const uint8_t* tix = texel_indexes + i; +#else + vint tix(texel_indexes + i); +#endif + + // Compute the error that arises from just ditching alpha + vfloat data_a = gatherf(blk.data_a, tix); + vfloat alpha_diff = data_a - default_a; + alpha_diff = alpha_diff * alpha_diff; + + haccumulate(a_drop_errv, alpha_diff); + + vfloat data_r = gatherf(blk.data_r, tix); + vfloat data_g = gatherf(blk.data_g, tix); + vfloat data_b = gatherf(blk.data_b, tix); + + vfloat data_rgb_avg = (data_r + data_g + data_b) * one_third; + vfloat data_rgb_0 = data_rgb_avg - data_r; + vfloat data_rgb_1 = data_rgb_avg - data_g; + vfloat data_rgb_2 = data_rgb_avg - data_b; + + // Compute uncorrelated error + 
vfloat param = data_r * uncor_bs0 + + data_g * uncor_bs1 + + data_b * uncor_bs2; + + vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r; + vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g; + vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b; + + haccumulate(uncor_errv0, dist0 * dist0); + haccumulate(uncor_errv1, dist1 * dist1); + haccumulate(uncor_errv2, dist2 * dist2); + + // Compute same chroma error - no "amod", its always zero + param = data_r * samec_bs0 + + data_g * samec_bs1 + + data_b * samec_bs2; + + dist0 = (param * samec_bs0) - data_r; + dist1 = (param * samec_bs1) - data_g; + dist2 = (param * samec_bs2) - data_b; + + haccumulate(samec_errv0, dist0 * dist0); + haccumulate(samec_errv1, dist1 * dist1); + haccumulate(samec_errv2, dist2 * dist2); + + // Compute rgbl error + dist0 = rgbl_amod0 + data_rgb_0; + dist1 = rgbl_amod1 + data_rgb_1; + dist2 = rgbl_amod2 + data_rgb_2; + + haccumulate(rgbl_errv0, dist0 * dist0); + haccumulate(rgbl_errv1, dist1 * dist1); + haccumulate(rgbl_errv2, dist2 * dist2); + + // Compute luma error - no "amod", its always zero + dist0 = data_rgb_0; + dist1 = data_rgb_1; + dist2 = data_rgb_2; + + haccumulate(l_errv0, dist0 * dist0); + haccumulate(l_errv1, dist1 * dist1); + haccumulate(l_errv2, dist2 * dist2); + } + + uncor_errv = uncor_errv0 * ews.lane<0>() + uncor_errv1 * ews.lane<1>() + uncor_errv2 * ews.lane<2>(); + samec_errv = samec_errv0 * ews.lane<0>() + samec_errv1 * ews.lane<1>() + samec_errv2 * ews.lane<2>(); + rgbl_errv = rgbl_errv0 * ews.lane<0>() + rgbl_errv1 * ews.lane<1>() + rgbl_errv2 * ews.lane<2>(); + l_errv = l_errv0 * ews.lane<0>() + l_errv1 * ews.lane<1>() + l_errv2 * ews.lane<2>(); + + + if (i < texel_count) + { + vint lane_ids = vint::lane_id() + i; vint tix(texel_indexes + i); vmask mask = lane_ids < vint(texel_count); @@ -139,6 +229,11 @@ static void compute_error_squared_rgb_single_partition( vfloat data_g = gatherf(blk.data_g, tix); vfloat data_b = gatherf(blk.data_b, tix); + 
vfloat data_rgb_avg = (data_r + data_g + data_b) * one_third; + vfloat data_rgb_0 = data_rgb_avg - data_r; + vfloat data_rgb_1 = data_rgb_avg - data_g; + vfloat data_rgb_2 = data_rgb_avg - data_b; + // Compute uncorrelated error vfloat param = data_r * uncor_bs0 + data_g * uncor_bs1 @@ -170,13 +265,9 @@ static void compute_error_squared_rgb_single_partition( haccumulate(samec_errv, error, mask); // Compute rgbl error - param = data_r * rgbl_bs0 - + data_g * rgbl_bs1 - + data_b * rgbl_bs2; - - dist0 = (rgbl_amod0 + param * rgbl_bs0) - data_r; - dist1 = (rgbl_amod1 + param * rgbl_bs1) - data_g; - dist2 = (rgbl_amod2 + param * rgbl_bs2) - data_b; + dist0 = rgbl_amod0 + data_rgb_0; + dist1 = rgbl_amod1 + data_rgb_1; + dist2 = rgbl_amod2 + data_rgb_2; error = dist0 * dist0 * ews.lane<0>() + dist1 * dist1 * ews.lane<1>() @@ -185,13 +276,9 @@ static void compute_error_squared_rgb_single_partition( haccumulate(rgbl_errv, error, mask); // Compute luma error - no "amod", its always zero - param = data_r * l_bs0 - + data_g * l_bs1 - + data_b * l_bs2; - - dist0 = (param * l_bs0) - data_r; - dist1 = (param * l_bs1) - data_g; - dist2 = (param * l_bs2) - data_b; + dist0 = data_rgb_0; + dist1 = data_rgb_1; + dist2 = data_rgb_2; error = dist0 * dist0 * ews.lane<0>() + dist1 * dist1 * ews.lane<1>() @@ -220,6 +307,7 @@ static void compute_error_squared_rgb_single_partition( * @param[out] eci The resulting encoding choice error metrics. 
*/ static void compute_encoding_choice_errors( + QualityProfile privateProfile, const image_block& blk, const partition_info& pi, const endpoints& ep, @@ -228,9 +316,12 @@ static void compute_encoding_choice_errors( int partition_count = pi.partition_count; promise(partition_count > 0); - partition_metrics pms[BLOCK_MAX_PARTITIONS]; + partition_metrics *pms = (partition_metrics *)&blk.pms[0]; - compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); + if (!blk.is_constant_channel(3) || (partition_count != 1 && privateProfile == HIGH_QUALITY_PROFILE)) + { + compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); + } for (int i = 0; i < partition_count; i++) { @@ -1133,7 +1224,7 @@ unsigned int compute_ideal_endpoint_formats( // Compute the errors that result from various encoding choices (such as using luminance instead // of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on) encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]; - compute_encoding_choice_errors(blk, pi, ep, eci); + compute_encoding_choice_errors(privateProfile, blk, pi, ep, eci); float best_error[BLOCK_MAX_PARTITIONS][21][4]; uint8_t format_of_choice[BLOCK_MAX_PARTITIONS][21][4]; diff --git a/Source/astcenc_vecmathlib_common_4.h b/Source/astcenc_vecmathlib_common_4.h index 1e04367..00d7263 100644 --- a/Source/astcenc_vecmathlib_common_4.h +++ b/Source/astcenc_vecmathlib_common_4.h @@ -287,6 +287,12 @@ ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m) haccumulate(accum, a); } +#define ASTCENC_USE_COMMON_GATHERF +ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, const uint8_t* idx) +{ + return vfloat4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]); +} + /** * @brief Return the horizontal sum of RGB vector lanes as a scalar. */ @@ -295,6 +301,16 @@ ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a) return a.lane<0>() + a.lane<1>() + a.lane<2>(); } +#if !define(ASTCENC_USE_NATIVE_ADDV) +/** + * @brief Return the horizontal sum of a vector. 
+ */ +ASTCENC_SIMD_INLINE float hadd_rgba_s(vfloat4 a) +{ + return a.lane<0>() + a.lane<1>() + a.lane<2>() + a.lane<3>(); +} +#endif + #if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT) /** diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index 42545e7..73efcbc 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -579,6 +579,25 @@ ASTCENC_SIMD_INLINE int hadd_s(vint4 a) return vget_lane_s32(vpadd_s32(t, t), 0); } +/** + * @brief Return the horizontal sum of a vector. + */ +ASTCENC_SIMD_INLINE uint32_t hadd_s(vmask4 a) +{ + // Use add with SIMD versions + return vaddvq_u32(a.m); +} + +#define ASTCENC_USE_NATIVE_ADDV +/** + * @brief Return the horizontal sum of a vector. + */ +ASTCENC_SIMD_INLINE float hadd_rgba_s(vfloat4 a) +{ + // Use add with SIMD versions + return vaddvq_f32(a.m); +} + /** * @brief Store a vector to a 16B aligned memory address. */ @@ -631,10 +650,12 @@ ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices) */ ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) { - alignas(16) uint8_t shuf[16] { - 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + uint8x16_t idx = { + 0, 4, 8, 12, + 0, 0, 0, 0, + 0, 0, 0, 0, + 0, 0, 0, 0 }; - uint8x16_t idx = vld1q_u8(shuf); int8x16_t av = vreinterpretq_s8_s32(a.m); return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx))); } @@ -1096,4 +1117,28 @@ ASTCENC_SIMD_INLINE int popcount(uint64_t v) return static_cast(vaddlv_u8(vcnt_u8(vcreate_u8(v)))); } +/** + * @brief Population bit count. + * + * @param v The value to population count. + * + * @return The number of 1 bits. + */ +ASTCENC_SIMD_INLINE int popcount(uint64x2_t v) +{ + return static_cast(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u64(v)))); +} + +/** + * @brief Population bit count. + * + * @param v The value to population count. + + * @return The number of 1 bits. 
+ */ +ASTCENC_SIMD_INLINE int popcount(vmask4 v) +{ + return static_cast(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u32(v.m)))); +} + #endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED diff --git a/Source/astcenc_weight_align.cpp b/Source/astcenc_weight_align.cpp index 4e993e7..e1c79ba 100644 --- a/Source/astcenc_weight_align.cpp +++ b/Source/astcenc_weight_align.cpp @@ -149,7 +149,9 @@ static void compute_angular_offsets( * @param[out] cut_low_weight_error Per angular step, the low weight cut error. * @param[out] cut_high_weight_error Per angular step, the high weight cut error. */ +#if ASTCENC_NEON != 0 static void compute_lowest_and_highest_weight( + QualityProfile privateProfile, unsigned int weight_count, const float* dec_weight_ideal_value, unsigned int max_angular_steps, @@ -166,6 +168,100 @@ static void compute_lowest_and_highest_weight( vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f); + float max_weight = 1.0f; + float min_weight = 0.0f; + if (privateProfile == HIGH_QUALITY_PROFILE) + { + max_weight = dec_weight_ideal_value[0]; + min_weight = dec_weight_ideal_value[0]; + for (unsigned int j = 1; j < weight_count; j++) + { + float weight = dec_weight_ideal_value[j]; + __asm__ volatile("fmax %s0, %s0, %s1" : "+w"(max_weight) : "w"(weight)); + __asm__ volatile("fmin %s0, %s0, %s1" : "+w"(min_weight) : "w"(weight)); + } + } + + // Arrays are ANGULAR_STEPS long, so always safe to run full vectors + for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH) + { + vfloat errval = vfloat::zero(); + vfloat cut_low_weight_err = vfloat::zero(); + vfloat cut_high_weight_err = vfloat::zero(); + vfloat offset = loada(offsets + sp); + + offset = (vfloat)vnegq_f32(offset.m); + vfloat maxidx = vfloat::zero(); + vfloat minidx = vfloat::zero(); + + if (privateProfile == HIGH_SPEED_PROFILE) + { + maxidx = round((vfloat)vaddq_f32(rcp_stepsize.m, offset.m)); + minidx = round(offset); + } + else + { + maxidx = round((vfloat)vfmaq_n_f32(offset.m, rcp_stepsize.m, 
max_weight)); + minidx = round((vfloat)vfmaq_n_f32(offset.m, rcp_stepsize.m, min_weight)); + } + + for (unsigned int j = 0; j < weight_count; j++) + { + vfloat sval = (vfloat)vfmaq_n_f32(offset.m, rcp_stepsize.m, *(dec_weight_ideal_value + j)); + vfloat svalrte = round(sval); + vfloat diff = sval - svalrte; + errval += diff * diff; + + // Accumulate on min hit + vmask mask = svalrte == minidx; + vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff; + cut_low_weight_err = select(cut_low_weight_err, accum, mask); + + // Accumulate on max hit + mask = svalrte == maxidx; + accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff; + cut_high_weight_err = select(cut_high_weight_err, accum, mask); + } + + // Write out min weight and weight span; clamp span to a usable range + vint span = float_to_int(maxidx - minidx + vfloat(1)); + span = min(span, vint(max_quant_steps + 3)); + span = max(span, vint(2)); + storea(minidx, lowest_weight + sp); + storea(span, weight_span + sp); + + // The cut_(lowest/highest)_weight_error indicate the error that results from forcing + // samples that should have had the weight value one step (up/down). 
+ vfloat ssize = 1.0f / rcp_stepsize; + vfloat errscale = ssize * ssize; + storea(errval * errscale, error + sp); + storea(cut_low_weight_err * errscale, cut_low_weight_error + sp); + storea(cut_high_weight_err * errscale, cut_high_weight_error + sp); + + rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH); + } + +} +#else +static void compute_lowest_and_highest_weight( + QualityProfile privateProfile, + unsigned int weight_count, + const float* dec_weight_ideal_value, + unsigned int max_angular_steps, + unsigned int max_quant_steps, + const float* offsets, + float* lowest_weight, + int* weight_span, + float* error, + float* cut_low_weight_error, + float* cut_high_weight_error +) { + (void) privateProfile; + promise(weight_count > 0); + promise(max_angular_steps > 0); + + vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f); + // Arrays are ANGULAR_STEPS long, so always safe to run full vectors for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH) { @@ -222,6 +318,7 @@ static void compute_lowest_and_highest_weight( rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH); } } +#endif /** * @brief The main function for the angular algorithm. @@ -233,6 +330,7 @@ static void compute_lowest_and_highest_weight( * @param[out] high_value Per angular step, the highest weight value. 
*/ static void compute_angular_endpoints_for_quant_levels( + QualityProfile privateProfile, unsigned int weight_count, const float* dec_weight_ideal_value, unsigned int max_quant_level, @@ -253,7 +351,7 @@ static void compute_angular_endpoints_for_quant_levels( ASTCENC_ALIGNAS float cut_low_weight_error[ANGULAR_STEPS]; ASTCENC_ALIGNAS float cut_high_weight_error[ANGULAR_STEPS]; - compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value, + compute_lowest_and_highest_weight(privateProfile, weight_count, dec_weight_ideal_value, max_angular_steps, max_quant_steps, angular_offsets, lowest_weight, weight_span, error, cut_low_weight_error, cut_high_weight_error); @@ -335,6 +433,7 @@ static void compute_angular_endpoints_for_quant_levels( /* See header for documentation. */ void compute_angular_endpoints_1plane( + QualityProfile privateProfile, bool only_always, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, @@ -372,6 +471,7 @@ void compute_angular_endpoints_1plane( } compute_angular_endpoints_for_quant_levels( + privateProfile, weight_count, dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, max_precision, low_values[i], high_values[i]); @@ -403,6 +503,7 @@ void compute_angular_endpoints_1plane( /* See header for documentation. 
*/ void compute_angular_endpoints_2planes( + QualityProfile privateProfile, const block_size_descriptor& bsd, const float* dec_weight_ideal_value, unsigned int max_weight_quant, @@ -441,11 +542,13 @@ void compute_angular_endpoints_2planes( } compute_angular_endpoints_for_quant_levels( + privateProfile, weight_count, dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, max_precision, low_values1[i], high_values1[i]); compute_angular_endpoints_for_quant_levels( + privateProfile, weight_count, dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, max_precision, low_values2[i], high_values2[i]); -- Gitee From 972e4b4506ce2a07398cdc02d32b656573ac9874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B4=E7=B4=A0=E8=B4=9D=E5=8F=B6=E6=96=AF?= Date: Tue, 27 Aug 2024 09:09:53 +0000 Subject: [PATCH 2/3] astcenc-encoder optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 朴素贝叶斯 --- Source/astcenc_compress_symbolic.cpp | 10 +++++----- Source/astcenc_ideal_endpoints_and_weights.cpp | 4 ++-- Source/astcenc_vecmathlib_common_4.h | 2 +- Source/astcenc_vecmathlib_neon_4.h | 4 ++-- Source/astcenc_weight_align.cpp | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Source/astcenc_compress_symbolic.cpp b/Source/astcenc_compress_symbolic.cpp index 6636109..ca1e981 100644 --- a/Source/astcenc_compress_symbolic.cpp +++ b/Source/astcenc_compress_symbolic.cpp @@ -210,7 +210,7 @@ static bool realign_weights_undecimated( vfloat4 error_up_vec = vfloat4(error_up0, error_up1, error_up2, error_up3); vmask4 check_result_up = (error_up_vec < error_base_vec) & - (error_up_vec < error_down_vec) & (uqw_vec < vin4(64)); + (error_up_vec < error_down_vec) & (uqw_vec < vint4(64)); vmask4 check_result_down = (error_down_vec < error_base_vec) & (uqw_vec > vint4::zero()); check_result_down = check_result_down & (~check_result_up); @@ -220,10 +220,10 @@ static bool realign_weights_undecimated( uqw_vec = 
select(uqw_vec, uqw_up_vec, check_result_up); uqw_vec = select(uqw_vec, uqw_down_vec, check_result_down); - dec_weights_uquant[texel] = uqw.lane<0>(); - dec_weights_uquant[texel + 1] = uqw.lane<1>(); - dec_weights_uquant[texel + 2] = uqw.lane<2>(); - dec_weights_uquant[texel + 3] = uqw.lane<3>(); + dec_weights_uquant[texel] = uqw_vec.lane<0>(); + dec_weights_uquant[texel + 1] = uqw_vec.lane<1>(); + dec_weights_uquant[texel + 2] = uqw_vec.lane<2>(); + dec_weights_uquant[texel + 3] = uqw_vec.lane<3>(); adjustments = true; } }; diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp index 2a17899..29f5745 100644 --- a/Source/astcenc_ideal_endpoints_and_weights.cpp +++ b/Source/astcenc_ideal_endpoints_and_weights.cpp @@ -429,8 +429,8 @@ static void compute_ideal_colors_and_weights_3_comp( unsigned int partition_texel_count = pi.partition_texel_count[i]; - vfloat4 lowparam_vec = float4(1e10f, 1e10f, 1e10f, 1e10f); - vfloat4 highparam_vec = float4(-1e10f, -1e10f, -1e10f, -1e10f); + vfloat4 lowparam_vec = vfloat4(1e10f, 1e10f, 1e10f, 1e10f); + vfloat4 highparam_vec = vfloat4(-1e10f, -1e10f, -1e10f, -1e10f); unsigned int j = 0; for (; j + ASTCENC_SIMD_WIDTH <= partition_texel_count; j += ASTCENC_SIMD_WIDTH) diff --git a/Source/astcenc_vecmathlib_common_4.h b/Source/astcenc_vecmathlib_common_4.h index 00d7263..a19b954 100644 --- a/Source/astcenc_vecmathlib_common_4.h +++ b/Source/astcenc_vecmathlib_common_4.h @@ -301,7 +301,7 @@ ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a) return a.lane<0>() + a.lane<1>() + a.lane<2>(); } -#if !define(ASTCENC_USE_NATIVE_ADDV) +#if !defined(ASTCENC_USE_NATIVE_ADDV) /** * @brief Return the horizontal sum of a vector. 
*/ diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h index 73efcbc..f6f8de1 100644 --- a/Source/astcenc_vecmathlib_neon_4.h +++ b/Source/astcenc_vecmathlib_neon_4.h @@ -1126,7 +1126,7 @@ ASTCENC_SIMD_INLINE int popcount(uint64_t v) */ ASTCENC_SIMD_INLINE int popcount(uint64x2_t v) { - return static_cast(vaddvq_u8(vcntq_u8(vreinterpret_u8_u64(v)))); + return static_cast(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u64(v)))); } /** @@ -1138,7 +1138,7 @@ ASTCENC_SIMD_INLINE int popcount(uint64x2_t v) */ ASTCENC_SIMD_INLINE int popcount(vmask4 v) { - return static_cast(vaddvq_u8(vcntq_u8(vreinterpret_u8_u32(v.m)))); + return static_cast(vaddvq_u8(vcntq_u8(vreinterpretq_u8_u32(v.m)))); } #endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED diff --git a/Source/astcenc_weight_align.cpp b/Source/astcenc_weight_align.cpp index e1c79ba..84cec1f 100644 --- a/Source/astcenc_weight_align.cpp +++ b/Source/astcenc_weight_align.cpp @@ -177,8 +177,8 @@ static void compute_lowest_and_highest_weight( for (unsigned int j = 1; j < weight_count; j++) { float weight = dec_weight_ideal_value[j]; - __asm__ volatile("fmax %s0, %s0, %s1" : "+w"(max_weight) : "W"(weight)); - __asm__ volatile("fmin %s0, %s0, %s1" : "+w"(min_weight) : "W"(weight)); + __asm__ volatile("fmax %s0, %s0, %s1" : "+w"(max_weight) : "w"(weight)); + __asm__ volatile("fmin %s0, %s0, %s1" : "+w"(min_weight) : "w"(weight)); } } -- Gitee From 127d8d462e26d266283d219ce1b1c4ac8371464b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B4=E7=B4=A0=E8=B4=9D=E5=8F=B6=E6=96=AF?= Date: Thu, 5 Sep 2024 01:37:46 +0000 Subject: [PATCH 3/3] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E6=A3=80=E8=A7=86?= =?UTF-8?q?=E6=84=8F=E8=A7=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 朴素贝叶斯 --- Source/astcenc_weight_align.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Source/astcenc_weight_align.cpp b/Source/astcenc_weight_align.cpp index 
84cec1f..703a0c6 100644 --- a/Source/astcenc_weight_align.cpp +++ b/Source/astcenc_weight_align.cpp @@ -170,7 +170,8 @@ static void compute_lowest_and_highest_weight( float max_weight = 1.0f; float min_weight = 0.0f; - if (privateProfile == HIGH_QUALITY_PROFILE) + // in HIGH_SPEED_PROFILE, max_weight is always equal to 1.0, and min_weight is always equal to 0 + if (privateProfile != HIGH_SPEED_PROFILE) { max_weight = dec_weight_ideal_value[0]; min_weight = dec_weight_ideal_value[0]; @@ -542,13 +543,13 @@ void compute_angular_endpoints_2planes( } compute_angular_endpoints_for_quant_levels( - privateProfile, + privateProfile, weight_count, dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, max_precision, low_values1[i], high_values1[i]); compute_angular_endpoints_for_quant_levels( - privateProfile, + privateProfile, weight_count, dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, max_precision, low_values2[i], high_values2[i]); -- Gitee