diff --git a/webp-opt.patch b/webp-opt.patch
new file mode 100644
index 0000000000000000000000000000000000000000..a006d3e2149ec9d29ce4e9e087a0ef547b19af11
--- /dev/null
+++ b/webp-opt.patch
@@ -0,0 +1,482 @@
+diff --git a/src/dsp/cost_neon.c b/src/dsp/cost_neon.c
+index e1bf365..ec08f3c 100644
+--- a/src/dsp/cost_neon.c
++++ b/src/dsp/cost_neon.c
+@@ -111,7 +111,6 @@ static int GetResidualCost_NEON(int ctx0, const VP8Residual* const res) {
+ extern void VP8EncDspCostInitNEON(void);
+ 
+ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitNEON(void) {
+-  VP8SetResidualCoeffs = SetResidualCoeffs_NEON;
+   VP8GetResidualCost = GetResidualCost_NEON;
+ }
+ 
+diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c
+index b373245..86781ff 100644
+--- a/src/dsp/enc_neon.c
++++ b/src/dsp/enc_neon.c
+@@ -732,11 +732,61 @@ static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
+   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+   for (j = start_block; j < end_block; ++j) {
+     int16_t out[16];
+-    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
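++    // The forward transform below mirrors FTransform_NEON() from this file.
++    // It is inlined so the coefficients stay in NEON registers: the store to
++    // out[] and the vld1q_s16() reloads are skipped, and the histogram pass
++    // consumes the registers directly.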
++    const uint8_t* const src = ref + VP8DspScan[j];
++    const uint8_t* const pred_ptr = pred + VP8DspScan[j];
++    int16x8_t d0d1, d3d2;
++    int16x4_t out0, out1, out2, out3;
++    {
++      const uint8x16_t S0 = Load4x4_NEON(src);
++      const uint8x16_t R0 = Load4x4_NEON(pred_ptr);
++      const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
++      const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
++      const int16x4_t D0 = vget_low_s16(D0D1);
++      const int16x4_t D1 = vget_high_s16(D0D1);
++      const int16x4_t D2 = vget_low_s16(D2D3);
++      const int16x4_t D3 = vget_high_s16(D2D3);
++      Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
++    }
++    {   // 1rst pass
++      const int32x4_t kCst937 = vdupq_n_s32(937);
++      const int32x4_t kCst1812 = vdupq_n_s32(1812);
++      const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
++      const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
++      const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
++      const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
++                                      vget_high_s16(a0a1_2));
++      const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
++                                      vget_high_s16(a0a1_2));
++      const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
++      const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
++      const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
++      const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
++      const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
++      const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
++      Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
++    }
++    {   // 2nd pass
++      // the (1<<16) addition is for the replacement: a3!=0  <->  1-(a3==0)
++      const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
++      const int32x4_t kCst51000 = vdupq_n_s32(51000);
++      const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
++      const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
++      const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
++      const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
++      const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
++      const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
++      const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
++      const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
++      const int16x4_t a3_eq_0 =
++          vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
++      // out0..out3 are assigned here and consumed by the histogram block below.
++      out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
++      out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
++      out1 = vadd_s16(tmp1, a3_eq_0);
++      out3 = vaddhn_s32(a3_m_a2, kCst51000);
++    }
+     {
+       int k;
+-      const int16x8_t a0 = vld1q_s16(out + 0);
+-      const int16x8_t b0 = vld1q_s16(out + 8);
++      const int16x8_t a0 = vcombine_s16(out0, out1);
++      const int16x8_t b0 = vcombine_s16(out2, out3);
+       const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
+       const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
+       const uint16x8_t a2 = vshrq_n_u16(a1, 3);
+diff --git a/src/enc/quant_enc.c b/src/enc/quant_enc.c
+index d1c6062..f51f4fa 100644
+--- a/src/enc/quant_enc.c
++++ b/src/enc/quant_enc.c
+@@ -10,7 +10,7 @@
+ // Quantization
+ //
+ // Author: Skal (pascal.massimino@gmail.com)
+-
++#include <arm_neon.h>
+ #include <assert.h>
+ #include <math.h>
+ #include <stdlib.h>  // for abs()
+@@ -822,6 +822,246 @@ static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
+ 
+   return nz;
+ }
++static const uint8_t kShuffles[4][8] = {
++  {  0,  1,  2,  3,  8,  9, 16, 17 },
++  { 10, 11,  4,  5,  6,  7, 12, 13 },
++  { 18, 19, 24, 25, 26, 27, 20, 21 },
++  { 14, 15, 22, 23, 28, 29, 30, 31 }
++};
++#define INIT_VECTOR4(v, a, b, c, d) do {  \
++  v.val[0] = a;  \
++  v.val[1] = b;  \
++  v.val[2] = c;  \
++  v.val[3] = d;  \
++} while (0)
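++// Quantize_NEON follows the arithmetic of QuantizeBlock_NEON in
++// src/dsp/enc_neon.c: level = min(((|coeff| + sharpen) * iQ + bias) >> QFIX,
++// MAX_LEVEL), with the sign restored afterwards.  The differences are that
++// the coefficients arrive in a register (from FTransform_NEON below) and the
++// dequantized values (level * Q) are returned through *tmp instead of being
++// written back to in[], so the inverse transform can run without touching
++// memory.  Note: quant_enc.c is built for every architecture, so this whole
++// block assumes an ARM/NEON-only build.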
++static int16x8_t Quantize_NEON(int16x8_t a, int16x8_t* tmp, int16_t* const in,
++                               const VP8Matrix* const mtx, int offset) {
++  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
++  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
++  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
++  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
++  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
++
++  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
++  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
++  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
++  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
++  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
++  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
++  const uint32x4_t m3 = vhaddq_u32(m1, bias1);    // (coeff * iQ + bias) >> 1
++  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
++                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
++  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
++  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
++  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
++  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
++  *tmp = c4;
++  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
++  return c3;
++}
++static int QuantizeBlock_NEON(const int16x8x2_t ret,
++                              int16x8_t* tmp0, int16x8_t* tmp8,
++                              int16_t in[16], int16_t out[16],
++                              const VP8Matrix* const mtx) {
++  const int16x8_t out0 = Quantize_NEON(ret.val[0], tmp0, in, mtx, 0);
++  const int16x8_t out1 = Quantize_NEON(ret.val[1], tmp8, in, mtx, 8);
++  uint8x8x4_t shuffles;
++  uint8x8x4_t all_out;
++  INIT_VECTOR4(all_out,
++               vreinterpret_u8_s16(vget_low_s16(out0)),
++               vreinterpret_u8_s16(vget_high_s16(out0)),
++               vreinterpret_u8_s16(vget_low_s16(out1)),
++               vreinterpret_u8_s16(vget_high_s16(out1)));
++  INIT_VECTOR4(shuffles,
++               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
++               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
++               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
++               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
++
++  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
++  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
++  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
++  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
++  // test zeros
++  if (*(uint64_t*)(out +  0) != 0) return 1;
++  if (*(uint64_t*)(out +  4) != 0) return 1;
++  if (*(uint64_t*)(out +  8) != 0) return 1;
++  if (*(uint64_t*)(out + 12) != 0) return 1;
++  return 0;
++}
++static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
++                                              const int16x4_t B,
++                                              const int16x4_t C,
++                                              const int16x4_t D,
++                                              int16x8_t* const out01,
++                                              int16x8_t* const out32) {
++  const int16x4x2_t AB = vtrn_s16(A, B);
++  const int16x4x2_t CD = vtrn_s16(C, D);
++  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
++                                     vreinterpret_s32_s16(CD.val[0]));
++  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
++                                     vreinterpret_s32_s16(CD.val[1]));
++  *out01 = vreinterpretq_s16_s64(
++      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
++                   vreinterpret_s64_s32(tmp13.val[0])));
++  *out32 = vreinterpretq_s16_s64(
++      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
++                   vreinterpret_s64_s32(tmp02.val[1])));
++}
++static uint8x16_t Load4x4_NEON(const uint8_t* src) {
++  uint32x4_t out = vdupq_n_u32(0);
++  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
++  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
++  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
++  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
++  return vreinterpretq_u8_u32(out);
++}
++static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
++                                              const uint8x8_t b) {
++  return vreinterpretq_s16_u16(vsubl_u8(a, b));
++}
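++// Forward 4x4 transform, same arithmetic as FTransform_NEON in
++// src/dsp/enc_neon.c, except that the two coefficient rows are returned as
++// an int16x8x2_t instead of being stored to out[]; ReconstructIntra4 feeds
++// them straight into QuantizeBlock_NEON.  The 'out' parameter is unused and
++// kept only to preserve the original call shape.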
++static int16x8x2_t FTransform_NEON(const uint8_t* src,
++                                   const uint8_t* ref,
++                                   int16_t* out) {
++  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
++  {
++    const uint8x16_t S0 = Load4x4_NEON(src);
++    const uint8x16_t R0 = Load4x4_NEON(ref);
++    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
++    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
++    const int16x4_t D0 = vget_low_s16(D0D1);
++    const int16x4_t D1 = vget_high_s16(D0D1);
++    const int16x4_t D2 = vget_low_s16(D2D3);
++    const int16x4_t D3 = vget_high_s16(D2D3);
++    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
++  }
++  {   // 1rst pass
++    const int32x4_t kCst937 = vdupq_n_s32(937);
++    const int32x4_t kCst1812 = vdupq_n_s32(1812);
++    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
++    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
++    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
++    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
++                                    vget_high_s16(a0a1_2));
++    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
++                                    vget_high_s16(a0a1_2));
++    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
++    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
++    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
++    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
++    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
++    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
++    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
++  }
++  {   // 2nd pass
++    // the (1<<16) addition is for the replacement: a3!=0  <->  1-(a3==0)
++    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
++    const int32x4_t kCst51000 = vdupq_n_s32(51000);
++    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
++    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
++    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
++    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
++    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
++    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
++    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
++    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
++    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
++    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
++    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
++    const int16x4_t a3_eq_0 =
++        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
++    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
++    int16x8x2_t ret;
++    ret.val[0] = vcombine_s16(out0, out1);
++    ret.val[1] = vcombine_s16(out2, out3);
++    return ret;
++  }
++}
++static const int16_t kC1 = 20091;
++static const int16_t kC2 = 17734;
++#define INIT_VECTOR2(v, a, b) do {  \
++  v.val[0] = a;  \
++  v.val[1] = b;  \
++} while (0)
++static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
++                                          const int16x8_t in1,
++                                          int16x8x2_t* const out) {
++  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
++  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
++  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
++                                                  // b0 d0 b1 d1 b2 d2 ...
++  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
++}
++static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
++  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
++}
++static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
++  // {rows} = in0 | in4
++  //          in8 | in12
++  // B1 = in4 | in12
++  const int16x8_t B1 =
++      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
++  // C0 = kC1 * in4 | kC1 * in12
++  // C1 = kC2 * in4 | kC2 * in12
++  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
++  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
++  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
++                                vget_low_s16(rows->val[1]));   // in0 + in8
++  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
++                                vget_low_s16(rows->val[1]));   // in0 - in8
++  // c = kC2 * in4 - kC1 * in12
++  // d = kC1 * in4 + kC2 * in12
++  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
++  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
++  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
++  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
++  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
++  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
++  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
++  Transpose8x2_NEON(E0, E1, rows);
++}
++static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
++                                                 const int16x8_t dst01,
++                                                 const int16x8_t dst23) {
++  // Unsigned saturate to 8b.
++  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
++  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
++
++  // Store the results.
++  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
++  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
++  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
++  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
++}
++static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
++                                    const int16x8_t row23,
++                                    const uint8_t* const ref,
++                                    uint8_t* const dst) {
++  uint32x2_t dst01 = vdup_n_u32(0);
++  uint32x2_t dst23 = vdup_n_u32(0);
++
++  // Load the source pixels.
++  dst01 = vld1_lane_u32((const uint32_t*)(ref + 0 * BPS), dst01, 0);
++  dst23 = vld1_lane_u32((const uint32_t*)(ref + 2 * BPS), dst23, 0);
++  dst01 = vld1_lane_u32((const uint32_t*)(ref + 1 * BPS), dst01, 1);
++  dst23 = vld1_lane_u32((const uint32_t*)(ref + 3 * BPS), dst23, 1);
++
++  {
++    // Convert to 16b.
++    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
++    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
++
++    // Descale with rounding.
++    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
++    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
++    // Add the inverse transform.
++    SaturateAndStore4x4_NEON(dst, out01, out23);
++  }
++}
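++// Inverse transform + reconstruction, following ITransformOne_NEON in
++// src/dsp/enc_neon.c, except that the coefficient rows arrive in registers
++// (already dequantized by Quantize_NEON) instead of being reloaded from
++// in[]; 'in' is kept only to preserve the original call shape.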
++static void ITransformOne_NEON(int16x8x2_t rows, uint8_t* ref,
++                               const int16_t* in, uint8_t* dst) {
++  TransformPass_NEON(&rows);
++  TransformPass_NEON(&rows);
++  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
++}
+ 
+ static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+                              int16_t levels[16],
+@@ -834,16 +1074,20 @@ static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+   int nz = 0;
+   int16_t tmp[16];
+ 
+-  VP8FTransform(src, ref, tmp);
++  int16x8x2_t ret = FTransform_NEON(src, ref, tmp);
++  int16x8x2_t rows;
+   if (DO_TRELLIS_I4 && it->do_trellis_) {
+     const int x = it->i4_ & 3, y = it->i4_ >> 2;
+     const int ctx = it->top_nz_[x] + it->left_nz_[y];
++    // The trellis path still works on the coefficient array: spill the
++    // transform output to tmp[], then reload the dequantized coefficients.
++    vst1q_s16(tmp + 0, ret.val[0]);
++    vst1q_s16(tmp + 8, ret.val[1]);
+     nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
+                               dqm->lambda_trellis_i4_);
++    rows.val[0] = vld1q_s16(tmp + 0);
++    rows.val[1] = vld1q_s16(tmp + 8);
+   } else {
+-    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
++    int16x8_t tmp0, tmp8;
++    nz = QuantizeBlock_NEON(ret, &tmp0, &tmp8, tmp, levels, &dqm->y1_);
++    rows.val[0] = tmp0;
++    rows.val[1] = tmp8;
+   }
+-  VP8ITransform(ref, tmp, yuv_out, 0);
++  ITransformOne_NEON(rows, ref, tmp, yuv_out);
+   return nz;
+ }
+ 
+@@ -1070,7 +1314,75 @@ static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
+   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
+   return VP8FixedCostsI4[top][left];
+ }
+-
++#define LOAD_LANE_32b(src, VALUE, LANE) \
++    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
++static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
++  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
++  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
++  int16x4x4_t d4_w;
++  INIT_VECTOR4(d4_w,
++               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
++               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
++               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
++               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
++  return d4_w;
++}
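++// Weighted sum of the absolute Hadamard-transform coefficients of a single
++// 4x4 block (the per-block half of what VP8TDisto4x4 computes for a pair).
++// PickBestIntra4 computes the source block's sum once and only recomputes
++// the reconstruction's sum per candidate mode; the distortion term is then
++// abs(sum_tmp - sum_src) >> 5, as in the scalar Disto4x4.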
++static int Disto4x4_NEON_CSP(const uint8_t* const a, const int16x4x4_t d4_w) {
++  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
++  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
++  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
++  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
++  uint8x8x4_t d4_in;
++  int16x4_t q_a0, q_a1, q_a2, q_a3;
++  int16x4x4_t q4_out_v, q4_out_t, q4_out_h;
++  int32x4_t q_sum = vdupq_n_s32(0);
++  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
++  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
++  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
++  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
++  INIT_VECTOR4(d4_in,
++               vreinterpret_u8_u32(d_in_ab_0123),
++               vreinterpret_u8_u32(d_in_ab_4567),
++               vreinterpret_u8_u32(d_in_ab_89ab),
++               vreinterpret_u8_u32(d_in_ab_cdef));
++  // vertical pass
++  q_a0 = vreinterpret_s16_u16(
++      vget_low_u16(vaddl_u8(d4_in.val[0], d4_in.val[2])));
++  q_a1 = vreinterpret_s16_u16(
++      vget_low_u16(vaddl_u8(d4_in.val[1], d4_in.val[3])));
++  q_a2 = vreinterpret_s16_u16(
++      vget_low_u16(vsubl_u8(d4_in.val[1], d4_in.val[3])));
++  q_a3 = vreinterpret_s16_u16(
++      vget_low_u16(vsubl_u8(d4_in.val[0], d4_in.val[2])));
++  INIT_VECTOR4(q4_out_v,
++               vadd_s16(q_a0, q_a1), vadd_s16(q_a3, q_a2),
++               vsub_s16(q_a3, q_a2), vsub_s16(q_a0, q_a1));
++  // transpose
++  {
++    const int16x4x2_t q2_tmp0 = vtrn_s16(q4_out_v.val[0], q4_out_v.val[1]);
++    const int16x4x2_t q2_tmp1 = vtrn_s16(q4_out_v.val[2], q4_out_v.val[3]);
++    const int32x2x2_t q2_tmp2 = vtrn_s32(vreinterpret_s32_s16(q2_tmp0.val[0]),
++                                         vreinterpret_s32_s16(q2_tmp1.val[0]));
++    const int32x2x2_t q2_tmp3 = vtrn_s32(vreinterpret_s32_s16(q2_tmp0.val[1]),
++                                         vreinterpret_s32_s16(q2_tmp1.val[1]));
++    INIT_VECTOR4(q4_out_t,
++                 vreinterpret_s16_s32(q2_tmp2.val[0]),
++                 vreinterpret_s16_s32(q2_tmp3.val[0]),
++                 vreinterpret_s16_s32(q2_tmp2.val[1]),
++                 vreinterpret_s16_s32(q2_tmp3.val[1]));
++  }
++  // horizontal pass + absolute values
++  q_a0 = vadd_s16(q4_out_t.val[0], q4_out_t.val[2]);
++  q_a1 = vadd_s16(q4_out_t.val[1], q4_out_t.val[3]);
++  q_a3 = vsub_s16(q4_out_t.val[0], q4_out_t.val[2]);
++  q_a2 = vsub_s16(q4_out_t.val[1], q4_out_t.val[3]);
++  INIT_VECTOR4(q4_out_h,
++               vabs_s16(vadd_s16(q_a0, q_a1)), vabs_s16(vadd_s16(q_a3, q_a2)),
++               vabd_s16(q_a3, q_a2), vabd_s16(q_a0, q_a1));
++  // weighted sum
++  q_sum = vmlal_s16(q_sum, d4_w.val[0], q4_out_h.val[0]);
++  q_sum = vmlal_s16(q_sum, d4_w.val[1], q4_out_h.val[1]);
++  q_sum = vmlal_s16(q_sum, d4_w.val[2], q4_out_h.val[2]);
++  q_sum = vmlal_s16(q_sum, d4_w.val[3], q4_out_h.val[3]);
++  return vaddvq_s32(q_sum);   // AArch64-only reduction
++}
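++// Sum of squared errors of a 4x4 block, as in VP8SSE4x4, but the source
++// block is passed in already loaded (src4x4 in PickBestIntra4 below) so the
++// source load happens once per block rather than once per candidate mode.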
++static int SSE4x4_NEON(const uint8x16_t a0, const uint8_t* b) {
++  const uint8x16_t b0 = Load4x4_NEON(b);
++  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
++  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
++                                    vget_low_u8(abs_diff));
++  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
++                                    vget_high_u8(abs_diff));
++  // pair-wise adds and widen
++  const uint32x4_t sum1 = vpaddlq_u16(prod1);
++  const uint32x4_t sum2 = vpaddlq_u16(prod2);
++  return (int)vaddlvq_u32(vaddq_u32(sum1, sum2));   // AArch64-only reduction
++}
+ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+                           VP8ModeScore* WEBP_RESTRICT const rd) {
+   const VP8Encoder* const enc = it->enc_;
+@@ -1090,12 +1402,15 @@ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+   rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
+   SetRDScore(dqm->lambda_mode_, &rd_best);
+   VP8IteratorStartI4(it);
++  const int16x4x4_t d4_w = DistoLoadW_NEON(kWeightY);
+   do {
+     const int kNumBlocks = 1;
+     VP8ModeScore rd_i4;
+     int mode;
+     int best_mode = -1;
+     const uint8_t* const src = src0 + VP8Scan[it->i4_];
++    const uint8x16_t src4x4 = Load4x4_NEON(src);
++    const int sum_src = Disto4x4_NEON_CSP(src, d4_w);
+     const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
+     uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
+     uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
+@@ -1109,11 +1424,12 @@ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+       // Reconstruct
+       rd_tmp.nz =
+           ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
++      const int sum_tmp = Disto4x4_NEON_CSP(tmp_dst, d4_w);
++      const int sum = abs(sum_tmp - sum_src) >> 5;
+ 
+       // Compute RD-score
+-      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
+       rd_tmp.SD =
+-          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
++          tlambda ? MULT_8B(tlambda, sum)
+                   : 0;
+       rd_tmp.H = mode_costs[mode];
+ 
+@@ -1124,7 +1440,7 @@ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+       } else {
+         rd_tmp.R = 0;
+       }
+-
++      rd_tmp.D = SSE4x4_NEON(src4x4, tmp_dst);
+       // early-out check
+       SetRDScore(lambda, &rd_tmp);
+       if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;