diff --git a/webp-opt.patch b/webp-opt.patch
new file mode 100644
index 0000000000000000000000000000000000000000..a006d3e2149ec9d29ce4e9e087a0ef547b19af11
--- /dev/null
+++ b/webp-opt.patch
@@ -0,0 +1,482 @@
+diff --git a/src/dsp/cost_neon.c b/src/dsp/cost_neon.c
+index e1bf365..ec08f3c 100644
+--- a/src/dsp/cost_neon.c
++++ b/src/dsp/cost_neon.c
+@@ -111,7 +111,6 @@ static int GetResidualCost_NEON(int ctx0, const VP8Residual* const res) {
+ extern void VP8EncDspCostInitNEON(void);
+ 
+ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitNEON(void) {
+-  VP8SetResidualCoeffs = SetResidualCoeffs_NEON;
+   VP8GetResidualCost = GetResidualCost_NEON;
+ }
+ 
+diff --git a/src/dsp/enc_neon.c b/src/dsp/enc_neon.c
+index b373245..86781ff 100644
+--- a/src/dsp/enc_neon.c
++++ b/src/dsp/enc_neon.c
+@@ -732,11 +732,61 @@ static void CollectHistogram_NEON(const uint8_t* WEBP_RESTRICT ref,
+   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+   for (j = start_block; j < end_block; ++j) {
+     int16_t out[16];
+-    FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
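++    // The forward transform below mirrors FTransform_NEON() from this file.
++    // It is inlined so the coefficients stay in NEON registers: the store to
++    // out[] and the vld1q_s16() reloads are skipped, and the histogram pass
++    // consumes the registers directly.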
++    const uint8_t* const src = ref + VP8DspScan[j];
++    const uint8_t* const pred_ptr = pred + VP8DspScan[j];
++    int16x8_t d0d1, d3d2;
++    int16x4_t out0, out1, out2, out3;
++    {
++      const uint8x16_t S0 = Load4x4_NEON(src);
++      const uint8x16_t R0 = Load4x4_NEON(pred_ptr);
++      const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
++      const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
++      const int16x4_t D0 = vget_low_s16(D0D1);
++      const int16x4_t D1 = vget_high_s16(D0D1);
++      const int16x4_t D2 = vget_low_s16(D2D3);
++      const int16x4_t D3 = vget_high_s16(D2D3);
++      Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
++    }
++    {   // 1rst pass
++      const int32x4_t kCst937 = vdupq_n_s32(937);
++      const int32x4_t kCst1812 = vdupq_n_s32(1812);
++      const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
++      const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
++      const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
++      const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
++                                      vget_high_s16(a0a1_2));
++      const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
++                                      vget_high_s16(a0a1_2));
++      const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
++      const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
++      const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
++      const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
++      const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
++      const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
++      Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
++    }
++    {   // 2nd pass
++      // the (1<<16) addition is for the replacement: a3!=0  <->  1-(a3==0)
++      const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
++      const int32x4_t kCst51000 = vdupq_n_s32(51000);
++      const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
++      const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
++      const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
++      const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
++      const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
++      const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
++      const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
++      const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
++      const int16x4_t a3_eq_0 =
++          vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
++      // out0..out3 are assigned here and consumed by the histogram block below.
++      out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
++      out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
++      out1 = vadd_s16(tmp1, a3_eq_0);
++      out3 = vaddhn_s32(a3_m_a2, kCst51000);
++    }
+     {
+       int k;
+-      const int16x8_t a0 = vld1q_s16(out + 0);
+-      const int16x8_t b0 = vld1q_s16(out + 8);
++      const int16x8_t a0 = vcombine_s16(out0, out1);
++      const int16x8_t b0 = vcombine_s16(out2, out3);
+       const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
+       const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
+       const uint16x8_t a2 = vshrq_n_u16(a1, 3);
+diff --git a/src/enc/quant_enc.c b/src/enc/quant_enc.c
+index d1c6062..f51f4fa 100644
+--- a/src/enc/quant_enc.c
++++ b/src/enc/quant_enc.c
+@@ -10,7 +10,7 @@
+ // Quantization
+ //
+ // Author: Skal (pascal.massimino@gmail.com)
+-
++#include <arm_neon.h>
+ #include <assert.h>
+ #include <math.h>
+ #include <stdlib.h>  // for abs()
+@@ -822,6 +822,246 @@ static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
+ 
+   return nz;
+ }
++static const uint8_t kShuffles[4][8] = {
++  {  0,  1,  2,  3,  8,  9, 16, 17 },
++  { 10, 11,  4,  5,  6,  7, 12, 13 },
++  { 18, 19, 24, 25, 26, 27, 20, 21 },
++  { 14, 15, 22, 23, 28, 29, 30, 31 }
++};
++#define INIT_VECTOR4(v, a, b, c, d) do {  \
++  v.val[0] = a;  \
++  v.val[1] = b;  \
++  v.val[2] = c;  \
++  v.val[3] = d;  \
++} while (0)
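++// Quantize_NEON follows the arithmetic of QuantizeBlock_NEON in
++// src/dsp/enc_neon.c: level = min(((|coeff| + sharpen) * iQ + bias) >> QFIX,
++// MAX_LEVEL), with the sign restored afterwards.  The differences are that
++// the coefficients arrive in a register (from FTransform_NEON below) and the
++// dequantized values (level * Q) are returned through *tmp instead of being
++// written back to in[], so the inverse transform can run without touching
++// memory.  Note: quant_enc.c is built for every architecture, so this whole
++// block assumes an ARM/NEON-only build.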
++static int16x8_t Quantize_NEON(int16x8_t a, int16x8_t* tmp, int16_t* const in,
++                               const VP8Matrix* const mtx, int offset) {
++  const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
++  const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
++  const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
++  const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
++  const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
++
++  const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
++  const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
++  const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
++  const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
++  const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
++  const uint32x4_t m2 = vhaddq_u32(m0, bias0);
++  const uint32x4_t m3 = vhaddq_u32(m1, bias1);    // (coeff * iQ + bias) >> 1
++  const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
++                                     vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
++  const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
++  const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
++  const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
++  const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
++  *tmp = c4;
++  assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
++  return c3;
++}
++static int QuantizeBlock_NEON(const int16x8x2_t ret,
++                              int16x8_t* tmp0, int16x8_t* tmp8,
++                              int16_t in[16], int16_t out[16],
++                              const VP8Matrix* const mtx) {
++  const int16x8_t out0 = Quantize_NEON(ret.val[0], tmp0, in, mtx, 0);
++  const int16x8_t out1 = Quantize_NEON(ret.val[1], tmp8, in, mtx, 8);
++  uint8x8x4_t shuffles;
++  uint8x8x4_t all_out;
++  INIT_VECTOR4(all_out,
++               vreinterpret_u8_s16(vget_low_s16(out0)),
++               vreinterpret_u8_s16(vget_high_s16(out0)),
++               vreinterpret_u8_s16(vget_low_s16(out1)),
++               vreinterpret_u8_s16(vget_high_s16(out1)));
++  INIT_VECTOR4(shuffles,
++               vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
++               vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
++               vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
++               vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
++
++  vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
++  vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
++  vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
++  vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
++  // test zeros
++  if (*(uint64_t*)(out +  0) != 0) return 1;
++  if (*(uint64_t*)(out +  4) != 0) return 1;
++  if (*(uint64_t*)(out +  8) != 0) return 1;
++  if (*(uint64_t*)(out + 12) != 0) return 1;
++  return 0;
++}
++static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
++                                              const int16x4_t B,
++                                              const int16x4_t C,
++                                              const int16x4_t D,
++                                              int16x8_t* const out01,
++                                              int16x8_t* const out32) {
++  const int16x4x2_t AB = vtrn_s16(A, B);
++  const int16x4x2_t CD = vtrn_s16(C, D);
++  const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
++                                     vreinterpret_s32_s16(CD.val[0]));
++  const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
++                                     vreinterpret_s32_s16(CD.val[1]));
++  *out01 = vreinterpretq_s16_s64(
++      vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
++                   vreinterpret_s64_s32(tmp13.val[0])));
++  *out32 = vreinterpretq_s16_s64(
++      vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
++                   vreinterpret_s64_s32(tmp02.val[1])));
++}
++static uint8x16_t Load4x4_NEON(const uint8_t* src) {
++  uint32x4_t out = vdupq_n_u32(0);
++  out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
++  out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
++  out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
++  out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
++  return vreinterpretq_u8_u32(out);
++}
++static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
++                                              const uint8x8_t b) {
++  return vreinterpretq_s16_u16(vsubl_u8(a, b));
++}
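++// Forward 4x4 transform, same arithmetic as FTransform_NEON in
++// src/dsp/enc_neon.c, except that the two coefficient rows are returned as
++// an int16x8x2_t instead of being stored to out[]; ReconstructIntra4 feeds
++// them straight into QuantizeBlock_NEON.  The 'out' parameter is unused and
++// kept only to preserve the original call shape.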
++static int16x8x2_t FTransform_NEON(const uint8_t* src,
++                                   const uint8_t* ref,
++                                   int16_t* out) {
++  int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
++  {
++    const uint8x16_t S0 = Load4x4_NEON(src);
++    const uint8x16_t R0 = Load4x4_NEON(ref);
++    const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
++    const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
++    const int16x4_t D0 = vget_low_s16(D0D1);
++    const int16x4_t D1 = vget_high_s16(D0D1);
++    const int16x4_t D2 = vget_low_s16(D2D3);
++    const int16x4_t D3 = vget_high_s16(D2D3);
++    Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
++  }
++  {   // 1rst pass
++    const int32x4_t kCst937 = vdupq_n_s32(937);
++    const int32x4_t kCst1812 = vdupq_n_s32(1812);
++    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
++    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
++    const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
++    const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
++                                    vget_high_s16(a0a1_2));
++    const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
++                                    vget_high_s16(a0a1_2));
++    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
++    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
++    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
++    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
++    const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
++    const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
++    Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
++  }
++  {   // 2nd pass
++    // the (1<<16) addition is for the replacement: a3!=0  <->  1-(a3==0)
++    const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
++    const int32x4_t kCst51000 = vdupq_n_s32(51000);
++    const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
++    const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
++    const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
++    const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
++    const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
++    const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
++    const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
++    const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
++    const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
++    const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
++    const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
++    const int16x4_t a3_eq_0 =
++        vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
++    const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
++    int16x8x2_t ret;
++    ret.val[0] = vcombine_s16(out0, out1);
++    ret.val[1] = vcombine_s16(out2, out3);
++    return ret;
++  }
++}
++static const int16_t kC1 = 20091;
++static const int16_t kC2 = 17734;
++#define INIT_VECTOR2(v, a, b) do {  \
++  v.val[0] = a;  \
++  v.val[1] = b;  \
++} while (0)
++static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
++                                          const int16x8_t in1,
++                                          int16x8x2_t* const out) {
++  // a0 a1 a2 a3 | b0 b1 b2 b3   =>   a0 b0 c0 d0 | a1 b1 c1 d1
++  // c0 c1 c2 c3 | d0 d1 d2 d3        a2 b2 c2 d2 | a3 b3 c3 d3
++  const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
++                                                  // b0 d0 b1 d1 b2 d2 ...
++  *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
++}
++static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
++  return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
++}
++static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
++  // {rows} = in0 | in4
++  //          in8 | in12
++  // B1 = in4 | in12
++  const int16x8_t B1 =
++      vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
++  // C0 = kC1 * in4 | kC1 * in12
++  // C1 = kC2 * in4 | kC2 * in12
++  const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
++  const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
++  const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
++                                vget_low_s16(rows->val[1]));   // in0 + in8
++  const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
++                                vget_low_s16(rows->val[1]));   // in0 - in8
++  // c = kC2 * in4 - kC1 * in12
++  // d = kC1 * in4 + kC2 * in12
++  const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
++  const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
++  const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
++  const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
++  const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
++  const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
++  const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
++  Transpose8x2_NEON(E0, E1, rows);
++}
++static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
++                                                 const int16x8_t dst01,
++                                                 const int16x8_t dst23) {
++  // Unsigned saturate to 8b.
++  const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
++  const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
++
++  // Store the results.
++  vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
++  vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
++  vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
++  vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
++}
++static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
++                                    const int16x8_t row23,
++                                    const uint8_t* const ref,
++                                    uint8_t* const dst) {
++  uint32x2_t dst01 = vdup_n_u32(0);
++  uint32x2_t dst23 = vdup_n_u32(0);
++
++  // Load the source pixels.
++  dst01 = vld1_lane_u32((const uint32_t*)(ref + 0 * BPS), dst01, 0);
++  dst23 = vld1_lane_u32((const uint32_t*)(ref + 2 * BPS), dst23, 0);
++  dst01 = vld1_lane_u32((const uint32_t*)(ref + 1 * BPS), dst01, 1);
++  dst23 = vld1_lane_u32((const uint32_t*)(ref + 3 * BPS), dst23, 1);
++
++  {
++    // Convert to 16b.
++    const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
++    const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
++
++    // Descale with rounding.
++    const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
++    const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
++    // Add the inverse transform.
++    SaturateAndStore4x4_NEON(dst, out01, out23);
++  }
++}
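++// Inverse transform + reconstruction, following ITransformOne_NEON in
++// src/dsp/enc_neon.c, except that the coefficient rows arrive in registers
++// (already dequantized by Quantize_NEON) instead of being reloaded from
++// in[]; 'in' is kept only to preserve the original call shape.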
++static void ITransformOne_NEON(int16x8x2_t rows, uint8_t* ref,
++                               const int16_t* in, uint8_t* dst) {
++  TransformPass_NEON(&rows);
++  TransformPass_NEON(&rows);
++  Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
++}
+ 
+ static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+                              int16_t levels[16],
+@@ -834,16 +1074,20 @@ static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+   int nz = 0;
+   int16_t tmp[16];
+ 
+-  VP8FTransform(src, ref, tmp);
++  int16x8x2_t ret = FTransform_NEON(src, ref, tmp);
++  int16x8x2_t rows;
+   if (DO_TRELLIS_I4 && it->do_trellis_) {
+     const int x = it->i4_ & 3, y = it->i4_ >> 2;
+     const int ctx = it->top_nz_[x] + it->left_nz_[y];
++    // The trellis path still works on the coefficient array: spill the
++    // transform output to tmp[], then reload the dequantized coefficients.
++    vst1q_s16(tmp + 0, ret.val[0]);
++    vst1q_s16(tmp + 8, ret.val[1]);
+     nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
+                               dqm->lambda_trellis_i4_);
++    rows.val[0] = vld1q_s16(tmp + 0);
++    rows.val[1] = vld1q_s16(tmp + 8);
+   } else {
+-    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
++    int16x8_t tmp0, tmp8;
++    nz = QuantizeBlock_NEON(ret, &tmp0, &tmp8, tmp, levels, &dqm->y1_);
++    rows.val[0] = tmp0;
++    rows.val[1] = tmp8;
+   }
+-  VP8ITransform(ref, tmp, yuv_out, 0);
++  ITransformOne_NEON(rows, ref, tmp, yuv_out);
+   return nz;
+ }
+ 
+@@ -1070,7 +1314,75 @@ static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
+   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
+   return VP8FixedCostsI4[top][left];
+ }
+-
++#define LOAD_LANE_32b(src, VALUE, LANE) \
++    (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
++static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
++  const uint16x8_t q_w07 = vld1q_u16(&w[0]);
++  const uint16x8_t q_w8f = vld1q_u16(&w[8]);
++  int16x4x4_t d4_w;
++  INIT_VECTOR4(d4_w,
++               vget_low_s16(vreinterpretq_s16_u16(q_w07)),
++               vget_high_s16(vreinterpretq_s16_u16(q_w07)),
++               vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
++               vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
++  return d4_w;
++}
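++// Weighted sum of the absolute Hadamard-transform coefficients of a single
++// 4x4 block (the per-block half of what VP8TDisto4x4 computes for a pair).
++// PickBestIntra4 computes the source block's sum once and only recomputes
++// the reconstruction's sum per candidate mode; the distortion term is then
++// abs(sum_tmp - sum_src) >> 5, as in the scalar Disto4x4.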
++static int Disto4x4_NEON_CSP(const uint8_t* const a, const int16x4x4_t d4_w) {
++  uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
++  uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
++  uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
++  uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
++  uint8x8x4_t d4_in;
++  int16x4_t q_a0, q_a1, q_a2, q_a3;
++  int16x4x4_t q4_out_v, q4_out_t, q4_out_h;
++  int32x4_t q_sum = vdupq_n_s32(0);
++  LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
++  LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
++  LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
++  LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
++  INIT_VECTOR4(d4_in,
++               vreinterpret_u8_u32(d_in_ab_0123),
++               vreinterpret_u8_u32(d_in_ab_4567),
++               vreinterpret_u8_u32(d_in_ab_89ab),
++               vreinterpret_u8_u32(d_in_ab_cdef));
++  // vertical pass
++  q_a0 = vreinterpret_s16_u16(
++      vget_low_u16(vaddl_u8(d4_in.val[0], d4_in.val[2])));
++  q_a1 = vreinterpret_s16_u16(
++      vget_low_u16(vaddl_u8(d4_in.val[1], d4_in.val[3])));
++  q_a2 = vreinterpret_s16_u16(
++      vget_low_u16(vsubl_u8(d4_in.val[1], d4_in.val[3])));
++  q_a3 = vreinterpret_s16_u16(
++      vget_low_u16(vsubl_u8(d4_in.val[0], d4_in.val[2])));
++  INIT_VECTOR4(q4_out_v,
++               vadd_s16(q_a0, q_a1), vadd_s16(q_a3, q_a2),
++               vsub_s16(q_a3, q_a2), vsub_s16(q_a0, q_a1));
++  // transpose
++  {
++    const int16x4x2_t q2_tmp0 = vtrn_s16(q4_out_v.val[0], q4_out_v.val[1]);
++    const int16x4x2_t q2_tmp1 = vtrn_s16(q4_out_v.val[2], q4_out_v.val[3]);
++    const int32x2x2_t q2_tmp2 = vtrn_s32(vreinterpret_s32_s16(q2_tmp0.val[0]),
++                                         vreinterpret_s32_s16(q2_tmp1.val[0]));
++    const int32x2x2_t q2_tmp3 = vtrn_s32(vreinterpret_s32_s16(q2_tmp0.val[1]),
++                                         vreinterpret_s32_s16(q2_tmp1.val[1]));
++    INIT_VECTOR4(q4_out_t,
++                 vreinterpret_s16_s32(q2_tmp2.val[0]),
++                 vreinterpret_s16_s32(q2_tmp3.val[0]),
++                 vreinterpret_s16_s32(q2_tmp2.val[1]),
++                 vreinterpret_s16_s32(q2_tmp3.val[1]));
++  }
++  // horizontal pass + absolute values
++  q_a0 = vadd_s16(q4_out_t.val[0], q4_out_t.val[2]);
++  q_a1 = vadd_s16(q4_out_t.val[1], q4_out_t.val[3]);
++  q_a3 = vsub_s16(q4_out_t.val[0], q4_out_t.val[2]);
++  q_a2 = vsub_s16(q4_out_t.val[1], q4_out_t.val[3]);
++  INIT_VECTOR4(q4_out_h,
++               vabs_s16(vadd_s16(q_a0, q_a1)), vabs_s16(vadd_s16(q_a3, q_a2)),
++               vabd_s16(q_a3, q_a2), vabd_s16(q_a0, q_a1));
++  // weighted sum
++  q_sum = vmlal_s16(q_sum, d4_w.val[0], q4_out_h.val[0]);
++  q_sum = vmlal_s16(q_sum, d4_w.val[1], q4_out_h.val[1]);
++  q_sum = vmlal_s16(q_sum, d4_w.val[2], q4_out_h.val[2]);
++  q_sum = vmlal_s16(q_sum, d4_w.val[3], q4_out_h.val[3]);
++  return vaddvq_s32(q_sum);   // AArch64-only reduction
++}
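++// Sum of squared errors of a 4x4 block, as in VP8SSE4x4, but the source
++// block is passed in already loaded (src4x4 in PickBestIntra4 below) so the
++// source load happens once per block rather than once per candidate mode.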
++static int SSE4x4_NEON(const uint8x16_t a0, const uint8_t* b) {
++  const uint8x16_t b0 = Load4x4_NEON(b);
++  const uint8x16_t abs_diff = vabdq_u8(a0, b0);
++  const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
++                                    vget_low_u8(abs_diff));
++  const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
++                                    vget_high_u8(abs_diff));
++  // pair-wise adds and widen
++  const uint32x4_t sum1 = vpaddlq_u16(prod1);
++  const uint32x4_t sum2 = vpaddlq_u16(prod2);
++  return (int)vaddlvq_u32(vaddq_u32(sum1, sum2));   // AArch64-only reduction
++}
+ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+                           VP8ModeScore* WEBP_RESTRICT const rd) {
+   const VP8Encoder* const enc = it->enc_;
+@@ -1090,12 +1402,15 @@ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+   rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
+   SetRDScore(dqm->lambda_mode_, &rd_best);
+   VP8IteratorStartI4(it);
++  const int16x4x4_t d4_w = DistoLoadW_NEON(kWeightY);
+   do {
+     const int kNumBlocks = 1;
+     VP8ModeScore rd_i4;
+     int mode;
+     int best_mode = -1;
+     const uint8_t* const src = src0 + VP8Scan[it->i4_];
++    const uint8x16_t src4x4 = Load4x4_NEON(src);
++    const int sum_src = Disto4x4_NEON_CSP(src, d4_w);
+     const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
+     uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
+     uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
+@@ -1109,11 +1424,12 @@ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+       // Reconstruct
+       rd_tmp.nz =
+           ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
++      const int sum_tmp = Disto4x4_NEON_CSP(tmp_dst, d4_w);
++      const int sum = abs(sum_tmp - sum_src) >> 5;
+ 
+       // Compute RD-score
+-      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
+       rd_tmp.SD =
+-          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
++          tlambda ? MULT_8B(tlambda, sum)
+                   : 0;
+       rd_tmp.H = mode_costs[mode];
+ 
+@@ -1124,7 +1440,7 @@ static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
+       } else {
+         rd_tmp.R = 0;
+       }
+-
++      rd_tmp.D = SSE4x4_NEON(src4x4, tmp_dst);
+       // early-out check
+       SetRDScore(lambda, &rd_tmp);
+       if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;