From 3b4657b4d5d4dc8a6d0194df22e66294caff1752 Mon Sep 17 00:00:00 2001 From: "xubinbin123.xbb" Date: Tue, 30 Aug 2022 18:51:07 +0800 Subject: [PATCH] =?UTF-8?q?Added=20chunkcopy=5Fneon=20optimization,=20whic?= =?UTF-8?q?h=20can=20improve=20decompression=20performance=20by=2012.5%=20?= =?UTF-8?q?=E2=80=8B=E2=80=8Bunder=20lzbench.=20This=20optimization=20meth?= =?UTF-8?q?od=20is=20transplanted=20from=20zlib-ng,=20upstream:=20https://?= =?UTF-8?q?github.com/zlib-ng/zlib-ng/blob/develop/arch/arm=20/chunkset=5F?= =?UTF-8?q?neon.c?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...anolis-Neon-Optimized-chunkcopy_neon.patch | 615 ++++++++++++++++++ zlib.spec | 8 +- 2 files changed, 622 insertions(+), 1 deletion(-) create mode 100644 1009-zlib-anolis-Neon-Optimized-chunkcopy_neon.patch diff --git a/1009-zlib-anolis-Neon-Optimized-chunkcopy_neon.patch b/1009-zlib-anolis-Neon-Optimized-chunkcopy_neon.patch new file mode 100644 index 0000000..fd4051e --- /dev/null +++ b/1009-zlib-anolis-Neon-Optimized-chunkcopy_neon.patch @@ -0,0 +1,615 @@ +diff -Nru zlib-1.2.11/contrib/arm/arm_chunk_copy_neon.h ../zlib-1.2.11/contrib/arm/arm_chunk_copy_neon.h +--- zlib-1.2.11/contrib/arm/arm_chunk_copy_neon.h 1970-01-01 08:00:00.000000000 +0800 ++++ ../zlib-1.2.11/contrib/arm/arm_chunk_copy_neon.h 2022-08-30 17:29:33.032693593 +0800 +@@ -0,0 +1,311 @@ ++#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) ++ ++#define ENABLE_ARM_CHUNK_NEON ++#define INFLATE_FAST_MIN_HAVE 6 ++#define INFLATE_FAST_MIN_LEFT 258 ++ ++#include ++#include ++ ++typedef uint8x16_t chunk_t; ++ ++#define CHUNK_SIZE 16 ++ ++#define HAVE_CHUNKMEMSET_1 ++#define HAVE_CHUNKMEMSET_2 ++#define HAVE_CHUNKMEMSET_4 ++#define HAVE_CHUNKMEMSET_8 ++ ++#define zmemcpy_2(dest, src) memcpy(dest, src, 2) ++#define zmemcmp_2(str1, str2) memcmp(str1, str2, 2) ++#define zmemcpy_4(dest, src) memcpy(dest, src, 4) ++#define zmemcmp_4(str1, str2) memcmp(str1, str2, 4) ++#define zmemcpy_8(dest, src) memcpy(dest, src, 8) ++#define zmemcmp_8(str1, str2) memcmp(str1, str2, 8) ++#define MIN(a, b) ((a) > (b) ? (b) : (a)) ++#define Z_INTERNAL ++ ++static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) { ++ *chunk = vld1q_dup_u8(from); ++} ++ ++static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { ++ uint16_t tmp; ++ zmemcpy_2(&tmp, from); ++ *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp)); ++} ++ ++static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { ++ uint32_t tmp; ++ zmemcpy_4(&tmp, from); ++ *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp)); ++} ++ ++static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { ++ uint64_t tmp; ++ zmemcpy_8(&tmp, from); ++ *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp)); ++} ++ ++#define CHUNKSIZE chunksize_neon ++#define CHUNKCOPY chunkcopy_neon ++#define CHUNKCOPY_SAFE chunkcopy_safe_neon ++#define CHUNKUNROLL chunkunroll_neon ++#define CHUNKMEMSET chunkmemset_neon ++#define CHUNKMEMSET_SAFE chunkmemset_safe_neon ++ ++static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { ++ *chunk = vld1q_u8(s); ++} ++ ++static inline void storechunk(uint8_t *out, chunk_t *chunk) { ++ vst1q_u8(out, *chunk); ++} ++ ++/* Behave like chunkcopy, but avoid writing beyond of legal output. 
*/ ++static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, size_t len, uint8_t *safe) { ++ uint32_t safelen = (uint32_t)((safe - out) + 1); ++ len = MIN(len, safelen); ++ int32_t olap_src = from >= out && from < out + len; ++ int32_t olap_dst = out >= from && out < from + len; ++ size_t tocopy; ++ ++ /* For all cases without overlap, memcpy is ideal */ ++ if (!(olap_src || olap_dst)) { ++ memcpy(out, from, len); ++ return out + len; ++ } ++ ++ /* We are emulating a self-modifying copy loop here. To do this in a way that doesn't produce undefined behavior, ++ * we have to get a bit clever. First if the overlap is such that src falls between dst and dst+len, we can do the ++ * initial bulk memcpy of the nonoverlapping region. Then, we can leverage the size of this to determine the safest ++ * atomic memcpy size we can pick such that we have non-overlapping regions. This effectively becomes a safe look ++ * behind or lookahead distance */ ++ size_t non_olap_size = ((from > out) ? from - out : out - from); ++ ++ memcpy(out, from, non_olap_size); ++ out += non_olap_size; ++ from += non_olap_size; ++ len -= non_olap_size; ++ ++ /* So this doesn't give use a worst case scenario of function calls in a loop, ++ * we want to instead break this down into copy blocks of fixed lengths */ ++ while (len) { ++ tocopy = MIN(non_olap_size, len); ++ len -= tocopy; ++ ++ while (tocopy >= 32) { ++ memcpy(out, from, 32); ++ out += 32; ++ from += 32; ++ tocopy -= 32; ++ } ++ ++ if (tocopy >= 16) { ++ memcpy(out, from, 16); ++ out += 16; ++ from += 16; ++ tocopy -= 16; ++ } ++ ++ if (tocopy >= 8) { ++ zmemcpy_8(out, from); ++ out += 8; ++ from += 8; ++ tocopy -= 8; ++ } ++ ++ if (tocopy >= 4) { ++ zmemcpy_4(out, from); ++ out += 4; ++ from += 4; ++ tocopy -= 4; ++ } ++ ++ if (tocopy >= 2) { ++ zmemcpy_2(out, from); ++ out += 2; ++ from += 2; ++ tocopy -= 2; ++ } ++ ++ if (tocopy) { ++ *out++ = *from++; ++ } ++ } ++ ++ return out; ++} ++ ++/* Returns the chunk size */ ++ZLIB_INTERNAL uint32_t CHUNKSIZE(void) { ++ return sizeof(chunk_t); ++} ++ ++/* Behave like memcpy, but assume that it's OK to overwrite at least ++ chunk_t bytes of output even if the length is shorter than this, ++ that the length is non-zero, and that `from` lags `out` by at least ++ sizeof chunk_t bytes (or that they don't overlap at all or simply that ++ the distance is less than the length of the copy). ++ ++ Aside from better memory bus utilisation, this means that short copies ++ (chunk_t bytes or fewer) will fall straight through the loop ++ without iteration, which will hopefully make the branch prediction more ++ reliable. */ ++ZLIB_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { ++ Assert(len > 0, "chunkcopy should never have a length 0"); ++ chunk_t chunk; ++ int32_t align = ((len - 1) % sizeof(chunk_t)) + 1; ++ loadchunk(from, &chunk); ++ storechunk(out, &chunk); ++ out += align; ++ from += align; ++ len -= align; ++ while (len > 0) { ++ loadchunk(from, &chunk); ++ storechunk(out, &chunk); ++ out += sizeof(chunk_t); ++ from += sizeof(chunk_t); ++ len -= sizeof(chunk_t); ++ } ++ return out; ++} ++ ++/* Perform short copies until distance can be rewritten as being at least ++ sizeof chunk_t. ++ ++ This assumes that it's OK to overwrite at least the first ++ 2*sizeof(chunk_t) bytes of output even if the copy is shorter than this. 
++ This assumption holds because inflate_fast() starts every iteration with at ++ least 258 bytes of output space available (258 being the maximum length ++ output from a single token; see inflate_fast()'s assumptions below). */ ++ZLIB_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { ++ unsigned char const *from = out - *dist; ++ chunk_t chunk; ++ while (*dist < *len && *dist < sizeof(chunk_t)) { ++ loadchunk(from, &chunk); ++ storechunk(out, &chunk); ++ out += *dist; ++ *len -= *dist; ++ *dist += *dist; ++ } ++ return out; ++} ++ ++/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. ++ Return OUT + LEN. */ ++ZLIB_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { ++ /* Debug performance related issues when len < sizeof(uint64_t): ++ Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */ ++ Assert(dist > 0, "chunkmemset cannot have a distance 0"); ++ ++ uint8_t *from = out - dist; ++ ++ if (dist == 1) { ++ memset(out, *from, len); ++ return out + len; ++ } else if (dist > sizeof(chunk_t)) { ++ return CHUNKCOPY(out, out - dist, len); ++ } ++ ++ chunk_t chunk_load; ++ uint32_t chunk_mod = 0; ++ /* TODO: possibly build up a permutation table for this if not an even modulus */ ++#ifdef HAVE_CHUNKMEMSET_2 ++ if (dist == 2) { ++ chunkmemset_2(from, &chunk_load); ++ } else ++#endif ++#ifdef HAVE_CHUNKMEMSET_4 ++ if (dist == 4) { ++ chunkmemset_4(from, &chunk_load); ++ } else ++#endif ++#ifdef HAVE_CHUNKMEMSET_8 ++ if (dist == 8) { ++ chunkmemset_8(from, &chunk_load); ++ } else if (dist == sizeof(chunk_t)) { ++ loadchunk(from, &chunk_load); ++ } else ++#endif ++ { ++ /* This code takes string of length dist from "from" and repeats ++ * it for as many times as can fit in a chunk_t (vector register) */ ++ uint32_t cpy_dist; ++ uint32_t bytes_remaining = sizeof(chunk_t); ++ uint8_t *cur_chunk = (uint8_t *)&chunk_load; ++ while (bytes_remaining) { ++ cpy_dist = MIN(dist, bytes_remaining); ++ memcpy(cur_chunk, from, cpy_dist); ++ bytes_remaining -= cpy_dist; ++ cur_chunk += cpy_dist; ++ /* This allows us to bypass an expensive integer division since we're effectively ++ * counting in this loop, anyway. However, we may have to derive a similarly ++ * sensible solution for if we use a permutation table that allows us to construct ++ * this vector in one load and one permute instruction */ ++ chunk_mod = cpy_dist; ++ } ++ } ++ ++ /* If we're lucky enough and dist happens to be an even modulus of our vector length, ++ * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */ ++ if (chunk_mod == 0) { ++ while (len >= (2 * sizeof(chunk_t))) { ++ storechunk(out, &chunk_load); ++ storechunk(out + sizeof(chunk_t), &chunk_load); ++ out += 2 * sizeof(chunk_t); ++ len -= 2 * sizeof(chunk_t); ++ } ++ } ++ ++ /* If we don't have a "dist" length that divides evenly into a vector ++ * register, we can write the whole vector register but we need only ++ * advance by the amount of the whole string that fits in our chunk_t. 
++ * If we do divide evenly into the vector length, adv_amount = chunk_t size*/ ++ uint32_t adv_amount = sizeof(chunk_t) - chunk_mod; ++ while (len >= sizeof(chunk_t)) { ++ storechunk(out, &chunk_load); ++ len -= adv_amount; ++ out += adv_amount; ++ } ++ ++ if (len) { ++ memcpy(out, &chunk_load, len); ++ out += len; ++ } ++ ++ return out; ++} ++ ++ZLIB_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) { ++#if !defined(UNALIGNED64_OK) ++#if !defined(UNALIGNED_OK) ++ static const uint32_t align_mask = 7; ++#else ++ static const uint32_t align_mask = 3; ++#endif ++#endif ++ ++ len = MIN(len, left); ++ uint8_t *from = out - dist; ++#if !defined(UNALIGNED64_OK) ++ while (((uintptr_t)out & align_mask) && (len > 0)) { ++ *out++ = *from++; ++ --len; ++ --left; ++ } ++#endif ++ if (left < (unsigned)(3 * sizeof(chunk_t))) { ++ while (len > 0) { ++ *out++ = *from++; ++ --len; ++ } ++ return out; ++ } ++ if (len) ++ return CHUNKMEMSET(out, dist, len); ++ ++ return out; ++} ++ ++#endif +diff -Nru zlib-1.2.11/inffast.c ../zlib-1.2.11/inffast.c +--- zlib-1.2.11/inffast.c 2017-01-16 01:29:40.000000000 +0800 ++++ ../zlib-1.2.11/inffast.c 2022-08-30 17:46:16.887320481 +0800 +@@ -7,7 +7,7 @@ + #include "inftrees.h" + #include "inflate.h" + #include "inffast.h" +- ++#include "contrib/arm/arm_chunk_copy_neon.h" + #ifdef ASMINF + # pragma message("Assembler code may have bugs -- use at your own risk") + #else +@@ -47,10 +47,268 @@ + requires strm->avail_out >= 258 for each loop to avoid checking for + output space. + */ ++#ifdef ENABLE_ARM_CHUNK_NEON ++void ZLIB_INTERNAL arm_inflate_fast(strm, start) ++z_streamp strm; ++unsigned start; /* inflate()'s starting value for strm->avail_out */ ++{ ++ struct inflate_state FAR *state; ++ z_const unsigned char FAR *in; /* local strm->next_in */ ++ z_const unsigned char FAR *last; /* have enough input while in < last */ ++ unsigned char FAR *out; /* local strm->next_out */ ++ unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ ++ unsigned char FAR *end; /* while out < end, enough space available */ ++ unsigned char *safe; /* can use chunkcopy provided out < safe */ ++#ifdef INFLATE_STRICT ++ unsigned dmax; /* maximum distance from zlib header */ ++#endif ++ unsigned wsize; /* window size or zero if not using window */ ++ unsigned whave; /* valid bytes in the window */ ++ unsigned wnext; /* window write index */ ++ unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */ ++ unsigned long hold; /* local strm->hold */ ++ unsigned bits; /* local strm->bits */ ++ code const FAR *lcode; /* local strm->lencode */ ++ code const FAR *dcode; /* local strm->distcode */ ++ unsigned lmask; /* mask for first level of length codes */ ++ unsigned dmask; /* mask for first level of distance codes */ ++ code here; /* retrieved table entry */ ++ unsigned op; /* code bits, operation, extra bits, or */ ++ /* window position, window bytes to copy */ ++ unsigned len; /* match length, unused bytes */ ++ unsigned dist; /* match distance */ ++ unsigned char FAR *from; /* where to copy match from */ ++ unsigned extra_safe; /* copy chunks safely in all cases */ ++ uint32_t chunksize = chunksize_neon(); ++ /* copy state to local variables */ ++ state = (struct inflate_state FAR *)strm->state; ++ in = strm->next_in; ++ last = in + (strm->avail_in - 5); ++ out = strm->next_out; ++ beg = out - (start - strm->avail_out); ++ end = out + (strm->avail_out - 257); ++ safe = out + strm->avail_out; ++#ifdef INFLATE_STRICT ++ dmax = 
state->dmax; ++#endif ++ wsize = state->wsize; ++ whave = state->whave; ++ wnext = state->wnext; ++ window = state->window; ++ hold = state->hold; ++ bits = state->bits; ++ lcode = state->lencode; ++ dcode = state->distcode; ++ lmask = (1U << state->lenbits) - 1; ++ dmask = (1U << state->distbits) - 1; ++ extra_safe = (wsize != 0 && out >= window && out + INFLATE_FAST_MIN_LEFT <= window + wsize); ++ /* decode literals and length/distances until end-of-block or not enough ++ input data or output space */ ++ do { ++ if (bits < 15) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ here = lcode[hold & lmask]; ++ dolen: ++ op = (unsigned)(here.bits); ++ hold >>= op; ++ bits -= op; ++ op = (unsigned)(here.op); ++ if (op == 0) { /* literal */ ++ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ? ++ "inflate: literal '%c'\n" : ++ "inflate: literal 0x%02x\n", here.val)); ++ *out++ = (unsigned char)(here.val); ++ } ++ else if (op & 16) { /* length base */ ++ len = (unsigned)(here.val); ++ op &= 15; /* number of extra bits */ ++ if (op) { ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ len += (unsigned)hold & ((1U << op) - 1); ++ hold >>= op; ++ bits -= op; ++ } ++ Tracevv((stderr, "inflate: length %u\n", len)); ++ if (bits < 15) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ here = dcode[hold & dmask]; ++ dodist: ++ op = (unsigned)(here.bits); ++ hold >>= op; ++ bits -= op; ++ op = (unsigned)(here.op); ++ if (op & 16) { /* distance base */ ++ dist = (unsigned)(here.val); ++ op &= 15; /* number of extra bits */ ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ } ++ dist += (unsigned)hold & ((1U << op) - 1); ++#ifdef INFLATE_STRICT ++ if (dist > dmax) { ++ strm->msg = (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#endif ++ hold >>= op; ++ bits -= op; ++ Tracevv((stderr, "inflate: distance %u\n", dist)); ++ op = (unsigned)(out - beg); /* max distance in output */ ++ if (dist > op) { /* see if copy from window */ ++ op = dist - op; /* distance back in window */ ++ if (op > whave) { ++ if (state->sane) { ++ strm->msg = ++ (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR ++ if (len <= op - whave) { ++ do { ++ *out++ = 0; ++ } while (--len); ++ continue; ++ } ++ len -= op - whave; ++ do { ++ *out++ = 0; ++ } while (--op > whave); ++ if (op == 0) { ++ from = out - dist; ++ do { ++ *out++ = *from++; ++ } while (--len); ++ continue; ++ } ++#endif ++ } ++ from = window; ++ if (wnext == 0) { /* very common case */ ++ from += wsize - op; ++ } else if (wnext >= op) { /* contiguous in window */ ++ from += wnext - op; ++ } else { /* wrap around window */ ++ op -= wnext; ++ from += wsize - op; ++ if (op < len) { /* some from end of window */ ++ len -= op; ++ out = chunkcopy_safe(out, from, op, safe); ++ from = window; /* more from start of window */ ++ op = wnext; ++ /* This (rare) case can create a situation where ++ the first chunkcopy below must be checked. 
++ */ ++ } ++ } ++ if (op < len) { /* still need some from output */ ++ len -= op; ++ out = chunkcopy_safe(out, from, op, safe); ++ out = chunkunroll_neon(out, &dist, &len); ++ out = chunkcopy_safe(out, out - dist, len, safe); ++ } else { ++ out = chunkcopy_safe(out, from, len, safe); ++ } ++ } else if (extra_safe) { ++ /* Whole reference is in range of current output. */ ++ if (dist >= len || dist >= chunksize) ++ out = chunkcopy_safe(out, out - dist, len, safe); ++ else ++ out = chunkmemset_safe_neon(out, dist, len, (unsigned)((safe - out) + 1)); ++ } else { ++ /* Whole reference is in range of current output. No range checks are ++ necessary because we start with room for at least 258 bytes of output, ++ so unroll and roundoff operations can write beyond `out+len` so long ++ as they stay within 258 bytes of `out`. ++ */ ++ if (dist >= len || dist >= chunksize) ++ out = chunkcopy_neon(out, out - dist, len); ++ else ++ out = chunkmemset_neon(out, dist, len); ++ } ++ } else if ((op & 64) == 0) { /* 2nd level distance code */ ++ here = dcode[here.val + (hold & ((1U << op) - 1))]; ++ goto dodist; ++ } ++ else { ++ strm->msg = (char *)"invalid distance code"; ++ state->mode = BAD; ++ break; ++ } ++ } ++ else if ((op & 64) == 0) { /* 2nd level length code */ ++ here = lcode[here.val + (hold & ((1U << op) - 1))]; ++ goto dolen; ++ } ++ else if (op & 32) { /* end-of-block */ ++ Tracevv((stderr, "inflate: end of block\n")); ++ state->mode = TYPE; ++ break; ++ } ++ else { ++ strm->msg = (char *)"invalid literal/length code"; ++ state->mode = BAD; ++ break; ++ } ++ } while (in < last && out < end); ++ ++ /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ ++ len = bits >> 3; ++ in -= len; ++ bits -= len << 3; ++ hold &= (1U << bits) - 1; ++ ++ /* update state and return */ ++ strm->next_in = in; ++ strm->next_out = out; ++ strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); ++ strm->avail_out = (unsigned)(out < end ? 
++ 257 + (end - out) : 257 - (out - end)); ++ state->hold = hold; ++ state->bits = bits; ++ return; ++} ++ ++/* ++ inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe): ++ - Using bit fields for code structure ++ - Different op definition to avoid & for extra bits (do & for table bits) ++ - Three separate decoding do-loops for direct, window, and wnext == 0 ++ - Special case for distance > 1 copies to do overlapped load and store copy ++ - Explicit branch predictions (based on measured branch probabilities) ++ - Deferring match copy and interspersed it with decoding subsequent codes ++ - Swapping literal/length else ++ - Swapping window/direct else ++ - Larger unrolled copy loops (three is about right) ++ - Moving len -= 3 statement into middle of loop ++ */ ++#endif ++ + void ZLIB_INTERNAL inflate_fast(strm, start) + z_streamp strm; + unsigned start; /* inflate()'s starting value for strm->avail_out */ + { ++#ifdef ENABLE_ARM_CHUNK_NEON ++ return arm_inflate_fast(strm, start); ++#endif ++ + struct inflate_state FAR *state; + z_const unsigned char FAR *in; /* local strm->next_in */ + z_const unsigned char FAR *last; /* have enough input while in < last */ +@@ -306,18 +564,4 @@ + return; + } + +-/* +- inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe): +- - Using bit fields for code structure +- - Different op definition to avoid & for extra bits (do & for table bits) +- - Three separate decoding do-loops for direct, window, and wnext == 0 +- - Special case for distance > 1 copies to do overlapped load and store copy +- - Explicit branch predictions (based on measured branch probabilities) +- - Deferring match copy and interspersed it with decoding subsequent codes +- - Swapping literal/length else +- - Swapping window/direct else +- - Larger unrolled copy loops (three is about right) +- - Moving len -= 3 statement into middle of loop +- */ +- + #endif /* !ASMINF */ diff --git a/zlib.spec b/zlib.spec index 749ff83..3d9582b 100644 --- a/zlib.spec +++ b/zlib.spec @@ -1,4 +1,4 @@ -%define anolis_release .0.3 +%define anolis_release .0.4 # disabled, per rhbz#1609830 and rhbz#1602742 %bcond_with minizip @@ -30,6 +30,8 @@ Patch1006: 1006-zlib-anolis-Optimize-slide_hash.patch Patch1007: 1007-zlib-anolis-Neon-Optimized-adler32.patch # optimized crc32 function with crc32 + pmul instruction in armv8 Patch1008: 1008-zlib-anolis-Optimized-crc32-pmul-mix.patch +# optimized chunk copy by neon in armv8 +Patch1009: 1009-zlib-anolis-Neon-Optimized-chunkcopy_neon.patch BuildRequires: automake, autoconf, libtool @@ -91,6 +93,7 @@ developing applications which use minizip. %patch1005 -p1 %patch1007 -p1 %patch1008 -p1 +%patch1009 -p1 %endif %ifarch x86_64 @@ -173,6 +176,9 @@ find $RPM_BUILD_ROOT -name '*.la' -delete %changelog +* Tue Aug 30 2022 binbin Xu - 1.2.11-17.0.4 +- add optimized chunk copy by neon patch for aarch64 + * Thu Aug 25 2022 binbin Xu - 1.2.11-17.0.3 - add optimized crc32 with pmul mix crc patch for aarch64 -- Gitee
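
A note for reviewers of the ported chunkset code: the core trick in chunkcopy_neon above is that every store is a full 16-byte NEON vector, with the first (possibly partial) chunk aligned so the loop only ever advances by whole vectors; the caller (inflate_fast, via INFLATE_FAST_MIN_LEFT = 258) guarantees there is always room to over-write past out + len. The standalone sketch below illustrates just that trick; the function name demo_chunkcopy, the buffer sizes, and the 50-byte test length are illustrative and not part of the patch, and it assumes an aarch64 toolchain where __ARM_NEON is defined.

/* Illustrative sketch only, not part of the patch above. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t *demo_chunkcopy(uint8_t *out, const uint8_t *from, unsigned len) {
    /* Handle the remainder of len modulo 16 first, so every later
     * iteration copies exactly one full 16-byte vector. */
    unsigned align = ((len - 1) % 16) + 1;
    uint8x16_t chunk = vld1q_u8(from);
    vst1q_u8(out, chunk);      /* may read/write a full 16 bytes even if align < 16 */
    out += align;
    from += align;
    len -= align;
    while (len > 0) {          /* len is now a multiple of 16 */
        chunk = vld1q_u8(from);
        vst1q_u8(out, chunk);
        out += 16;
        from += 16;
        len -= 16;
    }
    return out;
}

int main(void) {
    uint8_t src[64];
    uint8_t dst[64 + 16];      /* 16 bytes of slack for the permitted over-write */
    for (int i = 0; i < 64; i++) src[i] = (uint8_t)i;
    memset(dst, 0xff, sizeof(dst));
    demo_chunkcopy(dst, src, 50);   /* 50 = 2-byte remainder + 3 full chunks */
    printf("copy ok: %d\n", memcmp(dst, src, 50) == 0);
    return 0;
}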
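
Likewise, the small-distance path in chunkmemset_neon relies on broadcasting a 1-, 2-, 4- or 8-byte match pattern across a whole vector (vld1q_dup_u8, vdupq_n_u16/u32/u64), turning short-distance RLE-style matches into full-width stores. A minimal sketch of the 2-byte case follows; it is illustrative only, assumes an aarch64 toolchain, and the buffer contents and lengths are made up for the demonstration.

/* Illustrative sketch only, not part of the patch above. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    uint8_t out[64] = "ab";   /* pretend the match source (out - dist) holds "ab", dist == 2 */
    uint16_t pattern;
    memcpy(&pattern, out, 2);                /* unaligned-safe 2-byte load, as zmemcpy_2 does */
    uint8x16_t chunk = vreinterpretq_u8_u16(vdupq_n_u16(pattern));  /* "ab" repeated 8 times */
    uint8_t *p = out + 2;                    /* continue the match right after the pattern */
    unsigned len = 32;                       /* bytes still to emit */
    while (len >= 16) {                      /* dist 2 divides 16 evenly, so advance by a full chunk */
        vst1q_u8(p, chunk);
        p += 16;
        len -= 16;
    }
    printf("%.34s\n", (const char *)out);    /* prints "ab" repeated 17 times */
    return 0;
}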