From 3b4657b4d5d4dc8a6d0194df22e66294caff1752 Mon Sep 17 00:00:00 2001 From: "xubinbin123.xbb" Date: Tue, 30 Aug 2022 18:51:07 +0800 Subject: [PATCH] =?UTF-8?q?Added=20chunkcopy=5Fneon=20optimization,=20whic?= =?UTF-8?q?h=20can=20improve=20decompression=20performance=20by=2012.5%=20?= =?UTF-8?q?=E2=80=8B=E2=80=8Bunder=20lzbench.=20This=20optimization=20meth?= =?UTF-8?q?od=20is=20transplanted=20from=20zlib-ng,=20upstream:=20https://?= =?UTF-8?q?github.com/zlib-ng/zlib-ng/blob/develop/arch/arm=20/chunkset=5F?= =?UTF-8?q?neon.c?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...anolis-Neon-Optimized-chunkcopy_neon.patch | 615 ++++++++++++++++++ zlib.spec | 8 +- 2 files changed, 622 insertions(+), 1 deletion(-) create mode 100644 1009-zlib-anolis-Neon-Optimized-chunkcopy_neon.patch diff --git a/1009-zlib-anolis-Neon-Optimized-chunkcopy_neon.patch b/1009-zlib-anolis-Neon-Optimized-chunkcopy_neon.patch new file mode 100644 index 0000000..fd4051e --- /dev/null +++ b/1009-zlib-anolis-Neon-Optimized-chunkcopy_neon.patch @@ -0,0 +1,615 @@ +diff -Nru zlib-1.2.11/contrib/arm/arm_chunk_copy_neon.h ../zlib-1.2.11/contrib/arm/arm_chunk_copy_neon.h +--- zlib-1.2.11/contrib/arm/arm_chunk_copy_neon.h 1970-01-01 08:00:00.000000000 +0800 ++++ ../zlib-1.2.11/contrib/arm/arm_chunk_copy_neon.h 2022-08-30 17:29:33.032693593 +0800 +@@ -0,0 +1,311 @@ ++#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) ++ ++#define ENABLE_ARM_CHUNK_NEON ++#define INFLATE_FAST_MIN_HAVE 6 ++#define INFLATE_FAST_MIN_LEFT 258 ++ ++#include ++#include ++ ++typedef uint8x16_t chunk_t; ++ ++#define CHUNK_SIZE 16 ++ ++#define HAVE_CHUNKMEMSET_1 ++#define HAVE_CHUNKMEMSET_2 ++#define HAVE_CHUNKMEMSET_4 ++#define HAVE_CHUNKMEMSET_8 ++ ++#define zmemcpy_2(dest, src) memcpy(dest, src, 2) ++#define zmemcmp_2(str1, str2) memcmp(str1, str2, 2) ++#define zmemcpy_4(dest, src) memcpy(dest, src, 4) ++#define zmemcmp_4(str1, str2) memcmp(str1, str2, 4) ++#define zmemcpy_8(dest, src) memcpy(dest, src, 8) ++#define zmemcmp_8(str1, str2) memcmp(str1, str2, 8) ++#define MIN(a, b) ((a) > (b) ? (b) : (a)) ++#define Z_INTERNAL ++ ++static inline void chunkmemset_1(uint8_t *from, chunk_t *chunk) { ++ *chunk = vld1q_dup_u8(from); ++} ++ ++static inline void chunkmemset_2(uint8_t *from, chunk_t *chunk) { ++ uint16_t tmp; ++ zmemcpy_2(&tmp, from); ++ *chunk = vreinterpretq_u8_u16(vdupq_n_u16(tmp)); ++} ++ ++static inline void chunkmemset_4(uint8_t *from, chunk_t *chunk) { ++ uint32_t tmp; ++ zmemcpy_4(&tmp, from); ++ *chunk = vreinterpretq_u8_u32(vdupq_n_u32(tmp)); ++} ++ ++static inline void chunkmemset_8(uint8_t *from, chunk_t *chunk) { ++ uint64_t tmp; ++ zmemcpy_8(&tmp, from); ++ *chunk = vreinterpretq_u8_u64(vdupq_n_u64(tmp)); ++} ++ ++#define CHUNKSIZE chunksize_neon ++#define CHUNKCOPY chunkcopy_neon ++#define CHUNKCOPY_SAFE chunkcopy_safe_neon ++#define CHUNKUNROLL chunkunroll_neon ++#define CHUNKMEMSET chunkmemset_neon ++#define CHUNKMEMSET_SAFE chunkmemset_safe_neon ++ ++static inline void loadchunk(uint8_t const *s, chunk_t *chunk) { ++ *chunk = vld1q_u8(s); ++} ++ ++static inline void storechunk(uint8_t *out, chunk_t *chunk) { ++ vst1q_u8(out, *chunk); ++} ++ ++/* Behave like chunkcopy, but avoid writing beyond of legal output. 
*/ ++static inline uint8_t* chunkcopy_safe(uint8_t *out, uint8_t *from, size_t len, uint8_t *safe) { ++ uint32_t safelen = (uint32_t)((safe - out) + 1); ++ len = MIN(len, safelen); ++ int32_t olap_src = from >= out && from < out + len; ++ int32_t olap_dst = out >= from && out < from + len; ++ size_t tocopy; ++ ++ /* For all cases without overlap, memcpy is ideal */ ++ if (!(olap_src || olap_dst)) { ++ memcpy(out, from, len); ++ return out + len; ++ } ++ ++ /* We are emulating a self-modifying copy loop here. To do this in a way that doesn't produce undefined behavior, ++ * we have to get a bit clever. First if the overlap is such that src falls between dst and dst+len, we can do the ++ * initial bulk memcpy of the nonoverlapping region. Then, we can leverage the size of this to determine the safest ++ * atomic memcpy size we can pick such that we have non-overlapping regions. This effectively becomes a safe look ++ * behind or lookahead distance */ ++ size_t non_olap_size = ((from > out) ? from - out : out - from); ++ ++ memcpy(out, from, non_olap_size); ++ out += non_olap_size; ++ from += non_olap_size; ++ len -= non_olap_size; ++ ++ /* So this doesn't give use a worst case scenario of function calls in a loop, ++ * we want to instead break this down into copy blocks of fixed lengths */ ++ while (len) { ++ tocopy = MIN(non_olap_size, len); ++ len -= tocopy; ++ ++ while (tocopy >= 32) { ++ memcpy(out, from, 32); ++ out += 32; ++ from += 32; ++ tocopy -= 32; ++ } ++ ++ if (tocopy >= 16) { ++ memcpy(out, from, 16); ++ out += 16; ++ from += 16; ++ tocopy -= 16; ++ } ++ ++ if (tocopy >= 8) { ++ zmemcpy_8(out, from); ++ out += 8; ++ from += 8; ++ tocopy -= 8; ++ } ++ ++ if (tocopy >= 4) { ++ zmemcpy_4(out, from); ++ out += 4; ++ from += 4; ++ tocopy -= 4; ++ } ++ ++ if (tocopy >= 2) { ++ zmemcpy_2(out, from); ++ out += 2; ++ from += 2; ++ tocopy -= 2; ++ } ++ ++ if (tocopy) { ++ *out++ = *from++; ++ } ++ } ++ ++ return out; ++} ++ ++/* Returns the chunk size */ ++ZLIB_INTERNAL uint32_t CHUNKSIZE(void) { ++ return sizeof(chunk_t); ++} ++ ++/* Behave like memcpy, but assume that it's OK to overwrite at least ++ chunk_t bytes of output even if the length is shorter than this, ++ that the length is non-zero, and that `from` lags `out` by at least ++ sizeof chunk_t bytes (or that they don't overlap at all or simply that ++ the distance is less than the length of the copy). ++ ++ Aside from better memory bus utilisation, this means that short copies ++ (chunk_t bytes or fewer) will fall straight through the loop ++ without iteration, which will hopefully make the branch prediction more ++ reliable. */ ++ZLIB_INTERNAL uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len) { ++ Assert(len > 0, "chunkcopy should never have a length 0"); ++ chunk_t chunk; ++ int32_t align = ((len - 1) % sizeof(chunk_t)) + 1; ++ loadchunk(from, &chunk); ++ storechunk(out, &chunk); ++ out += align; ++ from += align; ++ len -= align; ++ while (len > 0) { ++ loadchunk(from, &chunk); ++ storechunk(out, &chunk); ++ out += sizeof(chunk_t); ++ from += sizeof(chunk_t); ++ len -= sizeof(chunk_t); ++ } ++ return out; ++} ++ ++/* Perform short copies until distance can be rewritten as being at least ++ sizeof chunk_t. ++ ++ This assumes that it's OK to overwrite at least the first ++ 2*sizeof(chunk_t) bytes of output even if the copy is shorter than this. 
++ This assumption holds because inflate_fast() starts every iteration with at ++ least 258 bytes of output space available (258 being the maximum length ++ output from a single token; see inflate_fast()'s assumptions below). */ ++ZLIB_INTERNAL uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len) { ++ unsigned char const *from = out - *dist; ++ chunk_t chunk; ++ while (*dist < *len && *dist < sizeof(chunk_t)) { ++ loadchunk(from, &chunk); ++ storechunk(out, &chunk); ++ out += *dist; ++ *len -= *dist; ++ *dist += *dist; ++ } ++ return out; ++} ++ ++/* Copy DIST bytes from OUT - DIST into OUT + DIST * k, for 0 <= k < LEN/DIST. ++ Return OUT + LEN. */ ++ZLIB_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) { ++ /* Debug performance related issues when len < sizeof(uint64_t): ++ Assert(len >= sizeof(uint64_t), "chunkmemset should be called on larger chunks"); */ ++ Assert(dist > 0, "chunkmemset cannot have a distance 0"); ++ ++ uint8_t *from = out - dist; ++ ++ if (dist == 1) { ++ memset(out, *from, len); ++ return out + len; ++ } else if (dist > sizeof(chunk_t)) { ++ return CHUNKCOPY(out, out - dist, len); ++ } ++ ++ chunk_t chunk_load; ++ uint32_t chunk_mod = 0; ++ /* TODO: possibly build up a permutation table for this if not an even modulus */ ++#ifdef HAVE_CHUNKMEMSET_2 ++ if (dist == 2) { ++ chunkmemset_2(from, &chunk_load); ++ } else ++#endif ++#ifdef HAVE_CHUNKMEMSET_4 ++ if (dist == 4) { ++ chunkmemset_4(from, &chunk_load); ++ } else ++#endif ++#ifdef HAVE_CHUNKMEMSET_8 ++ if (dist == 8) { ++ chunkmemset_8(from, &chunk_load); ++ } else if (dist == sizeof(chunk_t)) { ++ loadchunk(from, &chunk_load); ++ } else ++#endif ++ { ++ /* This code takes string of length dist from "from" and repeats ++ * it for as many times as can fit in a chunk_t (vector register) */ ++ uint32_t cpy_dist; ++ uint32_t bytes_remaining = sizeof(chunk_t); ++ uint8_t *cur_chunk = (uint8_t *)&chunk_load; ++ while (bytes_remaining) { ++ cpy_dist = MIN(dist, bytes_remaining); ++ memcpy(cur_chunk, from, cpy_dist); ++ bytes_remaining -= cpy_dist; ++ cur_chunk += cpy_dist; ++ /* This allows us to bypass an expensive integer division since we're effectively ++ * counting in this loop, anyway. However, we may have to derive a similarly ++ * sensible solution for if we use a permutation table that allows us to construct ++ * this vector in one load and one permute instruction */ ++ chunk_mod = cpy_dist; ++ } ++ } ++ ++ /* If we're lucky enough and dist happens to be an even modulus of our vector length, ++ * we can do two stores per loop iteration, which for most ISAs, especially x86, is beneficial */ ++ if (chunk_mod == 0) { ++ while (len >= (2 * sizeof(chunk_t))) { ++ storechunk(out, &chunk_load); ++ storechunk(out + sizeof(chunk_t), &chunk_load); ++ out += 2 * sizeof(chunk_t); ++ len -= 2 * sizeof(chunk_t); ++ } ++ } ++ ++ /* If we don't have a "dist" length that divides evenly into a vector ++ * register, we can write the whole vector register but we need only ++ * advance by the amount of the whole string that fits in our chunk_t. 
++ * If we do divide evenly into the vector length, adv_amount = chunk_t size*/ ++ uint32_t adv_amount = sizeof(chunk_t) - chunk_mod; ++ while (len >= sizeof(chunk_t)) { ++ storechunk(out, &chunk_load); ++ len -= adv_amount; ++ out += adv_amount; ++ } ++ ++ if (len) { ++ memcpy(out, &chunk_load, len); ++ out += len; ++ } ++ ++ return out; ++} ++ ++ZLIB_INTERNAL uint8_t* CHUNKMEMSET_SAFE(uint8_t *out, unsigned dist, unsigned len, unsigned left) { ++#if !defined(UNALIGNED64_OK) ++#if !defined(UNALIGNED_OK) ++ static const uint32_t align_mask = 7; ++#else ++ static const uint32_t align_mask = 3; ++#endif ++#endif ++ ++ len = MIN(len, left); ++ uint8_t *from = out - dist; ++#if !defined(UNALIGNED64_OK) ++ while (((uintptr_t)out & align_mask) && (len > 0)) { ++ *out++ = *from++; ++ --len; ++ --left; ++ } ++#endif ++ if (left < (unsigned)(3 * sizeof(chunk_t))) { ++ while (len > 0) { ++ *out++ = *from++; ++ --len; ++ } ++ return out; ++ } ++ if (len) ++ return CHUNKMEMSET(out, dist, len); ++ ++ return out; ++} ++ ++#endif +diff -Nru zlib-1.2.11/inffast.c ../zlib-1.2.11/inffast.c +--- zlib-1.2.11/inffast.c 2017-01-16 01:29:40.000000000 +0800 ++++ ../zlib-1.2.11/inffast.c 2022-08-30 17:46:16.887320481 +0800 +@@ -7,7 +7,7 @@ + #include "inftrees.h" + #include "inflate.h" + #include "inffast.h" +- ++#include "contrib/arm/arm_chunk_copy_neon.h" + #ifdef ASMINF + # pragma message("Assembler code may have bugs -- use at your own risk") + #else +@@ -47,10 +47,268 @@ + requires strm->avail_out >= 258 for each loop to avoid checking for + output space. + */ ++#ifdef ENABLE_ARM_CHUNK_NEON ++void ZLIB_INTERNAL arm_inflate_fast(strm, start) ++z_streamp strm; ++unsigned start; /* inflate()'s starting value for strm->avail_out */ ++{ ++ struct inflate_state FAR *state; ++ z_const unsigned char FAR *in; /* local strm->next_in */ ++ z_const unsigned char FAR *last; /* have enough input while in < last */ ++ unsigned char FAR *out; /* local strm->next_out */ ++ unsigned char FAR *beg; /* inflate()'s initial strm->next_out */ ++ unsigned char FAR *end; /* while out < end, enough space available */ ++ unsigned char *safe; /* can use chunkcopy provided out < safe */ ++#ifdef INFLATE_STRICT ++ unsigned dmax; /* maximum distance from zlib header */ ++#endif ++ unsigned wsize; /* window size or zero if not using window */ ++ unsigned whave; /* valid bytes in the window */ ++ unsigned wnext; /* window write index */ ++ unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */ ++ unsigned long hold; /* local strm->hold */ ++ unsigned bits; /* local strm->bits */ ++ code const FAR *lcode; /* local strm->lencode */ ++ code const FAR *dcode; /* local strm->distcode */ ++ unsigned lmask; /* mask for first level of length codes */ ++ unsigned dmask; /* mask for first level of distance codes */ ++ code here; /* retrieved table entry */ ++ unsigned op; /* code bits, operation, extra bits, or */ ++ /* window position, window bytes to copy */ ++ unsigned len; /* match length, unused bytes */ ++ unsigned dist; /* match distance */ ++ unsigned char FAR *from; /* where to copy match from */ ++ unsigned extra_safe; /* copy chunks safely in all cases */ ++ uint32_t chunksize = chunksize_neon(); ++ /* copy state to local variables */ ++ state = (struct inflate_state FAR *)strm->state; ++ in = strm->next_in; ++ last = in + (strm->avail_in - 5); ++ out = strm->next_out; ++ beg = out - (start - strm->avail_out); ++ end = out + (strm->avail_out - 257); ++ safe = out + strm->avail_out; ++#ifdef INFLATE_STRICT ++ dmax = 
state->dmax; ++#endif ++ wsize = state->wsize; ++ whave = state->whave; ++ wnext = state->wnext; ++ window = state->window; ++ hold = state->hold; ++ bits = state->bits; ++ lcode = state->lencode; ++ dcode = state->distcode; ++ lmask = (1U << state->lenbits) - 1; ++ dmask = (1U << state->distbits) - 1; ++ extra_safe = (wsize != 0 && out >= window && out + INFLATE_FAST_MIN_LEFT <= window + wsize); ++ /* decode literals and length/distances until end-of-block or not enough ++ input data or output space */ ++ do { ++ if (bits < 15) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ here = lcode[hold & lmask]; ++ dolen: ++ op = (unsigned)(here.bits); ++ hold >>= op; ++ bits -= op; ++ op = (unsigned)(here.op); ++ if (op == 0) { /* literal */ ++ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ? ++ "inflate: literal '%c'\n" : ++ "inflate: literal 0x%02x\n", here.val)); ++ *out++ = (unsigned char)(here.val); ++ } ++ else if (op & 16) { /* length base */ ++ len = (unsigned)(here.val); ++ op &= 15; /* number of extra bits */ ++ if (op) { ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ len += (unsigned)hold & ((1U << op) - 1); ++ hold >>= op; ++ bits -= op; ++ } ++ Tracevv((stderr, "inflate: length %u\n", len)); ++ if (bits < 15) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ here = dcode[hold & dmask]; ++ dodist: ++ op = (unsigned)(here.bits); ++ hold >>= op; ++ bits -= op; ++ op = (unsigned)(here.op); ++ if (op & 16) { /* distance base */ ++ dist = (unsigned)(here.val); ++ op &= 15; /* number of extra bits */ ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ if (bits < op) { ++ hold += (unsigned long)(*in++) << bits; ++ bits += 8; ++ } ++ } ++ dist += (unsigned)hold & ((1U << op) - 1); ++#ifdef INFLATE_STRICT ++ if (dist > dmax) { ++ strm->msg = (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#endif ++ hold >>= op; ++ bits -= op; ++ Tracevv((stderr, "inflate: distance %u\n", dist)); ++ op = (unsigned)(out - beg); /* max distance in output */ ++ if (dist > op) { /* see if copy from window */ ++ op = dist - op; /* distance back in window */ ++ if (op > whave) { ++ if (state->sane) { ++ strm->msg = ++ (char *)"invalid distance too far back"; ++ state->mode = BAD; ++ break; ++ } ++#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR ++ if (len <= op - whave) { ++ do { ++ *out++ = 0; ++ } while (--len); ++ continue; ++ } ++ len -= op - whave; ++ do { ++ *out++ = 0; ++ } while (--op > whave); ++ if (op == 0) { ++ from = out - dist; ++ do { ++ *out++ = *from++; ++ } while (--len); ++ continue; ++ } ++#endif ++ } ++ from = window; ++ if (wnext == 0) { /* very common case */ ++ from += wsize - op; ++ } else if (wnext >= op) { /* contiguous in window */ ++ from += wnext - op; ++ } else { /* wrap around window */ ++ op -= wnext; ++ from += wsize - op; ++ if (op < len) { /* some from end of window */ ++ len -= op; ++ out = chunkcopy_safe(out, from, op, safe); ++ from = window; /* more from start of window */ ++ op = wnext; ++ /* This (rare) case can create a situation where ++ the first chunkcopy below must be checked. 
++ */ ++ } ++ } ++ if (op < len) { /* still need some from output */ ++ len -= op; ++ out = chunkcopy_safe(out, from, op, safe); ++ out = chunkunroll_neon(out, &dist, &len); ++ out = chunkcopy_safe(out, out - dist, len, safe); ++ } else { ++ out = chunkcopy_safe(out, from, len, safe); ++ } ++ } else if (extra_safe) { ++ /* Whole reference is in range of current output. */ ++ if (dist >= len || dist >= chunksize) ++ out = chunkcopy_safe(out, out - dist, len, safe); ++ else ++ out = chunkmemset_safe_neon(out, dist, len, (unsigned)((safe - out) + 1)); ++ } else { ++ /* Whole reference is in range of current output. No range checks are ++ necessary because we start with room for at least 258 bytes of output, ++ so unroll and roundoff operations can write beyond `out+len` so long ++ as they stay within 258 bytes of `out`. ++ */ ++ if (dist >= len || dist >= chunksize) ++ out = chunkcopy_neon(out, out - dist, len); ++ else ++ out = chunkmemset_neon(out, dist, len); ++ } ++ } else if ((op & 64) == 0) { /* 2nd level distance code */ ++ here = dcode[here.val + (hold & ((1U << op) - 1))]; ++ goto dodist; ++ } ++ else { ++ strm->msg = (char *)"invalid distance code"; ++ state->mode = BAD; ++ break; ++ } ++ } ++ else if ((op & 64) == 0) { /* 2nd level length code */ ++ here = lcode[here.val + (hold & ((1U << op) - 1))]; ++ goto dolen; ++ } ++ else if (op & 32) { /* end-of-block */ ++ Tracevv((stderr, "inflate: end of block\n")); ++ state->mode = TYPE; ++ break; ++ } ++ else { ++ strm->msg = (char *)"invalid literal/length code"; ++ state->mode = BAD; ++ break; ++ } ++ } while (in < last && out < end); ++ ++ /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ ++ len = bits >> 3; ++ in -= len; ++ bits -= len << 3; ++ hold &= (1U << bits) - 1; ++ ++ /* update state and return */ ++ strm->next_in = in; ++ strm->next_out = out; ++ strm->avail_in = (unsigned)(in < last ? 5 + (last - in) : 5 - (in - last)); ++ strm->avail_out = (unsigned)(out < end ? 
++ 257 + (end - out) : 257 - (out - end)); ++ state->hold = hold; ++ state->bits = bits; ++ return; ++} ++ ++/* ++ inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe): ++ - Using bit fields for code structure ++ - Different op definition to avoid & for extra bits (do & for table bits) ++ - Three separate decoding do-loops for direct, window, and wnext == 0 ++ - Special case for distance > 1 copies to do overlapped load and store copy ++ - Explicit branch predictions (based on measured branch probabilities) ++ - Deferring match copy and interspersed it with decoding subsequent codes ++ - Swapping literal/length else ++ - Swapping window/direct else ++ - Larger unrolled copy loops (three is about right) ++ - Moving len -= 3 statement into middle of loop ++ */ ++#endif ++ + void ZLIB_INTERNAL inflate_fast(strm, start) + z_streamp strm; + unsigned start; /* inflate()'s starting value for strm->avail_out */ + { ++#ifdef ENABLE_ARM_CHUNK_NEON ++ return arm_inflate_fast(strm, start); ++#endif ++ + struct inflate_state FAR *state; + z_const unsigned char FAR *in; /* local strm->next_in */ + z_const unsigned char FAR *last; /* have enough input while in < last */ +@@ -306,18 +564,4 @@ + return; + } + +-/* +- inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe): +- - Using bit fields for code structure +- - Different op definition to avoid & for extra bits (do & for table bits) +- - Three separate decoding do-loops for direct, window, and wnext == 0 +- - Special case for distance > 1 copies to do overlapped load and store copy +- - Explicit branch predictions (based on measured branch probabilities) +- - Deferring match copy and interspersed it with decoding subsequent codes +- - Swapping literal/length else +- - Swapping window/direct else +- - Larger unrolled copy loops (three is about right) +- - Moving len -= 3 statement into middle of loop +- */ +- + #endif /* !ASMINF */ diff --git a/zlib.spec b/zlib.spec index 749ff83..3d9582b 100644 --- a/zlib.spec +++ b/zlib.spec @@ -1,4 +1,4 @@ -%define anolis_release .0.3 +%define anolis_release .0.4 # disabled, per rhbz#1609830 and rhbz#1602742 %bcond_with minizip @@ -30,6 +30,8 @@ Patch1006: 1006-zlib-anolis-Optimize-slide_hash.patch Patch1007: 1007-zlib-anolis-Neon-Optimized-adler32.patch # optimized crc32 function with crc32 + pmul instruction in armv8 Patch1008: 1008-zlib-anolis-Optimized-crc32-pmul-mix.patch +# optimized chunk copy by neon in armv8 +Patch1009: 1009-zlib-anolis-Neon-Optimized-chunkcopy_neon.patch BuildRequires: automake, autoconf, libtool @@ -91,6 +93,7 @@ developing applications which use minizip. %patch1005 -p1 %patch1007 -p1 %patch1008 -p1 +%patch1009 -p1 %endif %ifarch x86_64 @@ -173,6 +176,9 @@ find $RPM_BUILD_ROOT -name '*.la' -delete %changelog +* Tue Aug 30 2022 binbin Xu - 1.2.11-17.0.4 +- add optimized chunk copy by neon patch for aarch64 + * Thu Aug 25 2022 binbin Xu - 1.2.11-17.0.3 - add optimized crc32 with pmul mix crc patch for aarch64 -- Gitee
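
A note for reviewers of the ported chunkset code: the core trick in chunkcopy_neon above is that every store is a full 16-byte NEON vector, with the first (possibly partial) chunk aligned so the loop only ever advances by whole vectors; the caller (inflate_fast, via INFLATE_FAST_MIN_LEFT = 258) guarantees there is always room to over-write past out + len. The standalone sketch below illustrates just that trick; the function name demo_chunkcopy, the buffer sizes, and the 50-byte test length are illustrative and not part of the patch, and it assumes an aarch64 toolchain where __ARM_NEON is defined.

/* Illustrative sketch only, not part of the patch above. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t *demo_chunkcopy(uint8_t *out, const uint8_t *from, unsigned len) {
    /* Handle the remainder of len modulo 16 first, so every later
     * iteration copies exactly one full 16-byte vector. */
    unsigned align = ((len - 1) % 16) + 1;
    uint8x16_t chunk = vld1q_u8(from);
    vst1q_u8(out, chunk);      /* may read/write a full 16 bytes even if align < 16 */
    out += align;
    from += align;
    len -= align;
    while (len > 0) {          /* len is now a multiple of 16 */
        chunk = vld1q_u8(from);
        vst1q_u8(out, chunk);
        out += 16;
        from += 16;
        len -= 16;
    }
    return out;
}

int main(void) {
    uint8_t src[64];
    uint8_t dst[64 + 16];      /* 16 bytes of slack for the permitted over-write */
    for (int i = 0; i < 64; i++) src[i] = (uint8_t)i;
    memset(dst, 0xff, sizeof(dst));
    demo_chunkcopy(dst, src, 50);   /* 50 = 2-byte remainder + 3 full chunks */
    printf("copy ok: %d\n", memcmp(dst, src, 50) == 0);
    return 0;
}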
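
Likewise, the small-distance path in chunkmemset_neon relies on broadcasting a 1-, 2-, 4- or 8-byte match pattern across a whole vector (vld1q_dup_u8, vdupq_n_u16/u32/u64), turning short-distance RLE-style matches into full-width stores. A minimal sketch of the 2-byte case follows; it is illustrative only, assumes an aarch64 toolchain, and the buffer contents and lengths are made up for the demonstration.

/* Illustrative sketch only, not part of the patch above. */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    uint8_t out[64] = "ab";   /* pretend the match source (out - dist) holds "ab", dist == 2 */
    uint16_t pattern;
    memcpy(&pattern, out, 2);                /* unaligned-safe 2-byte load, as zmemcpy_2 does */
    uint8x16_t chunk = vreinterpretq_u8_u16(vdupq_n_u16(pattern));  /* "ab" repeated 8 times */
    uint8_t *p = out + 2;                    /* continue the match right after the pattern */
    unsigned len = 32;                       /* bytes still to emit */
    while (len >= 16) {                      /* dist 2 divides 16 evenly, so advance by a full chunk */
        vst1q_u8(p, chunk);
        p += 16;
        len -= 16;
    }
    printf("%.34s\n", (const char *)out);    /* prints "ab" repeated 17 times */
    return 0;
}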