diff --git a/1007-zlib-anolis-Neon-Optimized-adler32.patch b/1007-zlib-anolis-Neon-Optimized-adler32.patch
new file mode 100644
index 0000000000000000000000000000000000000000..a8e3e424c85e3eb4582b5ab862c9f0ef66bfd718
--- /dev/null
+++ b/1007-zlib-anolis-Neon-Optimized-adler32.patch
@@ -0,0 +1,247 @@
+diff -uNr zlib-1.2.11/adler32.c ../zlib-1.2.11/adler32.c
+--- zlib-1.2.11/adler32.c	2017-01-01 15:37:10.000000000 +0800
++++ ../zlib-1.2.11/adler32.c	2022-08-23 11:44:03.435348281 +0800
+@@ -59,6 +59,11 @@
+ #  define MOD63(a) a %= BASE
+ #endif
+ 
++/* adler32 neon optimize flag */
++#define ENABLE_ADLER32_NEON 1
++#ifdef ENABLE_ADLER32_NEON
++extern uLong adler32_neon(uLong adler, const Bytef *buf, z_size_t len);
++#endif
+ /* ========================================================================= */
+ uLong ZEXPORT adler32_z(adler, buf, len)
+     uLong adler;
+@@ -68,6 +73,10 @@
+     unsigned long sum2;
+     unsigned n;
+ 
++#ifdef ENABLE_ADLER32_NEON
++    if (buf != Z_NULL)  /* Z_NULL requests the initial value: keep that on the C path */
++        return adler32_neon(adler, buf, len);
++#endif
+     /* split Adler-32 into component sums */
+     sum2 = (adler >> 16) & 0xffff;
+     adler &= 0xffff;
+diff -uNr zlib-1.2.11/adler32_neon.S ../zlib-1.2.11/adler32_neon.S
+--- zlib-1.2.11/adler32_neon.S	1970-01-01 08:00:00.000000000 +0800
++++ ../zlib-1.2.11/adler32_neon.S	2022-08-23 11:26:42.438306324 +0800
+@@ -0,0 +1,178 @@
++/**********************************************************************
++  Copyright(c) 2019 Arm Corporation All rights reserved.
++
++  Redistribution and use in source and binary forms, with or without
++  modification, are permitted provided that the following conditions
++  are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in
++      the documentation and/or other materials provided with the
++      distribution.
++    * Neither the name of Arm Corporation nor the names of its
++      contributors may be used to endorse or promote products derived
++      from this software without specific prior written permission.
++
++  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++**********************************************************************/
++	.arch armv8-a+crypto
++	.text
++	.align	3
++
++/*
++Macros
++*/
++
++.macro	declare_var_vector_reg name:req,reg:req
++	\name\()_q	.req	q\reg
++	\name\()_v	.req	v\reg
++	\name\()_s	.req	s\reg
++	\name\()_d	.req	d\reg
++.endm
++
++.macro	mod_adler	dest:req,tmp:req
++	umull	\tmp\()_x,\dest,const_div1
++	lsr	\tmp\()_x,\tmp\()_x,47
++	msub	\dest,\tmp,const_div2,\dest
++.endm
++
++/*
++	uLong adler32_neon(uLong adler32, const Bytef *start, z_size_t length);
++*/
++/*
++Arguments list
++*/
++	adler32	.req	w0
++	start	.req	x1
++	length	.req	x2
++	.global	adler32_neon
++	.type	adler32_neon, %function
++adler32_neon:
++/*
++local variables
++*/
++	declare_var_vector_reg factor0 , 6
++	declare_var_vector_reg factor1 , 7
++	declare_var_vector_reg d0 , 4
++	declare_var_vector_reg d1 , 5
++	declare_var_vector_reg adacc , 2
++	declare_var_vector_reg s2acc , 3
++	declare_var_vector_reg zero , 16
++	declare_var_vector_reg adler , 17
++	declare_var_vector_reg back_d0 , 18
++	declare_var_vector_reg back_d1 , 19
++	declare_var_vector_reg sum2 , 20
++	declare_var_vector_reg tmp2 , 20
++
++	adler0		.req	w4
++	adler1		.req	w5
++	adler0_x	.req	x4
++	adler1_x	.req	x5
++	end		.req	x0
++	tmp		.req	w8
++	tmp_x		.req	x8
++	tmp1_x		.req	x9
++	loop_cnt	.req	x10
++	loop_const	.req	x11
++	const_div1	.req	w6
++	const_div2	.req	w7
++	mov	const_div1, 32881
++	movk	const_div1, 0x8007, lsl 16
++	mov	const_div2, 65521
++	and	adler0, adler32, 0xffff
++	lsr	adler1, adler32, 16
++
++	lsr	loop_cnt,length,5
++	adrp	x3,factors
++	add	x3,x3,:lo12:factors
++	ld1	{factor0_v.16b-factor1_v.16b},[x3]
++
++	add	end,start,length
++	cbz	loop_cnt,final_accum32
++	/* no look-ahead preload: accum32_neon loads each block itself */
++	mov	loop_const,173
++
++	movi	v16.4s,0
++
++/* Each pass handles at most 173 32-byte blocks, then reduces both
++   sums with mod_adler before the next pass starts. */
++
++great_than_32:
++	cmp	loop_cnt,173
++	csel	loop_const,loop_cnt,loop_const,le
++	mov	adacc_v.16b,zero_v.16b
++	mov	s2acc_v.16b,zero_v.16b
++	ins	adacc_v.s[0],adler0
++	ins	s2acc_v.s[0],adler1
++	add	tmp_x,start,loop_const,lsl 5
++
++accum32_neon:
++	/* Load only the 32 bytes this iteration consumes.  Preloading the
++	   next block here would read up to 32 bytes past the end of the
++	   buffer on the final iteration. */
++	ld1	{d0_v.16b-d1_v.16b},[start],32
++
++	shl	tmp2_v.4s,adacc_v.4s,5
++	add	s2acc_v.4s,s2acc_v.4s,tmp2_v.4s
++
++	uaddlp	adler_v.8h,d0_v.16b
++	uadalp	adler_v.8h,d1_v.16b
++	uadalp	adacc_v.4s,adler_v.8h
++
++	umull	sum2_v.8h,factor0_v.8b ,d0_v.8b
++	umlal2	sum2_v.8h,factor0_v.16b,d0_v.16b
++	umlal	sum2_v.8h,factor1_v.8b ,d1_v.8b
++	umlal2	sum2_v.8h,factor1_v.16b,d1_v.16b
++	uadalp	s2acc_v.4s,sum2_v.8h
++
++	cmp	start,tmp_x
++	bne	accum32_neon
++
++	uaddlv	adacc_d,adacc_v.4s
++	uaddlv	s2acc_d,s2acc_v.4s
++	fmov	adler0_x,adacc_d
++	fmov	adler1_x,s2acc_d
++
++	mod_adler	adler0,tmp
++	mod_adler	adler1,tmp
++	sub	loop_cnt,loop_cnt,loop_const
++	cbnz	loop_cnt,great_than_32
++
++final_accum32:
++	and	length,length,31
++	cbz	length,end_func
++
++accum32_body:
++	cmp	start,end
++	beq	end_func
++	ldrb	tmp,[start],1
++	add	adler0,adler0,tmp
++	add	adler1,adler1,adler0
++	b	accum32_body
++
++end_func:
++	mod_adler	adler0,tmp
++	mod_adler	adler1,tmp
++	orr	w0,adler0,adler1,lsl 16
++	ret
++
++	.size	adler32_neon, .-adler32_neon
++	.section	.rodata.cst16,"aM",@progbits,16
++	.align	4
++factors:
++	.quad	0x191a1b1c1d1e1f20
++	.quad	0x1112131415161718
++	.quad	0x090a0b0c0d0e0f10
++	.quad	0x0102030405060708
++
+diff -uNr zlib-1.2.11/Makefile.in ../zlib-1.2.11/Makefile.in
+--- zlib-1.2.11/Makefile.in	2017-01-16 01:29:40.000000000 +0800
++++ ../zlib-1.2.11/Makefile.in	2022-08-23 11:28:46.078154357 +0800
+@@ -57,11 +57,11 @@
+ ZINC=
+ ZINCOUT=-I.
+ 
+-OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
++OBJZ = adler32.o adler32_neon.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
+ OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o
+ OBJC = $(OBJZ) $(OBJG)
+ 
+-PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
++PIC_OBJZ = adler32.lo adler32_neon.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
+ PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo
+ PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)
+ 
+@@ -159,6 +159,9 @@
+ adler32.o: $(SRCDIR)adler32.c
+ 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)adler32.c
+ 
++adler32_neon.o: $(SRCDIR)adler32_neon.S
++	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)adler32_neon.S
++
+ crc32.o: $(SRCDIR)crc32.c
+ 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
+ 
+@@ -207,6 +210,11 @@
+ 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/adler32.o $(SRCDIR)adler32.c
+ 	-@mv objs/adler32.o $@
+ 
++adler32_neon.lo: $(SRCDIR)adler32_neon.S
++	-@mkdir objs 2>/dev/null || test -d objs
++	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/adler32_neon.o $(SRCDIR)adler32_neon.S
++	-@mv objs/adler32_neon.o $@
++
+ crc32.lo: $(SRCDIR)crc32.c
+ 	-@mkdir objs 2>/dev/null || test -d objs
+ 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
diff --git a/zlib.spec b/zlib.spec
index a6887523154c5c019eac4b7a9292d25e3fd8a83f..7bffece1b6874f4ece781f7516b3f1cd059357d4 100644
--- a/zlib.spec
+++ b/zlib.spec
@@ -26,6 +26,8 @@ Patch1004: 1004-zlib-anolis-compute-crc32-using-armv8-specific-instruction.patch
 Patch1005: 1005-zlib-anolis-ARM-optimized-insert_string.patch
 # x86_64 optimized slide_hash
 Patch1006: 1006-zlib-anolis-Optimize-slide_hash.patch
+# optimized adler32 function in armv8
+Patch1007: 1007-zlib-anolis-Neon-Optimized-adler32.patch
 
 BuildRequires: automake, autoconf, libtool
 
@@ -85,6 +87,7 @@ developing applications which use minizip.
 %patch1003 -p1
 %patch1004 -p1
 %patch1005 -p1
+%patch1007 -p1
 %endif
 
 %ifarch x86_64