diff --git a/1008-zlib-anolis-Optimized-crc32-pmul-mix.patch b/1008-zlib-anolis-Optimized-crc32-pmul-mix.patch new file mode 100644 index 0000000000000000000000000000000000000000..dbfecf54c199ce31f1e3970b853b01f9fd07a1a8 --- /dev/null +++ b/1008-zlib-anolis-Optimized-crc32-pmul-mix.patch @@ -0,0 +1,580 @@ +diff -uNr zlib-1.2.11/contrib/arm/crc32_common_mix.S ../zlib-1.2.11/contrib/arm/crc32_common_mix.S +--- zlib-1.2.11/contrib/arm/crc32_common_mix.S 1970-01-01 08:00:00.000000000 +0800 ++++ ../zlib-1.2.11/contrib/arm/crc32_common_mix.S 2022-08-25 19:26:40.154179151 +0800 +@@ -0,0 +1,437 @@ ++/********************************************************************** ++ Copyright(c) 2020 Arm Corporation All rights reserved. ++ ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ * Neither the name of Arm Corporation nor the names of its ++ contributors may be used to endorse or promote products derived ++ from this software without specific prior written permission. ++ ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++**********************************************************************/ ++ ++ ++.macro declare_var_vector_reg name:req,reg:req ++ \name\()_q .req q\reg ++ \name\()_v .req v\reg ++ \name\()_s .req s\reg ++ \name\()_d .req d\reg ++.endm ++ declare_var_vector_reg k1k2,20 ++ declare_var_vector_reg k3k4,21 ++ declare_var_vector_reg poly,22 ++ declare_var_vector_reg k5k0,23 ++ declare_var_vector_reg mask,24 ++ declare_var_vector_reg fold_poly,25 ++ ++ declare_var_vector_reg tmp0,0 ++ declare_var_vector_reg tmp1,1 ++ declare_var_vector_reg tmp2,2 ++ declare_var_vector_reg tmp3,3 ++ declare_var_vector_reg tmp4,4 ++ declare_var_vector_reg tmp5,5 ++ declare_var_vector_reg tmp6,6 ++ declare_var_vector_reg tmp7,7 ++ declare_var_vector_reg pmull_data0,16 ++ declare_var_vector_reg pmull_data1,17 ++ declare_var_vector_reg pmull_data2,18 ++ declare_var_vector_reg pmull_data3,19 ++ ++ vzr .req v26 ++ ++ const_addr .req x3 ++ crc_blk_ptr .req x4 ++ pmull_blk_ptr .req x5 ++ crc_data0 .req x6 ++ crc_data1 .req x7 ++ crc_data2 .req x9 ++ crc_data3 .req x10 ++ wPmull .req w11 ++ xPmull .req x11 ++ ++ data0 .req x4 ++ data1 .req x5 ++ data2 .req x6 ++ data3 .req x7 ++ wdata .req w4 ++ ++.macro pmull_fold ++ ++ pmull2 tmp4_v.1q, tmp0_v.2d, k1k2_v.2d ++ pmull2 tmp5_v.1q, tmp1_v.2d, k1k2_v.2d ++ pmull2 tmp6_v.1q, tmp2_v.2d, k1k2_v.2d ++ pmull2 tmp7_v.1q, tmp3_v.2d, k1k2_v.2d ++ ++ pmull tmp0_v.1q, tmp0_v.1d, k1k2_v.1d ++ pmull tmp1_v.1q, tmp1_v.1d, k1k2_v.1d ++ pmull tmp2_v.1q, tmp2_v.1d, k1k2_v.1d ++ pmull tmp3_v.1q, tmp3_v.1d, k1k2_v.1d ++ ld1 {pmull_data0_v.16b-pmull_data3_v.16b},[pmull_blk_ptr],#64 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ ++ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b ++ eor tmp1_v.16b, tmp1_v.16b, tmp5_v.16b ++ eor tmp2_v.16b, tmp2_v.16b, tmp6_v.16b ++ eor tmp3_v.16b, tmp3_v.16b, tmp7_v.16b ++ ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, v16.16b ++ eor tmp1_v.16b, tmp1_v.16b, v17.16b ++ eor tmp2_v.16b, tmp2_v.16b, v18.16b ++ eor tmp3_v.16b, tmp3_v.16b, v19.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++.endm ++ ++ ++ ++.macro crc32_common_mix poly_type ++ .set MIX_BLK_SIZE,2048 ++ ++.ifc \poly_type,crc32 ++ mvn wCRC,wCRC ++.endif ++ cmp LEN,MIX_BLK_SIZE-1 ++ adr const_addr, .Lconstants ++ bls start_final ++ ld1 {k1k2_v.16b,k3k4_v.16b,poly_v.16b},[const_addr],#48 ++ movi vzr.16b, #0 ++ ld1 {k5k0_v.8b,mask_v.8b,fold_poly_v.8b},[const_addr] ++ ++loop_2048: ++ ld1 {tmp0_v.16b-tmp3_v.16b}, [BUF] ++ add pmull_blk_ptr,BUF,0x40 ++ add crc_blk_ptr, BUF,512 ++ mov tmp4_v.16b,vzr.16b ++ fmov tmp4_s, wCRC ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ eor tmp0_v.16b,tmp0_v.16b,tmp4_v.16b ++ mov wCRC, 0 ++ sub LEN,LEN,MIX_BLK_SIZE ++ cmp LEN,MIX_BLK_SIZE ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ ++ pmull_fold ++ pmull_fold ++ 
pmull_fold ++ pmull_fold ++ pmull_fold ++ pmull_fold ++ pmull_fold ++ ++ /* Folding cache line into 128bit */ ++ pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp2_v.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp3_v.16b ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ ++ ++ /** ++ * perform the last 64 bit fold, also ++ * adds 32 zeroes to the input stream ++ */ ++ ext tmp1_v.16b, tmp0_v.16b, tmp0_v.16b, #8 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 
wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ pmull2 tmp1_v.1q, tmp1_v.2d, k3k4_v.2d ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ ext tmp0_v.16b, tmp0_v.16b, vzr.16b, #8 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ ++ /* final 32-bit fold */ ++ ext tmp1_v.16b, tmp0_v.16b, vzr.16b, #4 ++ and tmp0_v.16b, tmp0_v.16b, mask_v.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ pmull tmp0_v.1q, tmp0_v.1d, k5k0_v.1d ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b ++ ++ /** ++ * Finish up with the bit-reversed barrett ++ * reduction 64 ==> 32 bits ++ */ ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ and tmp1_v.16b, tmp0_v.16b, mask_v.16b ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ ext tmp1_v.16b, vzr.16b, tmp1_v.16b, #8 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ pmull2 tmp1_v.1q, tmp1_v.2d, poly_v.2d ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ and tmp1_v.16b, tmp1_v.16b, mask_v.16b ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ pmull tmp1_v.1q, tmp1_v.1d, poly_v.1d ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ mov tmp4_v.16b,vzr.16b ++ mov tmp4_v.s[0], tmp0_v.s[1] ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 
wCRC,wCRC,crc_data3 ++ ++ pmull tmp4_v.1q, tmp4_v.1d, fold_poly_v.1d ++ add BUF,BUF,MIX_BLK_SIZE ++ fmov xPmull, tmp4_d ++ crc32_u64 wPmull, wzr, xPmull ++ eor wCRC, wPmull, wCRC ++ bge loop_2048 ++start_final: ++ cmp LEN, 63 ++ bls .loop_16B ++.p2align 6 ++.loop_64B: ++ ldp data0, data1, [BUF],#16 ++ sub LEN,LEN,#64 ++ ldp data2, data3, [BUF],#16 ++ crc32_u64 wCRC, wCRC, data0 ++ crc32_u64 wCRC, wCRC, data1 ++ ldp data0, data1, [BUF],#16 ++ crc32_u64 wCRC, wCRC, data2 ++ crc32_u64 wCRC, wCRC, data3 ++ ldp data2, data3, [BUF],#16 ++ crc32_u64 wCRC, wCRC, data0 ++ crc32_u64 wCRC, wCRC, data1 ++ crc32_u64 wCRC, wCRC, data2 ++ crc32_u64 wCRC, wCRC, data3 ++ cmp LEN,#64 ++ bge .loop_64B ++ cbz LEN, .finish_exit ++ ++.p2align 6 ++.loop_16B: ++ cmp LEN, 15 ++ bls .less_16B ++ ldp data0, data1, [BUF],#16 ++ sub LEN,LEN,#16 ++ crc32_u64 wCRC, wCRC, data0 ++ crc32_u64 wCRC, wCRC, data1 ++ cmp LEN,15 ++ bls .less_16B ++ ldp data0, data1, [BUF],#16 ++ sub LEN,LEN,#16 ++ crc32_u64 wCRC, wCRC, data0 ++ crc32_u64 wCRC, wCRC, data1 ++ cmp LEN,15 ++ bls .less_16B ++ ldp data0, data1, [BUF],#16 ++ sub LEN,LEN,#16 //MUST less than 16B ++ crc32_u64 wCRC, wCRC, data0 ++ crc32_u64 wCRC, wCRC, data1 ++.less_16B: ++ cbz LEN, .finish_exit ++ cmp LEN, 7 ++ bls .less_8B ++ ldr data0, [BUF], 8 ++ sub LEN, LEN, #8 ++ crc32_u64 wCRC, wCRC, data0 ++.less_8B: ++ cbz LEN, .finish_exit ++ cmp LEN, 3 ++ bls .less_4B ++ ldr wdata, [BUF], 4 ++ sub LEN, LEN, #4 ++ crc32_u32 wCRC, wCRC, wdata ++.less_4B: ++ cbz LEN, .finish_exit ++ cmp LEN, 1 ++ bls .less_2B ++ ldrh wdata, [BUF], 2 ++ sub LEN, LEN, #2 ++ crc32_u16 wCRC, wCRC, wdata ++.less_2B: ++ cbz LEN, .finish_exit ++ ldrb wdata, [BUF] ++ crc32_u8 wCRC, wCRC, wdata ++.finish_exit: ++.ifc \poly_type,crc32 ++ mvn w0, wCRC ++.else ++ mov w0, wCRC ++.endif ++ ret ++.endm +diff -uNr zlib-1.2.11/contrib/arm/crc32_mix.S ../zlib-1.2.11/contrib/arm/crc32_mix.S +--- zlib-1.2.11/contrib/arm/crc32_mix.S 1970-01-01 08:00:00.000000000 +0800 ++++ ../zlib-1.2.11/contrib/arm/crc32_mix.S 2022-08-25 19:44:42.946181615 +0800 +@@ -0,0 +1,71 @@ ++/********************************************************************** ++ Copyright(c) 2020 Arm Corporation All rights reserved. ++ ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ * Neither the name of Arm Corporation nor the names of its ++ contributors may be used to endorse or promote products derived ++ from this software without specific prior written permission. ++ ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++**********************************************************************/ ++#if defined(__ARM_FEATURE_CRC32) && defined(ARM_NEON) ++ ++ .text ++ .align 6 ++ .arch armv8-a+crypto+crc ++ ++#include "crc32_common_mix.S" ++.Lconstants: ++ .octa 0x00000001c6e415960000000154442bd4 ++ .octa 0x00000000ccaa009e00000001751997d0 ++ .octa 0x00000001F701164100000001DB710641 ++ .quad 0x0000000163cd6124 ++ .quad 0x00000000FFFFFFFF ++ .quad 0x000000001753ab84 ++.macro crc32_u64 dst,src,data ++ crc32x \dst,\src,\data ++.endm ++.macro crc32_u32 dst,src,data ++ crc32w \dst,\src,\data ++.endm ++.macro crc32_u16 dst,src,data ++ crc32h \dst,\src,\data ++.endm ++.macro crc32_u8 dst,src,data ++ crc32b \dst,\src,\data ++.endm ++ ++ ++/** ++ * uint32_t ptg_crc32(uint32_t crc, const unsigned char *data, size_t len); ++ * ++ */ ++ BUF .req x1 ++ LEN .req x2 ++ CRC .req x0 ++ wCRC .req w0 ++ .align 6 ++ .global ptg_crc32 ++ .type ptg_crc32, %function ++ptg_crc32: ++ crc32_common_mix crc32 ++ .size ptg_crc32, .-ptg_crc32 ++#endif +diff -uNr zlib-1.2.11/crc32.c ../zlib-1.2.11/crc32.c +--- zlib-1.2.11/crc32.c 2022-08-25 19:18:06.943264265 +0800 ++++ ../zlib-1.2.11/crc32.c 2022-08-25 19:30:51.765812383 +0800 +@@ -206,6 +206,10 @@ + extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t); + #endif + ++#if defined(__ARM_FEATURE_CRC32) && defined(ARM_NEON) ++extern uint32_t ptg_crc32(uint32_t, const unsigned char *, uint64_t); ++#endif ++ + /* ========================================================================= */ + unsigned long ZEXPORT crc32_z(crc, buf, len) + unsigned long crc; +@@ -225,7 +229,9 @@ + + endian = 1; + if (*((unsigned char *)(&endian))) +-#if defined(__ARM_FEATURE_CRC32) ++#if defined(__ARM_FEATURE_CRC32) && defined(ARM_NEON) ++ return ptg_crc32(crc, buf, len); ++#elif defined(__ARM_FEATURE_CRC32) + return crc32_acle(crc, buf, len); + #else + return crc32_little(crc, buf, len); +diff -uNr zlib-1.2.11/Makefile.in ../zlib-1.2.11/Makefile.in +--- zlib-1.2.11/Makefile.in 2022-08-25 19:18:23.206722650 +0800 ++++ ../zlib-1.2.11/Makefile.in 2022-08-25 19:35:00.657538475 +0800 +@@ -57,11 +57,11 @@ + ZINC= + ZINCOUT=-I. 
+ +-OBJZ = adler32.o adler32_neon.o crc32_acle.o crc32.o deflate.o infback.o inffast.o inffast.o inflate.o inftrees.o trees.o zutil.o ++OBJZ = adler32.o adler32_neon.o crc32_acle.o crc32_mix.o crc32.o deflate.o infback.o inffast.o inffast.o inflate.o inftrees.o trees.o zutil.o + OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o + OBJC = $(OBJZ) $(OBJG) + +-PIC_OBJZ = adler32.lo adler32_neon.lo crc32_acle.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo ++PIC_OBJZ = adler32.lo adler32_neon.lo crc32_acle.lo crc32_mix.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo + PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo + PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG) + +@@ -164,6 +164,9 @@ + crc32_acle.o: $(SRCDIR)contrib/arm/crc32_acle.c + $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -c -o $@ $(SRCDIR)contrib/arm/crc32_acle.c + ++crc32_mix.o: $(SRCDIR)contrib/arm/crc32_mix.S ++ $(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -c -o $@ $(SRCDIR)contrib/arm/crc32_mix.S ++ + crc32.o: $(SRCDIR)crc32.c + $(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c + +@@ -221,6 +224,11 @@ + $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -DPIC -c -o objs/crc32_acle.o $(SRCDIR)contrib/arm/crc32_acle.c + -@mv objs/crc32_acle.o $@ + ++crc32_mix.lo: $(SRCDIR)contrib/arm/crc32_mix.S ++ -@mkdir objs 2>/dev/null || test -d objs ++ $(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -DPIC -c -o objs/crc32_mix.o $(SRCDIR)contrib/arm/crc32_mix.S ++ -@mv objs/crc32_mix.o $@ ++ + crc32.lo: $(SRCDIR)crc32.c + -@mkdir objs 2>/dev/null || test -d objs + $(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c diff --git a/zlib.spec b/zlib.spec index 801c6f2a457eabb8bbcf3b91489082bd10a6800e..749ff834594af4b2f6dffe743dbd60d437f35e42 100644 --- a/zlib.spec +++ b/zlib.spec @@ -1,4 +1,4 @@ -%define anolis_release .0.2 +%define anolis_release .0.3 # disabled, per rhbz#1609830 and rhbz#1602742 %bcond_with minizip @@ -28,6 +28,8 @@ Patch1005: 1005-zlib-anolis-ARM-optimized-insert_string.patch Patch1006: 1006-zlib-anolis-Optimize-slide_hash.patch # optimized adler32 function in armv8 Patch1007: 1007-zlib-anolis-Neon-Optimized-adler32.patch +# optimized crc32 function with crc32 + pmul instruction in armv8 +Patch1008: 1008-zlib-anolis-Optimized-crc32-pmul-mix.patch BuildRequires: automake, autoconf, libtool @@ -88,6 +90,7 @@ developing applications which use minizip. %patch1004 -p1 %patch1005 -p1 %patch1007 -p1 +%patch1008 -p1 %endif %ifarch x86_64 @@ -170,6 +173,9 @@ find $RPM_BUILD_ROOT -name '*.la' -delete %changelog +* Thu Aug 25 2022 binbin Xu - 1.2.11-17.0.3 +- add optimized crc32 with pmul mix crc patch for aarch64 + * Wed Aug 24 2022 binbin Xu - 1.2.11-17.0.2 - add optimized adler32_neon patch for aarch64
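
Note on how the mixed kernel works: crc32_common_mix walks the input in 2048-byte blocks (MIX_BLK_SIZE) and runs two independent CRC streams over each block. The first 512 bytes (the initial ld1 of 64 bytes plus seven pmull_fold rounds of 64 bytes each) are folded with PMULL carry-less multiplies on the NEON unit, while the remaining 1536 bytes starting at crc_blk_ptr = BUF+512 are fed to the scalar crc32x instructions; the two instruction streams are hand-interleaved so the vector and scalar pipelines run concurrently. At the end of each block the PMULL-side CRC is shifted across the 1536 scalar bytes (the pmull against fold_poly followed by crc32_u64 wPmull, wzr, xPmull) and XORed into the scalar CRC. The merge rests on the same linearity identity that zlib already exposes as crc32_combine(). The following is a minimal, standalone C sketch of that identity with the kernel's 512/1536 split — illustrative only: it models the math rather than the register-level sequence, and the names are invented here, not taken from the patch.

#include <assert.h>
#include <stdio.h>
#include <zlib.h>

#define BLK        2048   /* MIX_BLK_SIZE in the assembly   */
#define PMULL_PART 512    /* bytes folded on the NEON side  */

int main(void) {
    unsigned char buf[BLK];
    for (int i = 0; i < BLK; i++)
        buf[i] = (unsigned char)(i * 131 + 7);    /* arbitrary test data */

    /* Single pass over the whole block. */
    uLong whole = crc32(0L, buf, BLK);

    /* Two independent partial CRCs, as the mixed kernel computes them. */
    uLong head = crc32(0L, buf, PMULL_PART);
    uLong tail = crc32(0L, buf + PMULL_PART, BLK - PMULL_PART);

    /* Merge by shifting the head CRC across the tail's length; in the
       assembly this is the fold_poly multiply plus the crc32x reduction. */
    uLong mixed = crc32_combine(head, tail, BLK - PMULL_PART);

    assert(mixed == whole);
    printf("whole=%08lx mixed=%08lx\n", whole, mixed);
    return 0;
}

Build with "cc sketch.c -lz"; the assert holding for arbitrary data is why the two streams can be computed independently and merged once per block.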
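
The .Lconstants table in crc32_mix.S holds the familiar folding constants for the reflected zlib CRC-32 polynomial — k1k2/k3k4 for the wide folds, k5 and the Barrett pair for the 128-to-32-bit reduction — plus one constant specific to the mix, fold_poly (0x1753ab84). That last value appears to act as the x^(8*1536) shift for the combine step: a single pmull by fold_poly repositions the PMULL-side CRC past the 1536 scalar bytes, and the crc32x against wzr then reduces the resulting 64-bit product to a 32-bit remainder without any table lookups.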
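
Inputs (or block remainders) shorter than 2048 bytes never touch the vector path: start_final drains them with crc32 instructions alone, first in 64-byte strides, then through the 16/8/4/2/1-byte ladder, so the PMULL machinery is only entered when at least one full block is available.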
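
A build-time note: the body of crc32_mix.S is wrapped in #if defined(__ARM_FEATURE_CRC32) && defined(ARM_NEON), so the new crc32_mix.o/.lo objects assemble to nothing unless CFLAGS both target CRC-capable hardware (e.g. -march=armv8-a+crc+crypto, which makes the compiler predefine __ARM_FEATURE_CRC32) and define ARM_NEON. The matching guard added to crc32_z() keeps dispatch consistent: on little-endian machines it selects ptg_crc32() when both macros are set, falls back to crc32_acle() when only the CRC extension is available, and to the table-driven crc32_little() otherwise.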