From 8e2e5faa0383a37a908a2579355a2b98f8f4f772 Mon Sep 17 00:00:00 2001
From: xubinbin
Date: Thu, 25 Aug 2022 20:05:53 +0800
Subject: [PATCH] Use the crc_mix implementation from the Arm community.
 Interleaving the crc32 and pmull instructions improves CPU pipeline
 efficiency; according to the Arm community and local tests, the crc32
 interface becomes roughly 30% faster for buffers larger than 1 KB.

upstream link: https://github.com/intel/isa-l/blob/master/crc/aarch64/crc32_common_mix_neoverse_n1.S
---
An illustrative C sketch of how callers reach the new code path is appended
after the patch.

 ...zlib-anolis-Optimized-crc32-pmul-mix.patch | 580 ++++++++++++++++++
 zlib.spec                                     |   8 +-
 2 files changed, 587 insertions(+), 1 deletion(-)
 create mode 100644 1008-zlib-anolis-Optimized-crc32-pmul-mix.patch

diff --git a/1008-zlib-anolis-Optimized-crc32-pmul-mix.patch b/1008-zlib-anolis-Optimized-crc32-pmul-mix.patch
new file mode 100644
index 0000000..dbfecf5
--- /dev/null
+++ b/1008-zlib-anolis-Optimized-crc32-pmul-mix.patch
@@ -0,0 +1,580 @@
+diff -uNr zlib-1.2.11/contrib/arm/crc32_common_mix.S ../zlib-1.2.11/contrib/arm/crc32_common_mix.S
+--- zlib-1.2.11/contrib/arm/crc32_common_mix.S	1970-01-01 08:00:00.000000000 +0800
++++ ../zlib-1.2.11/contrib/arm/crc32_common_mix.S	2022-08-25 19:26:40.154179151 +0800
+@@ -0,0 +1,437 @@
++/**********************************************************************
++  Copyright(c) 2020 Arm Corporation All rights reserved.
++
++  Redistribution and use in source and binary forms, with or without
++  modification, are permitted provided that the following conditions
++  are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in
++      the documentation and/or other materials provided with the
++      distribution.
++    * Neither the name of Arm Corporation nor the names of its
++      contributors may be used to endorse or promote products derived
++      from this software without specific prior written permission.
++
++  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++**********************************************************************/ ++ ++ ++.macro declare_var_vector_reg name:req,reg:req ++ \name\()_q .req q\reg ++ \name\()_v .req v\reg ++ \name\()_s .req s\reg ++ \name\()_d .req d\reg ++.endm ++ declare_var_vector_reg k1k2,20 ++ declare_var_vector_reg k3k4,21 ++ declare_var_vector_reg poly,22 ++ declare_var_vector_reg k5k0,23 ++ declare_var_vector_reg mask,24 ++ declare_var_vector_reg fold_poly,25 ++ ++ declare_var_vector_reg tmp0,0 ++ declare_var_vector_reg tmp1,1 ++ declare_var_vector_reg tmp2,2 ++ declare_var_vector_reg tmp3,3 ++ declare_var_vector_reg tmp4,4 ++ declare_var_vector_reg tmp5,5 ++ declare_var_vector_reg tmp6,6 ++ declare_var_vector_reg tmp7,7 ++ declare_var_vector_reg pmull_data0,16 ++ declare_var_vector_reg pmull_data1,17 ++ declare_var_vector_reg pmull_data2,18 ++ declare_var_vector_reg pmull_data3,19 ++ ++ vzr .req v26 ++ ++ const_addr .req x3 ++ crc_blk_ptr .req x4 ++ pmull_blk_ptr .req x5 ++ crc_data0 .req x6 ++ crc_data1 .req x7 ++ crc_data2 .req x9 ++ crc_data3 .req x10 ++ wPmull .req w11 ++ xPmull .req x11 ++ ++ data0 .req x4 ++ data1 .req x5 ++ data2 .req x6 ++ data3 .req x7 ++ wdata .req w4 ++ ++.macro pmull_fold ++ ++ pmull2 tmp4_v.1q, tmp0_v.2d, k1k2_v.2d ++ pmull2 tmp5_v.1q, tmp1_v.2d, k1k2_v.2d ++ pmull2 tmp6_v.1q, tmp2_v.2d, k1k2_v.2d ++ pmull2 tmp7_v.1q, tmp3_v.2d, k1k2_v.2d ++ ++ pmull tmp0_v.1q, tmp0_v.1d, k1k2_v.1d ++ pmull tmp1_v.1q, tmp1_v.1d, k1k2_v.1d ++ pmull tmp2_v.1q, tmp2_v.1d, k1k2_v.1d ++ pmull tmp3_v.1q, tmp3_v.1d, k1k2_v.1d ++ ld1 {pmull_data0_v.16b-pmull_data3_v.16b},[pmull_blk_ptr],#64 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ ++ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b ++ eor tmp1_v.16b, tmp1_v.16b, tmp5_v.16b ++ eor tmp2_v.16b, tmp2_v.16b, tmp6_v.16b ++ eor tmp3_v.16b, tmp3_v.16b, tmp7_v.16b ++ ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, v16.16b ++ eor tmp1_v.16b, tmp1_v.16b, v17.16b ++ eor tmp2_v.16b, tmp2_v.16b, v18.16b ++ eor tmp3_v.16b, tmp3_v.16b, v19.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++.endm ++ ++ ++ ++.macro crc32_common_mix poly_type ++ .set MIX_BLK_SIZE,2048 ++ ++.ifc \poly_type,crc32 ++ mvn wCRC,wCRC ++.endif ++ cmp LEN,MIX_BLK_SIZE-1 ++ adr const_addr, .Lconstants ++ bls start_final ++ ld1 {k1k2_v.16b,k3k4_v.16b,poly_v.16b},[const_addr],#48 ++ movi vzr.16b, #0 ++ ld1 {k5k0_v.8b,mask_v.8b,fold_poly_v.8b},[const_addr] ++ ++loop_2048: ++ ld1 {tmp0_v.16b-tmp3_v.16b}, [BUF] ++ add pmull_blk_ptr,BUF,0x40 ++ add crc_blk_ptr, BUF,512 ++ mov tmp4_v.16b,vzr.16b ++ fmov tmp4_s, wCRC ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ eor tmp0_v.16b,tmp0_v.16b,tmp4_v.16b ++ mov wCRC, 0 ++ sub LEN,LEN,MIX_BLK_SIZE ++ cmp LEN,MIX_BLK_SIZE ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ ++ pmull_fold ++ pmull_fold ++ 
pmull_fold ++ pmull_fold ++ pmull_fold ++ pmull_fold ++ pmull_fold ++ ++ /* Folding cache line into 128bit */ ++ pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp2_v.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ pmull2 tmp4_v.1q, tmp0_v.2d, k3k4_v.2d ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ pmull tmp0_v.1q, tmp0_v.1d, k3k4_v.1d ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp4_v.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp3_v.16b ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ ++ ++ /** ++ * perform the last 64 bit fold, also ++ * adds 32 zeroes to the input stream ++ */ ++ ext tmp1_v.16b, tmp0_v.16b, tmp0_v.16b, #8 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 
wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ pmull2 tmp1_v.1q, tmp1_v.2d, k3k4_v.2d ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ ext tmp0_v.16b, tmp0_v.16b, vzr.16b, #8 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ ++ /* final 32-bit fold */ ++ ext tmp1_v.16b, tmp0_v.16b, vzr.16b, #4 ++ and tmp0_v.16b, tmp0_v.16b, mask_v.16b ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ pmull tmp0_v.1q, tmp0_v.1d, k5k0_v.1d ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b ++ ++ /** ++ * Finish up with the bit-reversed barrett ++ * reduction 64 ==> 32 bits ++ */ ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ and tmp1_v.16b, tmp0_v.16b, mask_v.16b ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ ext tmp1_v.16b, vzr.16b, tmp1_v.16b, #8 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ pmull2 tmp1_v.1q, tmp1_v.2d, poly_v.2d ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ and tmp1_v.16b, tmp1_v.16b, mask_v.16b ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ pmull tmp1_v.1q, tmp1_v.1d, poly_v.1d ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ eor tmp0_v.16b, tmp0_v.16b, tmp1_v.16b ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ mov tmp4_v.16b,vzr.16b ++ mov tmp4_v.s[0], tmp0_v.s[1] ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ ldp crc_data0,crc_data1,[crc_blk_ptr],16 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 wCRC,wCRC,crc_data3 ++ ldp crc_data2,crc_data3,[crc_blk_ptr],16 ++ ++ crc32_u64 wCRC,wCRC,crc_data0 ++ crc32_u64 wCRC,wCRC,crc_data1 ++ crc32_u64 wCRC,wCRC,crc_data2 ++ crc32_u64 
wCRC,wCRC,crc_data3 ++ ++ pmull tmp4_v.1q, tmp4_v.1d, fold_poly_v.1d ++ add BUF,BUF,MIX_BLK_SIZE ++ fmov xPmull, tmp4_d ++ crc32_u64 wPmull, wzr, xPmull ++ eor wCRC, wPmull, wCRC ++ bge loop_2048 ++start_final: ++ cmp LEN, 63 ++ bls .loop_16B ++.p2align 6 ++.loop_64B: ++ ldp data0, data1, [BUF],#16 ++ sub LEN,LEN,#64 ++ ldp data2, data3, [BUF],#16 ++ crc32_u64 wCRC, wCRC, data0 ++ crc32_u64 wCRC, wCRC, data1 ++ ldp data0, data1, [BUF],#16 ++ crc32_u64 wCRC, wCRC, data2 ++ crc32_u64 wCRC, wCRC, data3 ++ ldp data2, data3, [BUF],#16 ++ crc32_u64 wCRC, wCRC, data0 ++ crc32_u64 wCRC, wCRC, data1 ++ crc32_u64 wCRC, wCRC, data2 ++ crc32_u64 wCRC, wCRC, data3 ++ cmp LEN,#64 ++ bge .loop_64B ++ cbz LEN, .finish_exit ++ ++.p2align 6 ++.loop_16B: ++ cmp LEN, 15 ++ bls .less_16B ++ ldp data0, data1, [BUF],#16 ++ sub LEN,LEN,#16 ++ crc32_u64 wCRC, wCRC, data0 ++ crc32_u64 wCRC, wCRC, data1 ++ cmp LEN,15 ++ bls .less_16B ++ ldp data0, data1, [BUF],#16 ++ sub LEN,LEN,#16 ++ crc32_u64 wCRC, wCRC, data0 ++ crc32_u64 wCRC, wCRC, data1 ++ cmp LEN,15 ++ bls .less_16B ++ ldp data0, data1, [BUF],#16 ++ sub LEN,LEN,#16 //MUST less than 16B ++ crc32_u64 wCRC, wCRC, data0 ++ crc32_u64 wCRC, wCRC, data1 ++.less_16B: ++ cbz LEN, .finish_exit ++ cmp LEN, 7 ++ bls .less_8B ++ ldr data0, [BUF], 8 ++ sub LEN, LEN, #8 ++ crc32_u64 wCRC, wCRC, data0 ++.less_8B: ++ cbz LEN, .finish_exit ++ cmp LEN, 3 ++ bls .less_4B ++ ldr wdata, [BUF], 4 ++ sub LEN, LEN, #4 ++ crc32_u32 wCRC, wCRC, wdata ++.less_4B: ++ cbz LEN, .finish_exit ++ cmp LEN, 1 ++ bls .less_2B ++ ldrh wdata, [BUF], 2 ++ sub LEN, LEN, #2 ++ crc32_u16 wCRC, wCRC, wdata ++.less_2B: ++ cbz LEN, .finish_exit ++ ldrb wdata, [BUF] ++ crc32_u8 wCRC, wCRC, wdata ++.finish_exit: ++.ifc \poly_type,crc32 ++ mvn w0, wCRC ++.else ++ mov w0, wCRC ++.endif ++ ret ++.endm +diff -uNr zlib-1.2.11/contrib/arm/crc32_mix.S ../zlib-1.2.11/contrib/arm/crc32_mix.S +--- zlib-1.2.11/contrib/arm/crc32_mix.S 1970-01-01 08:00:00.000000000 +0800 ++++ ../zlib-1.2.11/contrib/arm/crc32_mix.S 2022-08-25 19:44:42.946181615 +0800 +@@ -0,0 +1,71 @@ ++/********************************************************************** ++ Copyright(c) 2020 Arm Corporation All rights reserved. ++ ++ Redistribution and use in source and binary forms, with or without ++ modification, are permitted provided that the following conditions ++ are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in ++ the documentation and/or other materials provided with the ++ distribution. ++ * Neither the name of Arm Corporation nor the names of its ++ contributors may be used to endorse or promote products derived ++ from this software without specific prior written permission. ++ ++ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ++ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ++ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ++ A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT ++ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ++ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ++ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ++ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ++ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ++ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++**********************************************************************/ ++#if defined(__ARM_FEATURE_CRC32) && defined(ARM_NEON) ++ ++ .text ++ .align 6 ++ .arch armv8-a+crypto+crc ++ ++#include "crc32_common_mix.S" ++.Lconstants: ++ .octa 0x00000001c6e415960000000154442bd4 ++ .octa 0x00000000ccaa009e00000001751997d0 ++ .octa 0x00000001F701164100000001DB710641 ++ .quad 0x0000000163cd6124 ++ .quad 0x00000000FFFFFFFF ++ .quad 0x000000001753ab84 ++.macro crc32_u64 dst,src,data ++ crc32x \dst,\src,\data ++.endm ++.macro crc32_u32 dst,src,data ++ crc32w \dst,\src,\data ++.endm ++.macro crc32_u16 dst,src,data ++ crc32h \dst,\src,\data ++.endm ++.macro crc32_u8 dst,src,data ++ crc32b \dst,\src,\data ++.endm ++ ++ ++/** ++ * uint32_t ptg_crc32(uint32_t crc, const unsigned char *data, size_t len); ++ * ++ */ ++ BUF .req x1 ++ LEN .req x2 ++ CRC .req x0 ++ wCRC .req w0 ++ .align 6 ++ .global ptg_crc32 ++ .type ptg_crc32, %function ++ptg_crc32: ++ crc32_common_mix crc32 ++ .size ptg_crc32, .-ptg_crc32 ++#endif +diff -uNr zlib-1.2.11/crc32.c ../zlib-1.2.11/crc32.c +--- zlib-1.2.11/crc32.c 2022-08-25 19:18:06.943264265 +0800 ++++ ../zlib-1.2.11/crc32.c 2022-08-25 19:30:51.765812383 +0800 +@@ -206,6 +206,10 @@ + extern uint32_t crc32_acle(uint32_t, const unsigned char *, uint64_t); + #endif + ++#if defined(__ARM_FEATURE_CRC32) && defined(ARM_NEON) ++extern uint32_t ptg_crc32(uint32_t, const unsigned char *, uint64_t); ++#endif ++ + /* ========================================================================= */ + unsigned long ZEXPORT crc32_z(crc, buf, len) + unsigned long crc; +@@ -225,7 +229,9 @@ + + endian = 1; + if (*((unsigned char *)(&endian))) +-#if defined(__ARM_FEATURE_CRC32) ++#if defined(__ARM_FEATURE_CRC32) && defined(ARM_NEON) ++ return ptg_crc32(crc, buf, len); ++#elif defined(__ARM_FEATURE_CRC32) + return crc32_acle(crc, buf, len); + #else + return crc32_little(crc, buf, len); +diff -uNr zlib-1.2.11/Makefile.in ../zlib-1.2.11/Makefile.in +--- zlib-1.2.11/Makefile.in 2022-08-25 19:18:23.206722650 +0800 ++++ ../zlib-1.2.11/Makefile.in 2022-08-25 19:35:00.657538475 +0800 +@@ -57,11 +57,11 @@ + ZINC= + ZINCOUT=-I. 
+ 
+-OBJZ = adler32.o adler32_neon.o crc32_acle.o crc32.o deflate.o infback.o inffast.o inffast.o inflate.o inftrees.o trees.o zutil.o
++OBJZ = adler32.o adler32_neon.o crc32_acle.o crc32_mix.o crc32.o deflate.o infback.o inffast.o inffast.o inflate.o inftrees.o trees.o zutil.o
+ OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o
+ OBJC = $(OBJZ) $(OBJG)
+ 
+-PIC_OBJZ = adler32.lo adler32_neon.lo crc32_acle.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
++PIC_OBJZ = adler32.lo adler32_neon.lo crc32_acle.lo crc32_mix.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
+ PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo
+ PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)
+ 
+@@ -164,6 +164,9 @@
+ crc32_acle.o: $(SRCDIR)contrib/arm/crc32_acle.c
+ 	$(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -c -o $@ $(SRCDIR)contrib/arm/crc32_acle.c
+ 
++crc32_mix.o: $(SRCDIR)contrib/arm/crc32_mix.S
++	$(CC) $(CFLAGS) $(ZINC) -I$(SRCDIR) -c -o $@ $(SRCDIR)contrib/arm/crc32_mix.S
++
+ crc32.o: $(SRCDIR)crc32.c
+ 	$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)crc32.c
+ 
+@@ -221,6 +224,11 @@
+ 	$(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -DPIC -c -o objs/crc32_acle.o $(SRCDIR)contrib/arm/crc32_acle.c
+ 	-@mv objs/crc32_acle.o $@
+ 
++crc32_mix.lo: $(SRCDIR)contrib/arm/crc32_mix.S
++	-@mkdir objs 2>/dev/null || test -d objs
++	$(CC) $(SFLAGS) $(ZINC) -I$(SRCDIR) -DPIC -c -o objs/crc32_mix.o $(SRCDIR)contrib/arm/crc32_mix.S
++	-@mv objs/crc32_mix.o $@
++
+ crc32.lo: $(SRCDIR)crc32.c
+ 	-@mkdir objs 2>/dev/null || test -d objs
+ 	$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/crc32.o $(SRCDIR)crc32.c
diff --git a/zlib.spec b/zlib.spec
index 801c6f2..749ff83 100644
--- a/zlib.spec
+++ b/zlib.spec
@@ -1,4 +1,4 @@
-%define anolis_release .0.2
+%define anolis_release .0.3
 # disabled, per rhbz#1609830 and rhbz#1602742
 %bcond_with minizip
 
@@ -28,6 +28,8 @@ Patch1005: 1005-zlib-anolis-ARM-optimized-insert_string.patch
 Patch1006: 1006-zlib-anolis-Optimize-slide_hash.patch
 # optimized adler32 function in armv8
 Patch1007: 1007-zlib-anolis-Neon-Optimized-adler32.patch
+# optimized crc32 function using the crc32 + pmull instructions on armv8
+Patch1008: 1008-zlib-anolis-Optimized-crc32-pmul-mix.patch
 
 BuildRequires: automake, autoconf, libtool
 
@@ -88,6 +90,7 @@ developing applications which use minizip.
 %patch1004 -p1
 %patch1005 -p1
 %patch1007 -p1
+%patch1008 -p1
 %endif
 
 %ifarch x86_64
@@ -170,6 +173,9 @@ find $RPM_BUILD_ROOT -name '*.la' -delete
 
 %changelog
+* Thu Aug 25 2022 binbin Xu - 1.2.11-17.0.3
+- add optimized crc32 (crc32 + pmull mix) patch for aarch64
+
 * Wed Aug 24 2022 binbin Xu - 1.2.11-17.0.2
 - add optimized adler32_neon patch for aarch64
-- 
Gitee
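
Illustrative usage sketch (not part of the patch): per the crc32.c hunk above, a
zlib build on little-endian AArch64 with __ARM_FEATURE_CRC32 and ARM_NEON
defined routes crc32_z() to the new ptg_crc32() entry point, so existing
callers of the public zlib API pick up the optimized path automatically. The
buffer size and fill pattern below are arbitrary; 4096 bytes is simply large
enough (>= 2048, the MIX_BLK_SIZE of the assembly loop) to reach the mixed
pmull + crc32 block rather than only the scalar tail.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
    size_t len = 4096;                    /* > MIX_BLK_SIZE (2048 bytes) */
    unsigned char *buf = malloc(len);
    if (buf == NULL)
        return 1;
    memset(buf, 0xA5, len);               /* arbitrary test pattern */

    uLong crc = crc32(0L, Z_NULL, 0);     /* zlib's initial CRC value */
    crc = crc32(crc, buf, (uInt)len);     /* dispatched to ptg_crc32() on such builds */

    printf("crc32 = 0x%08lx\n", crc);
    free(buf);
    return 0;
}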