From 2882adf91a9a348307b31ae60138b66207ef9e6a Mon Sep 17 00:00:00 2001 From: Xu Yizhou Date: Wed, 2 Nov 2022 11:18:01 +0800 Subject: [PATCH] SM3/SM4 optimization for ARM by HW instruction --- ...-with-SM3-hardware-instruction-on-aa.patch | 492 ++++++++ ...timization-for-ARM-by-HW-instruction.patch | 1032 +++++++++++++++++ ...timization-for-ARM-by-HW-instruction.patch | 621 ++++++++++ openssl.spec | 12 +- 4 files changed, 2155 insertions(+), 2 deletions(-) create mode 100644 Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch create mode 100644 Backport-SM4-optimization-for-ARM-by-HW-instruction.patch create mode 100644 Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch diff --git a/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch new file mode 100644 index 0000000..722d506 --- /dev/null +++ b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch @@ -0,0 +1,492 @@ +From 4d2e328357ac4b468d4762a5a5f615d7e7bf46a6 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou +Date: Thu, 27 Oct 2022 20:49:34 +0800 +Subject: [PATCH 1/3] SM3 acceleration with SM3 hardware instruction on aarch64 + +This patch contains the following two PRs, + +1. SM3 acceleration with SM3 hardware instruction on aarch64 + +SM3 hardware instruction is optional feature of crypto extension for +aarch64. This implementation accelerates SM3 via SM3 instructions. For +the platform not supporting SM3 instruction, the original C +implementation still works. Thanks to AliBaba for testing and reporting +the following perf numbers for Yitian710: + +Benchmark on T-Head Yitian-710 2.75GHz: + +Before: +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k + +After (33% - 74% faster): +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/17454) + +2. 
Fix sm3ss1 translation issue in sm3-armv8.pl + +Reviewed-by: Tomas Mraz +Reviewed-by: Matt Caswell +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/17542) + +Signed-off-by: Xu Yizhou +--- + Configurations/00-base-templates.conf | 1 + + Configure | 4 + + crypto/arm64cpuid.pl | 7 + + crypto/arm_arch.h | 1 + + crypto/armcap.c | 10 + + crypto/sm3/asm/sm3-armv8.pl | 280 ++++++++++++++++++++++++++ + crypto/sm3/build.info | 15 +- + crypto/sm3/sm3_local.h | 16 +- + 8 files changed, 332 insertions(+), 2 deletions(-) + create mode 100644 crypto/sm3/asm/sm3-armv8.pl + +diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf +index 1d35012..a67ae65 100644 +--- a/Configurations/00-base-templates.conf ++++ b/Configurations/00-base-templates.conf +@@ -322,6 +322,7 @@ my %targets=( + poly1305_asm_src=> "poly1305-armv8.S", + keccak1600_asm_src => "keccak1600-armv8.S", + sm4_asm_src => "vpsm4_ex-armv8.S", ++ sm3_asm_src => "sm3-armv8.S", + }, + parisc11_asm => { + template => 1, +diff --git a/Configure b/Configure +index 3bfe360..fce460d 100755 +--- a/Configure ++++ b/Configure +@@ -1423,6 +1423,9 @@ unless ($disabled{asm}) { + if ($target{sm4_asm_src} ne "") { + push @{$config{lib_defines}}, "VPSM4_EX_ASM"; + } ++ if ($target{sm3_asm_src} ne "") { ++ push @{$config{lib_defines}}, "SM3_ASM"; ++ } + } + + my %predefined_C = compiler_predefined($config{CROSS_COMPILE}.$config{CC}); +@@ -3379,6 +3382,7 @@ sub print_table_entry + "multilib", + "build_scheme", + "sm4_asm_src", ++ "sm3_asm_src", + ); + + if ($type eq "TABLE") { +diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl +index 319927e..1e9b167 100755 +--- a/crypto/arm64cpuid.pl ++++ b/crypto/arm64cpuid.pl +@@ -78,6 +78,13 @@ _armv8_sha512_probe: + ret + .size _armv8_sha512_probe,.-_armv8_sha512_probe + ++.globl _armv8_sm3_probe ++.type _armv8_sm3_probe,%function ++_armv8_sm3_probe: ++ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s ++ ret ++.size _armv8_sm3_probe,.-_armv8_sm3_probe ++ + .globl OPENSSL_cleanse + .type OPENSSL_cleanse,%function + .align 5 +diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h +index 8b71055..8839b21 100644 +--- a/crypto/arm_arch.h ++++ b/crypto/arm_arch.h +@@ -80,5 +80,6 @@ extern unsigned int OPENSSL_armcap_P; + # define ARMV8_SHA256 (1<<4) + # define ARMV8_PMULL (1<<5) + # define ARMV8_SHA512 (1<<6) ++# define ARMV8_SM3 (1<<9) + + #endif +diff --git a/crypto/armcap.c b/crypto/armcap.c +index 48c5d4d..8b2f4a5 100644 +--- a/crypto/armcap.c ++++ b/crypto/armcap.c +@@ -47,6 +47,7 @@ void _armv8_sha1_probe(void); + void _armv8_sha256_probe(void); + void _armv8_pmull_probe(void); + # ifdef __aarch64__ ++void _armv8_sm3_probe(void); + void _armv8_sha512_probe(void); + # endif + uint32_t _armv7_tick(void); +@@ -130,6 +131,7 @@ static unsigned long getauxval(unsigned long key) + # define HWCAP_CE_PMULL (1 << 4) + # define HWCAP_CE_SHA1 (1 << 5) + # define HWCAP_CE_SHA256 (1 << 6) ++# define HWCAP_CE_SM3 (1 << 18) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -190,6 +192,9 @@ void OPENSSL_cpuid_setup(void) + # ifdef __aarch64__ + if (hwcap & HWCAP_CE_SHA512) + OPENSSL_armcap_P |= ARMV8_SHA512; ++ ++ if (hwcap & HWCAP_CE_SM3) ++ OPENSSL_armcap_P |= ARMV8_SM3; + # endif + } + # endif +@@ -233,6 +238,11 @@ void OPENSSL_cpuid_setup(void) + _armv8_sha512_probe(); + OPENSSL_armcap_P |= ARMV8_SHA512; + } ++ ++ if (sigsetjmp(ill_jmp, 1) == 0) { ++ _armv8_sm3_probe(); ++ OPENSSL_armcap_P |= ARMV8_SM3; ++ } + # endif + } + # endif +diff --git 
a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl +new file mode 100644 +index 0000000..677ca52 +--- /dev/null ++++ b/crypto/sm3/asm/sm3-armv8.pl +@@ -0,0 +1,280 @@ ++#! /usr/bin/env perl ++# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++# This module implements support for Armv8 SM3 instructions ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++# Message expanding: ++# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6] ++# Input: s0, s1, s2, s3 ++# s0 = w0 | w1 | w2 | w3 ++# s1 = w4 | w5 | w6 | w7 ++# s2 = w8 | w9 | w10 | w11 ++# s3 = w12 | w13 | w14 | w15 ++# Output: s4 ++sub msg_exp () { ++my $s0 = shift; ++my $s1 = shift; ++my $s2 = shift; ++my $s3 = shift; ++my $s4 = shift; ++my $vtmp1 = shift; ++my $vtmp2 = shift; ++$code.=<<___; ++ // s4 = w7 | w8 | w9 | w10 ++ ext $s4.16b, $s1.16b, $s2.16b, #12 ++ // vtmp1 = w3 | w4 | w5 | w6 ++ ext $vtmp1.16b, $s0.16b, $s1.16b, #12 ++ // vtmp2 = w10 | w11 | w12 | w13 ++ ext $vtmp2.16b, $s2.16b, $s3.16b, #8 ++ sm3partw1 $s4.4s, $s0.4s, $s3.4s ++ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s ++___ ++} ++ ++# A round of compresson function ++# Input: ++# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b ++# vstate0 - vstate1, store digest status(A - H) ++# vconst0 - vconst1, interleaved used to store Tj <<< j ++# vtmp - temporary register ++# vw - for sm3tt1ab, vw = s0 eor s1 ++# s0 - for sm3tt2ab, just be s0 ++# i, choose wj' or wj from vw ++sub round () { ++my $ab = shift; ++my $vstate0 = shift; ++my $vstate1 = shift; ++my $vconst0 = shift; ++my $vconst1 = shift; ++my $vtmp = shift; ++my $vw = shift; ++my $s0 = shift; ++my $i = shift; ++$code.=<<___; ++ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s ++ shl $vconst1.4s, $vconst0.4s, #1 ++ sri $vconst1.4s, $vconst0.4s, #31 ++ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i] ++ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i] ++___ ++} ++ ++sub qround () { ++my $ab = shift; ++my $vstate0 = shift; ++my $vstate1 = shift; ++my $vconst0 = shift; ++my $vconst1 = shift; ++my $vtmp1 = shift; ++my $vtmp2 = shift; ++my $s0 = shift; ++my $s1 = shift; ++my $s2 = shift; ++my $s3 = shift; ++my $s4 = shift; ++ if($s4) { ++ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2); ++ } ++$code.=<<___; ++ eor $vtmp1.16b, $s0.16b, $s1.16b ++___ ++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2, ++ $vtmp1, $s0, 0); ++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2, ++ $vtmp1, $s0, 1); ++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2, ++ $vtmp1, $s0, 2); ++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2, ++ $vtmp1, $s0, 3); ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8.2-a ++.text ++___ ++ ++{{{ ++my 
($pstate,$pdata,$num)=("x0","x1","w2"); ++my ($state1,$state2)=("v5","v6"); ++my ($sconst1, $sconst2)=("s16","s17"); ++my ($vconst1, $vconst2)=("v16","v17"); ++my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4)); ++my ($bkstate1,$bkstate2)=("v18","v19"); ++my ($vconst_tmp1,$vconst_tmp2)=("v20","v21"); ++my ($vtmp1,$vtmp2)=("v22","v23"); ++my $constaddr="x8"; ++# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num) ++$code.=<<___; ++.globl ossl_hwsm3_block_data_order ++.type ossl_hwsm3_block_data_order,%function ++.align 5 ++ossl_hwsm3_block_data_order: ++ // load state ++ ld1 {$state1.4s-$state2.4s}, [$pstate] ++ rev64 $state1.4s, $state1.4s ++ rev64 $state2.4s, $state2.4s ++ ext $state1.16b, $state1.16b, $state1.16b, #8 ++ ext $state2.16b, $state2.16b, $state2.16b, #8 ++ ++ adr $constaddr, .Tj ++ ldp $sconst1, $sconst2, [$constaddr] ++ ++.Loop: ++ // load input ++ ld1 {$s0.16b-$s3.16b}, [$pdata], #64 ++ sub $num, $num, #1 ++ ++ mov $bkstate1.16b, $state1.16b ++ mov $bkstate2.16b, $state2.16b ++ ++#ifndef __ARMEB__ ++ rev32 $s0.16b, $s0.16b ++ rev32 $s1.16b, $s1.16b ++ rev32 $s2.16b, $s2.16b ++ rev32 $s3.16b, $s3.16b ++#endif ++ ++ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4 ++___ ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4,$s0,$s1,$s2); ++ ++$code.=<<___; ++ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4 ++___ ++ ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0,$s1,$s2,$s3); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4,$s0,$s1,$s2); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0,$s1,$s2,$s3); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1); ++ ++$code.=<<___; ++ eor $state1.16b, $state1.16b, $bkstate1.16b ++ eor $state2.16b, $state2.16b, $bkstate2.16b ++ ++ // any remained blocks? 
++ cbnz $num, .Loop ++ ++ // save state ++ rev64 $state1.4s, $state1.4s ++ rev64 $state2.4s, $state2.4s ++ ext $state1.16b, $state1.16b, $state1.16b, #8 ++ ext $state2.16b, $state2.16b, $state2.16b, #8 ++ st1 {$state1.4s-$state2.4s}, [$pstate] ++ ret ++.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order ++ ++.align 3 ++.Tj: ++.word 0x79cc4519, 0x9d8a7a87 ++___ ++}}} ++ ++######################################### ++my %sm3partopcode = ( ++ "sm3partw1" => 0xce60C000, ++ "sm3partw2" => 0xce60C400); ++ ++my %sm3ss1opcode = ( ++ "sm3ss1" => 0xce400000); ++ ++my %sm3ttopcode = ( ++ "sm3tt1a" => 0xce408000, ++ "sm3tt1b" => 0xce408400, ++ "sm3tt2a" => 0xce408800, ++ "sm3tt2b" => 0xce408C00); ++ ++sub unsm3part { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16), ++ $mnemonic,$arg; ++} ++ ++sub unsm3ss1 { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10), ++ $mnemonic,$arg; ++} ++ ++sub unsm3tt { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12), ++ $mnemonic,$arg; ++} ++ ++open SELF,$0; ++while() { ++ next if (/^#!/); ++ last if (!s/^#/\/\// and !/^$/); ++ print; ++} ++close SELF; ++ ++foreach(split("\n",$code)) { ++ s/\`([^\`]*)\`/eval($1)/ge; ++ ++ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge; ++ s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge; ++ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge; ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info +index 6009b19..e113729 100644 +--- a/crypto/sm3/build.info ++++ b/crypto/sm3/build.info +@@ -1,2 +1,15 @@ + LIBS=../../libcrypto +-SOURCE[../../libcrypto]=sm3.c m_sm3.c ++SOURCE[../../libcrypto]=\ ++ sm3.c m_sm3.c {- $target{sm3_asm_src} -} ++ ++GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl $(PERLASM_SCHEME) ++INCLUDE[sm3-armv8.o]=.. ++ ++BEGINRAW[Makefile] ++##### SM3 assembler implementations ++ ++# GNU make "catch all" ++{- $builddir -}/sm3-%.S: {- $sourcedir -}/asm/sm3-%.pl ++ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ ++ ++ENDRAW[Makefile] +\ No newline at end of file +diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h +index 7171de5..aafff63 100644 +--- a/crypto/sm3/sm3_local.h ++++ b/crypto/sm3/sm3_local.h +@@ -32,7 +32,21 @@ + ll=(c)->G; (void)HOST_l2c(ll, (s)); \ + ll=(c)->H; (void)HOST_l2c(ll, (s)); \ + } while (0) +-#define HASH_BLOCK_DATA_ORDER sm3_block_data_order ++ ++#if defined(SM3_ASM) ++# if defined(__aarch64__) ++# include "crypto/arm_arch.h" ++# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3) ++void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num); ++# endif ++#endif ++ ++#if defined(HWSM3_CAPABLE) ++# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? 
ossl_hwsm3_block_data_order \ ++ : sm3_block_data_order) ++#else ++# define HASH_BLOCK_DATA_ORDER sm3_block_data_order ++#endif + + void sm3_transform(SM3_CTX *c, const unsigned char *data); + +-- +2.36.1 + diff --git a/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch new file mode 100644 index 0000000..f525e70 --- /dev/null +++ b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch @@ -0,0 +1,1032 @@ +From 4f7e522f7fda2c55c4915396d08f8c9cf3b3fba8 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou +Date: Fri, 28 Oct 2022 11:24:28 +0800 +Subject: [PATCH 2/3] SM4 optimization for ARM by HW instruction + +This patch is a copy of the following PR, with +some extra supporting code. + +1. SM4 optimization for ARM by HW instruction + +This patch implements the SM4 optimization for ARM processor, +using SM4 HW instruction, which is an optional feature of +crypto extension for aarch64 V8. + +Tested on some modern ARM micro-architectures with SM4 support, the +performance uplift can be observed around 8X~40X over existing +C implementation in openssl. Algorithms that can be parallelized +(like CTR, ECB, CBC decryption) are on higher end, with algorithm +like CBC encryption on lower end (due to inter-block dependency) + +Perf data on Yitian-710 2.75GHz hardware, before and after optimization: + +Before: +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +SM4-CTR 105787.80k 107837.87k 108380.84k 108462.08k 108549.46k 108554.92k +SM4-ECB 111924.58k 118173.76k 119776.00k 120093.70k 120264.02k 120274.94k +SM4-CBC 106428.09k 109190.98k 109674.33k 109774.51k 109827.41k 109827.41k + +After (7.4x - 36.6x faster): +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +SM4-CTR 781979.02k 2432994.28k 3437753.86k 3834177.88k 3963715.58k 3974556.33k +SM4-ECB 937590.69k 2941689.02k 3945751.81k 4328655.87k 4459181.40k 4468692.31k +SM4-CBC 890639.88k 1027746.58k 1050621.78k 1056696.66k 1058613.93k 1058701.31k + +Signed-off-by: Daniel Hu + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/17455\) + +Signed-off-by: Xu Yizhou +--- + Configurations/00-base-templates.conf | 2 +- + Configure | 3 +- + crypto/arm64cpuid.pl | 7 + + crypto/arm_arch.h | 1 + + crypto/armcap.c | 10 + + crypto/evp/e_sm4.c | 88 ++-- + crypto/sm4/asm/sm4-armv8.pl | 629 ++++++++++++++++++++++++++ + crypto/sm4/build.info | 13 +- + include/crypto/sm4_platform.h | 70 +++ + 9 files changed, 788 insertions(+), 35 deletions(-) + create mode 100644 crypto/sm4/asm/sm4-armv8.pl + create mode 100644 include/crypto/sm4_platform.h + +diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf +index a67ae65..a26d081 100644 +--- a/Configurations/00-base-templates.conf ++++ b/Configurations/00-base-templates.conf +@@ -321,7 +321,7 @@ my %targets=( + chacha_asm_src => "chacha-armv8.S", + poly1305_asm_src=> "poly1305-armv8.S", + keccak1600_asm_src => "keccak1600-armv8.S", +- sm4_asm_src => "vpsm4_ex-armv8.S", ++ sm4_asm_src => "sm4-armv8.S vpsm4_ex-armv8.S", + sm3_asm_src => "sm3-armv8.S", + }, + parisc11_asm => { +diff --git a/Configure b/Configure +index fce460d..d013204 100755 +--- a/Configure ++++ b/Configure +@@ -1421,7 +1421,8 @@ unless ($disabled{asm}) { + push @{$config{lib_defines}}, "POLY1305_ASM"; + } + if ($target{sm4_asm_src} ne "") { +- push @{$config{lib_defines}}, "VPSM4_EX_ASM"; ++ push @{$config{lib_defines}}, "SM4_ASM" if ($target{sm4_asm_src} =~ m/sm4/); ++ push 
@{$config{lib_defines}}, "VPSM4_EX_ASM" if ($target{sm4_asm_src} =~ m/vpsm4_ex/); + } + if ($target{sm3_asm_src} ne "") { + push @{$config{lib_defines}}, "SM3_ASM"; +diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl +index 1e9b167..341167b 100755 +--- a/crypto/arm64cpuid.pl ++++ b/crypto/arm64cpuid.pl +@@ -71,6 +71,13 @@ _armv8_pmull_probe: + ret + .size _armv8_pmull_probe,.-_armv8_pmull_probe + ++.globl _armv8_sm4_probe ++.type _armv8_sm4_probe,%function ++_armv8_sm4_probe: ++ .long 0xcec08400 // sm4e v0.4s, v0.4s ++ ret ++.size _armv8_sm4_probe,.-_armv8_sm4_probe ++ + .globl _armv8_sha512_probe + .type _armv8_sha512_probe,%function + _armv8_sha512_probe: +diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h +index 8839b21..0f6f7ca 100644 +--- a/crypto/arm_arch.h ++++ b/crypto/arm_arch.h +@@ -81,5 +81,6 @@ extern unsigned int OPENSSL_armcap_P; + # define ARMV8_PMULL (1<<5) + # define ARMV8_SHA512 (1<<6) + # define ARMV8_SM3 (1<<9) ++# define ARMV8_SM4 (1<<10) + + #endif +diff --git a/crypto/armcap.c b/crypto/armcap.c +index 8b2f4a5..73bcad1 100644 +--- a/crypto/armcap.c ++++ b/crypto/armcap.c +@@ -48,6 +48,7 @@ void _armv8_sha256_probe(void); + void _armv8_pmull_probe(void); + # ifdef __aarch64__ + void _armv8_sm3_probe(void); ++void _armv8_sm4_probe(void); + void _armv8_sha512_probe(void); + # endif + uint32_t _armv7_tick(void); +@@ -132,6 +133,7 @@ static unsigned long getauxval(unsigned long key) + # define HWCAP_CE_SHA1 (1 << 5) + # define HWCAP_CE_SHA256 (1 << 6) + # define HWCAP_CE_SM3 (1 << 18) ++# define HWCAP_CE_SM4 (1 << 19) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -190,6 +192,9 @@ void OPENSSL_cpuid_setup(void) + OPENSSL_armcap_P |= ARMV8_SHA256; + + # ifdef __aarch64__ ++ if (hwcap & HWCAP_CE_SM4) ++ OPENSSL_armcap_P |= ARMV8_SM4; ++ + if (hwcap & HWCAP_CE_SHA512) + OPENSSL_armcap_P |= ARMV8_SHA512; + +@@ -234,6 +239,11 @@ void OPENSSL_cpuid_setup(void) + OPENSSL_armcap_P |= ARMV8_SHA256; + } + # if defined(__aarch64__) && !defined(__APPLE__) ++ if (sigsetjmp(ill_jmp, 1) == 0) { ++ _armv8_sm4_probe(); ++ OPENSSL_armcap_P |= ARMV8_SM4; ++ } ++ + if (sigsetjmp(ill_jmp, 1) == 0) { + _armv8_sha512_probe(); + OPENSSL_armcap_P |= ARMV8_SHA512; +diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c +index 169d6c7..eaa5ba0 100644 +--- a/crypto/evp/e_sm4.c ++++ b/crypto/evp/e_sm4.c +@@ -15,17 +15,11 @@ + # include + # include "crypto/sm4.h" + # include "crypto/evp.h" ++# include "crypto/sm4_platform.h" + # include "evp_local.h" + # include "modes_local.h" + +-#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__)) +-# include "arm_arch.h" +-# if __ARM_MAX_ARCH__>=7 +-# if defined(VPSM4_EX_ASM) +-# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) +-# endif +-# endif +-#endif ++ + + typedef struct { + union { +@@ -35,28 +29,11 @@ typedef struct { + block128_f block; + union { + ecb128_f ecb; ++ cbc128_f cbc; ++ ctr128_f ctr; + } stream; + } EVP_SM4_KEY; + +-#ifdef VPSM4_EX_CAPABLE +-void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +-void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +-#define vpsm4_ex_encrypt SM4_encrypt +-#define vpsm4_ex_decrypt SM4_encrypt +-void vpsm4_ex_ecb_encrypt( +- const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key, const int enc); +-/* xts mode in GB/T 17964-2021 */ +-void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, +- const SM4_KEY *key2, const uint8_t iv[16]); 
+-void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, +- const SM4_KEY *key2, const uint8_t iv[16]); +-/* xts mode in IEEE Std 1619-2007 */ +-void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, +- const SM4_KEY *key2, const uint8_t iv[16]); +-void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, +- const SM4_KEY *key2, const uint8_t iv[16]); +-#endif +- + # define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \ + static const EVP_CIPHER sm4_##mode = { \ + nid##_##nmode,blocksize,128/8,ivlen, \ +@@ -84,6 +61,21 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + + mode = EVP_CIPHER_CTX_mode(ctx); + if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) { ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ HWSM4_set_decrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) HWSM4_decrypt; ++ dat->stream.cbc = NULL; ++# ifdef HWSM4_cbc_encrypt ++ if (mode == EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; ++# endif ++# ifdef HWSM4_ecb_encrypt ++ if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; ++# endif ++ } else ++#endif + #ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_decrypt_key(key, &dat->ks.ks); +@@ -97,6 +89,29 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + } + } else { ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ HWSM4_set_encrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) HWSM4_encrypt; ++ dat->stream.cbc = NULL; ++# ifdef HWSM4_cbc_encrypt ++ if (mode == EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; ++ else ++# endif ++# ifdef HWSM4_ecb_encrypt ++ if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; ++ else ++# endif ++# ifdef HWSM4_ctr32_encrypt_blocks ++ if (mode == EVP_CIPH_CTR_MODE) ++ dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks; ++ else ++# endif ++ (void)0; /* terminate potentially open 'else' */ ++ } else ++#endif + #ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_encrypt_key(key, &dat->ks.ks); +@@ -118,7 +133,10 @@ static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + { + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + +- if (EVP_CIPHER_CTX_encrypting(ctx)) ++ if (dat->stream.cbc) ++ (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv, ++ EVP_CIPHER_CTX_encrypting(ctx)); ++ else if (EVP_CIPHER_CTX_encrypting(ctx)) + CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); + else +@@ -183,10 +201,16 @@ static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + return 0; + num = (unsigned int)n; + +- CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, +- ctx->iv, +- EVP_CIPHER_CTX_buf_noconst(ctx), &num, +- dat->block); ++ if (dat->stream.ctr) ++ CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks, ++ ctx->iv, ++ EVP_CIPHER_CTX_buf_noconst(ctx), ++ &num, dat->stream.ctr); ++ else ++ CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, ++ ctx->iv, ++ EVP_CIPHER_CTX_buf_noconst(ctx), &num, ++ dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + return 1; + } +diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl +new file mode 100644 +index 0000000..dbacad2 +--- /dev/null ++++ b/crypto/sm4/asm/sm4-armv8.pl +@@ -0,0 +1,629 @@ ++#! 
/usr/bin/env perl ++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# This module implements support for SM4 hw support on aarch64 ++# Oct 2021 ++# ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++$prefix="sm4_v8"; ++my @rks=map("v$_",(0..7)); ++ ++sub rev32() { ++my $dst = shift; ++my $src = shift; ++$code.=<<___; ++#ifndef __ARMEB__ ++ rev32 $dst.16b,$src.16b ++#endif ++___ ++} ++ ++sub enc_blk () { ++my $data = shift; ++$code.=<<___; ++ sm4e $data.4s,@rks[0].4s ++ sm4e $data.4s,@rks[1].4s ++ sm4e $data.4s,@rks[2].4s ++ sm4e $data.4s,@rks[3].4s ++ sm4e $data.4s,@rks[4].4s ++ sm4e $data.4s,@rks[5].4s ++ sm4e $data.4s,@rks[6].4s ++ sm4e $data.4s,@rks[7].4s ++ rev64 $data.4S,$data.4S ++ ext $data.16b,$data.16b,$data.16b,#8 ++___ ++} ++ ++sub enc_4blks () { ++my $data0 = shift; ++my $data1 = shift; ++my $data2 = shift; ++my $data3 = shift; ++$code.=<<___; ++ sm4e $data0.4s,@rks[0].4s ++ sm4e $data1.4s,@rks[0].4s ++ sm4e $data2.4s,@rks[0].4s ++ sm4e $data3.4s,@rks[0].4s ++ ++ sm4e $data0.4s,@rks[1].4s ++ sm4e $data1.4s,@rks[1].4s ++ sm4e $data2.4s,@rks[1].4s ++ sm4e $data3.4s,@rks[1].4s ++ ++ sm4e $data0.4s,@rks[2].4s ++ sm4e $data1.4s,@rks[2].4s ++ sm4e $data2.4s,@rks[2].4s ++ sm4e $data3.4s,@rks[2].4s ++ ++ sm4e $data0.4s,@rks[3].4s ++ sm4e $data1.4s,@rks[3].4s ++ sm4e $data2.4s,@rks[3].4s ++ sm4e $data3.4s,@rks[3].4s ++ ++ sm4e $data0.4s,@rks[4].4s ++ sm4e $data1.4s,@rks[4].4s ++ sm4e $data2.4s,@rks[4].4s ++ sm4e $data3.4s,@rks[4].4s ++ ++ sm4e $data0.4s,@rks[5].4s ++ sm4e $data1.4s,@rks[5].4s ++ sm4e $data2.4s,@rks[5].4s ++ sm4e $data3.4s,@rks[5].4s ++ ++ sm4e $data0.4s,@rks[6].4s ++ sm4e $data1.4s,@rks[6].4s ++ sm4e $data2.4s,@rks[6].4s ++ sm4e $data3.4s,@rks[6].4s ++ ++ sm4e $data0.4s,@rks[7].4s ++ rev64 $data0.4S,$data0.4S ++ sm4e $data1.4s,@rks[7].4s ++ ext $data0.16b,$data0.16b,$data0.16b,#8 ++ rev64 $data1.4S,$data1.4S ++ sm4e $data2.4s,@rks[7].4s ++ ext $data1.16b,$data1.16b,$data1.16b,#8 ++ rev64 $data2.4S,$data2.4S ++ sm4e $data3.4s,@rks[7].4s ++ ext $data2.16b,$data2.16b,$data2.16b,#8 ++ rev64 $data3.4S,$data3.4S ++ ext $data3.16b,$data3.16b,$data3.16b,#8 ++___ ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8-a+crypto ++.text ++___ ++ ++{{{ ++$code.=<<___; ++.align 6 ++.Lck: ++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 ++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 ++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 ++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 ++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 ++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 ++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 ++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 ++.Lfk: ++ .long 0xa3b1bac6, 0x56aa3350, 
0x677d9197, 0xb27022dc ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++my ($tmp)=("x2"); ++my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7)); ++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); ++my ($fkconst) = ("v24"); ++$code.=<<___; ++.globl ${prefix}_set_encrypt_key ++.type ${prefix}_set_encrypt_key,%function ++.align 5 ++${prefix}_set_encrypt_key: ++ ld1 {$key0.4s},[$key] ++ adr $tmp,.Lfk ++ ld1 {$fkconst.4s},[$tmp] ++ adr $tmp,.Lck ++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 ++___ ++ &rev32($key0, $key0); ++$code.=<<___; ++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] ++ eor $key0.16b,$key0.16b,$fkconst.16b; ++ sm4ekey $key0.4S,$key0.4S,$const0.4S ++ sm4ekey $key1.4S,$key0.4S,$const1.4S ++ sm4ekey $key2.4S,$key1.4S,$const2.4S ++ sm4ekey $key3.4S,$key2.4S,$const3.4S ++ sm4ekey $key4.4S,$key3.4S,$const4.4S ++ st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64 ++ sm4ekey $key5.4S,$key4.4S,$const5.4S ++ sm4ekey $key6.4S,$key5.4S,$const6.4S ++ sm4ekey $key7.4S,$key6.4S,$const7.4S ++ st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys] ++ ret ++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++my ($tmp)=("x2"); ++my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7)); ++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); ++my ($fkconst) = ("v24"); ++$code.=<<___; ++.globl ${prefix}_set_decrypt_key ++.type ${prefix}_set_decrypt_key,%function ++.align 5 ++${prefix}_set_decrypt_key: ++ ld1 {$key0.4s},[$key] ++ adr $tmp,.Lfk ++ ld1 {$fkconst.4s},[$tmp] ++ adr $tmp, .Lck ++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 ++___ ++ &rev32($key0, $key0); ++$code.=<<___; ++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] ++ eor $key0.16b, $key0.16b,$fkconst.16b; ++ sm4ekey $key0.4S,$key0.4S,$const0.4S ++ sm4ekey $key1.4S,$key0.4S,$const1.4S ++ sm4ekey $key2.4S,$key1.4S,$const2.4S ++ rev64 $key0.4s,$key0.4s ++ rev64 $key1.4s,$key1.4s ++ ext $key0.16b,$key0.16b,$key0.16b,#8 ++ ext $key1.16b,$key1.16b,$key1.16b,#8 ++ sm4ekey $key3.4S,$key2.4S,$const3.4S ++ sm4ekey $key4.4S,$key3.4S,$const4.4S ++ rev64 $key2.4s,$key2.4s ++ rev64 $key3.4s,$key3.4s ++ ext $key2.16b,$key2.16b,$key2.16b,#8 ++ ext $key3.16b,$key3.16b,$key3.16b,#8 ++ sm4ekey $key5.4S,$key4.4S,$const5.4S ++ sm4ekey $key6.4S,$key5.4S,$const6.4S ++ rev64 $key4.4s,$key4.4s ++ rev64 $key5.4s,$key5.4s ++ ext $key4.16b,$key4.16b,$key4.16b,#8 ++ ext $key5.16b,$key5.16b,$key5.16b,#8 ++ sm4ekey $key7.4S,$key6.4S,$const7.4S ++ rev64 $key6.4s, $key6.4s ++ rev64 $key7.4s, $key7.4s ++ ext $key6.16b,$key6.16b,$key6.16b,#8 ++ ext $key7.16b,$key7.16b,$key7.16b,#8 ++ st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64 ++ st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys] ++ ret ++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key ++___ ++}}} ++ ++{{{ ++sub gen_block () { ++my $dir = shift; ++my ($inp,$out,$rk)=map("x$_",(0..2)); ++my ($data)=("v16"); ++$code.=<<___; ++.globl ${prefix}_${dir}crypt ++.type ${prefix}_${dir}crypt,%function ++.align 5 ++${prefix}_${dir}crypt: ++ ld1 {$data.4s},[$inp] ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++___ ++ &rev32($data,$data); ++ &enc_blk($data); ++ &rev32($data,$data); ++$code.=<<___; ++ st1 {$data.4s},[$out] ++ ret ++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt ++___ ++} ++ 
++&gen_block("en"); ++&gen_block("de"); ++}}} ++ ++{{{ ++my ($inp,$out,$len,$rk)=map("x$_",(0..3)); ++my ($enc) = ("w4"); ++my @dat=map("v$_",(16..23)); ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++1: ++ cmp $len,#64 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 ++ cmp $len,#128 ++ b.lt 2f ++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 ++ // 8 blocks ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++$code.=<<___; ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++___ ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $len,$len,#128 ++ b.gt 1b ++ ret ++ // 4 blocks ++2: ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#64 ++ b.gt 1b ++1: ++ subs $len,$len,#16 ++ b.lt 1f ++ ld1 {@dat[0].4s},[$inp],#16 ++___ ++ &rev32(@dat[0],@dat[0]); ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ st1 {@dat[0].4s},[$out],#16 ++ b.ne 1b ++1: ++ ret ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ ++}}} ++ ++{{{ ++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); ++my ($enc) = ("w5"); ++my @dat=map("v$_",(16..23)); ++my @in=map("v$_",(24..31)); ++my ($ivec) = ("v8"); ++$code.=<<___; ++.globl ${prefix}_cbc_encrypt ++.type ${prefix}_cbc_encrypt,%function ++.align 5 ++${prefix}_cbc_encrypt: ++ stp d8,d9,[sp, #-16]! 
++ ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++ ld1 {$ivec.4s},[$ivp] ++ cmp $enc,#0 ++ b.eq .Ldec ++1: ++ cmp $len, #64 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++___ ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &enc_blk(@dat[0]); ++$code.=<<___; ++ eor @dat[1].16b,@dat[1].16b,@dat[0].16b ++___ ++ &enc_blk(@dat[1]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor @dat[2].16b,@dat[2].16b,@dat[1].16b ++___ ++ &enc_blk(@dat[2]); ++ &rev32(@dat[1],@dat[1]); ++$code.=<<___; ++ eor @dat[3].16b,@dat[3].16b,@dat[2].16b ++___ ++ &enc_blk(@dat[3]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ mov $ivec.16b,@dat[3].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#64 ++ b.ne 1b ++1: ++ subs $len,$len,#16 ++ b.lt 3f ++ ld1 {@dat[0].4s},[$inp],#16 ++ eor $ivec.16b,$ivec.16b,@dat[0].16b ++___ ++ &rev32($ivec,$ivec); ++ &enc_blk($ivec); ++ &rev32($ivec,$ivec); ++$code.=<<___; ++ st1 {$ivec.16b},[$out],#16 ++ b.ne 1b ++ b 3f ++.Ldec: ++1: ++ cmp $len, #64 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp] ++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 ++ cmp $len,#128 ++ b.lt 2f ++ // 8 blocks mode ++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp] ++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],$dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],$dat[7]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++ eor @dat[1].16b,@dat[1].16b,@in[0].16b ++ eor @dat[2].16b,@dat[2].16b,@in[1].16b ++ mov $ivec.16b,@in[7].16b ++ eor @dat[3].16b,$dat[3].16b,@in[2].16b ++ eor @dat[4].16b,$dat[4].16b,@in[3].16b ++ eor @dat[5].16b,$dat[5].16b,@in[4].16b ++ eor @dat[6].16b,$dat[6].16b,@in[5].16b ++ eor @dat[7].16b,$dat[7].16b,@in[6].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $len,$len,128 ++ b.gt 1b ++ b 3f ++ // 4 blocks mode ++2: ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],$dat[3]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++ eor @dat[1].16b,@dat[1].16b,@in[0].16b ++ mov $ivec.16b,@in[3].16b ++ eor @dat[2].16b,@dat[2].16b,@in[1].16b ++ eor @dat[3].16b,$dat[3].16b,@in[2].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#64 ++ b.gt 1b ++1: ++ subs $len,$len,#16 ++ b.lt 3f ++ ld1 {@dat[0].4s},[$inp],#16 ++ mov @in[0].16b,@dat[0].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++ mov $ivec.16b,@in[0].16b ++ st1 
{@dat[0].16b},[$out],#16 ++ b.ne 1b ++3: ++ // save back IV ++ st1 {$ivec.16b},[$ivp] ++ ldp d8,d9,[sp],#16 ++ ret ++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt ++___ ++}}} ++ ++{{{ ++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); ++my ($ctr)=("w5"); ++my @dat=map("v$_",(16..23)); ++my @in=map("v$_",(24..31)); ++my ($ivec)=("v8"); ++$code.=<<___; ++.globl ${prefix}_ctr32_encrypt_blocks ++.type ${prefix}_ctr32_encrypt_blocks,%function ++.align 5 ++${prefix}_ctr32_encrypt_blocks: ++ stp d8,d9,[sp, #-16]! ++ ++ ld1 {$ivec.4s},[$ivp] ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++___ ++ &rev32($ivec,$ivec); ++$code.=<<___; ++ mov $ctr,$ivec.s[3] ++1: ++ cmp $len,#4 ++ b.lt 1f ++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 ++ mov @dat[0].16b,$ivec.16b ++ mov @dat[1].16b,$ivec.16b ++ mov @dat[2].16b,$ivec.16b ++ mov @dat[3].16b,$ivec.16b ++ add $ctr,$ctr,#1 ++ mov $dat[1].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[2].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[3].s[3],$ctr ++ cmp $len,#8 ++ b.lt 2f ++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 ++ mov @dat[4].16b,$ivec.16b ++ mov @dat[5].16b,$ivec.16b ++ mov @dat[6].16b,$ivec.16b ++ mov @dat[7].16b,$ivec.16b ++ add $ctr,$ctr,#1 ++ mov $dat[4].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[5].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[6].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[7].s[3],$ctr ++___ ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,@in[0].16b ++ eor @dat[1].16b,@dat[1].16b,@in[1].16b ++ eor @dat[2].16b,@dat[2].16b,@in[2].16b ++ eor @dat[3].16b,@dat[3].16b,@in[3].16b ++ eor @dat[4].16b,@dat[4].16b,@in[4].16b ++ eor @dat[5].16b,@dat[5].16b,@in[5].16b ++ eor @dat[6].16b,@dat[6].16b,@in[6].16b ++ eor @dat[7].16b,@dat[7].16b,@in[7].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $len,$len,#8 ++ b.eq 3f ++ add $ctr,$ctr,#1 ++ mov $ivec.s[3],$ctr ++ b 1b ++2: ++___ ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,@in[0].16b ++ eor @dat[1].16b,@dat[1].16b,@in[1].16b ++ eor @dat[2].16b,@dat[2].16b,@in[2].16b ++ eor @dat[3].16b,@dat[3].16b,@in[3].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#4 ++ b.eq 3f ++ add $ctr,$ctr,#1 ++ mov $ivec.s[3],$ctr ++ b 1b ++1: ++ subs $len,$len,#1 ++ b.lt 3f ++ mov $dat[0].16b,$ivec.16b ++ ld1 {@in[0].4s},[$inp],#16 ++___ ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor $dat[0].16b,$dat[0].16b,@in[0].16b ++ st1 {$dat[0].4s},[$out],#16 ++ b.eq 3f ++ add $ctr,$ctr,#1 ++ mov $ivec.s[3],$ctr ++ b 1b ++3: ++ ldp d8,d9,[sp],#16 ++ ret ++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ++___ ++}}} ++######################################## ++{ my %opcode = ( ++ "sm4e" => 0xcec08400, ++ "sm4ekey" => 0xce60c800); ++ ++ sub unsm4 { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", 
++ $opcode{$mnemonic}|$1|($2<<5)|($3<<16), ++ $mnemonic,$arg; ++ } ++} ++ ++open SELF,$0; ++while() { ++ next if (/^#!/); ++ last if (!s/^#/\/\// and !/^$/); ++ print; ++} ++close SELF; ++ ++foreach(split("\n",$code)) { ++ s/\`([^\`]*)\`/eval($1)/ge; ++ ++ s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge; ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info +index bb042c5..4d26ede 100644 +--- a/crypto/sm4/build.info ++++ b/crypto/sm4/build.info +@@ -2,6 +2,17 @@ LIBS=../../libcrypto + SOURCE[../../libcrypto]=\ + sm4.c {- $target{sm4_asm_src} -} + ++GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl $(PERLASM_SCHEME) ++INCLUDE[sm4-armv8.o]=.. + + GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl $(PERLASM_SCHEME) +-INCLUDE[vpsm4_ex-armv8.o]=.. +\ No newline at end of file ++INCLUDE[vpsm4_ex-armv8.o]=.. ++ ++BEGINRAW[Makefile] ++##### SM4 assembler implementations ++ ++# GNU make "catch all" ++{- $builddir -}/sm4-%.S: {- $sourcedir -}/asm/sm4-%.pl ++ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ ++ ++ENDRAW[Makefile] +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +new file mode 100644 +index 0000000..2f5a6cf +--- /dev/null ++++ b/include/crypto/sm4_platform.h +@@ -0,0 +1,70 @@ ++/* ++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++#ifndef OSSL_SM4_PLATFORM_H ++# define OSSL_SM4_PLATFORM_H ++# pragma once ++ ++# if defined(OPENSSL_CPUID_OBJ) ++# if (defined(__arm__) || defined(__arm) || defined(__aarch64__)) ++# include "arm_arch.h" ++# if __ARM_MAX_ARCH__>=7 ++# if defined(VPSM4_EX_ASM) ++# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) ++# endif ++# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4) ++# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key ++# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key ++# define HWSM4_encrypt sm4_v8_encrypt ++# define HWSM4_decrypt sm4_v8_decrypt ++# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt ++# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt ++# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks ++# endif ++# endif ++# endif /* OPENSSL_CPUID_OBJ */ ++ ++# if defined(HWSM4_CAPABLE) ++int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); ++int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); ++void HWSM4_encrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void HWSM4_decrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ unsigned char *ivec, const int enc); ++void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ const int enc); ++void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ const unsigned char ivec[16]); ++# endif /* HWSM4_CAPABLE */ ++ ++#ifdef VPSM4_EX_CAPABLE ++void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); ++void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); ++#define vpsm4_ex_encrypt SM4_encrypt ++#define vpsm4_ex_decrypt SM4_encrypt ++void vpsm4_ex_ecb_encrypt( ++ const unsigned char *in, 
unsigned char *out, size_t length, const SM4_KEY *key, const int enc); ++/* xts mode in GB/T 17964-2021 */ ++void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++/* xts mode in IEEE Std 1619-2007 */ ++void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++#endif /* VPSM4_EX_CAPABLE */ ++ ++#endif /* OSSL_SM4_PLATFORM_H */ +\ No newline at end of file +-- +2.36.1 + diff --git a/Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch b/Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch new file mode 100644 index 0000000..fef2d8f --- /dev/null +++ b/Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch @@ -0,0 +1,621 @@ +From 3f0898b2aea424f18f58a182803478f25548674e Mon Sep 17 00:00:00 2001 +From: Xu Yizhou +Date: Wed, 2 Nov 2022 11:13:07 +0800 +Subject: [PATCH 3/3] SM4 XTS optimization for ARM by HW instruction + +This patch implements the SM4 XTS optimization for ARM processor, +using SM4 HW instruction, which is an optional feature of +crypto extension for aarch64 V8. + +Signed-off-by: Xu Yizhou +--- + crypto/evp/e_sm4.c | 28 ++ + crypto/sm4/asm/sm4-armv8.pl | 498 +++++++++++++++++++++++++++++++++- + include/crypto/sm4_platform.h | 14 + + 3 files changed, 537 insertions(+), 3 deletions(-) + +diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c +index eaa5ba0..da4dbd3 100644 +--- a/crypto/evp/e_sm4.c ++++ b/crypto/evp/e_sm4.c +@@ -281,6 +281,34 @@ static int sm4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2; + xctx->stream_gb = NULL; + xctx->stream = NULL; ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ if (enc) { ++ HWSM4_set_encrypt_key(key, &xctx->ks1.ks); ++ xctx->xts.block1 = (block128_f) HWSM4_encrypt; ++# ifdef HWSM4_xts_encrypt_gb ++ xctx->stream_gb = HWSM4_xts_encrypt_gb; ++# endif ++# ifdef HWSM4_xts_encrypt ++ xctx->stream = HWSM4_xts_encrypt; ++# endif ++ } else { ++ HWSM4_set_decrypt_key(key, &xctx->ks1.ks); ++ xctx->xts.block1 = (block128_f) HWSM4_decrypt; ++# ifdef HWSM4_xts_decrypt_gb ++ xctx->stream_gb = HWSM4_xts_decrypt_gb; ++# endif ++# ifdef HWSM4_xts_decrypt ++ xctx->stream = HWSM4_xts_decrypt; ++# endif ++ } ++ HWSM4_set_encrypt_key(key + bytes, &xctx->ks2.ks); ++ xctx->xts.block2 = (block128_f) HWSM4_encrypt; ++ ++ xctx->xts.key1 = &xctx->ks1; ++ break; ++ } else ++#endif + #ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + if (enc) { +diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl +index dbacad2..923c1c0 100644 +--- a/crypto/sm4/asm/sm4-armv8.pl ++++ b/crypto/sm4/asm/sm4-armv8.pl +@@ -11,9 +11,9 @@ + # Oct 2021 + # + +-# $output is the last argument if it looks like a file (it has an extension) ++# $outut is the last argument if it looks like a file (it has an extension) + # $flavour is the first argument if it doesn't look like a file +-$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$outut = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; + $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; + + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +@@ -21,7 +21,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + +-open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++open OUT,"| \"$^X\" $xlate $flavour \"$outut\"" + or die "can't call $xlate: $!"; + *STDOUT=*OUT; + +@@ -110,6 +110,120 @@ $code.=<<___; + ___ + } + ++sub mov_reg_to_vec() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $desv = shift; ++$code.=<<___; ++ mov $desv.d[0],$src0 ++ mov $desv.d[1],$src1 ++#ifdef __ARMEB__ ++ rev32 $desv.16b,$desv.16b ++#endif ++___ ++} ++ ++sub mov_vec_to_reg() { ++ my $srcv = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $des0,$srcv.d[0] ++ mov $des1,$srcv.d[1] ++___ ++} ++ ++sub compute_tweak() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++ my $tmp0 = shift; ++ my $tmp1 = shift; ++ my $magic = shift; ++$code.=<<___; ++ extr x$tmp1,$src1,$src1,#32 ++ extr $des1,$src1,$src0,#63 ++ and w$tmp0,w$magic,w$tmp1,asr#31 ++ eor $des0,x$tmp0,$src0,lsl#1 ++___ ++} ++ ++sub compute_tweak_vec() { ++ my $src = shift; ++ my $des = shift; ++ my $tmp0 = shift; ++ my $tmp1 = shift; ++ my $magic = shift; ++ &rbit($tmp1,$src); ++$code.=<<___; ++ shl $des.16b, $tmp1.16b, #1 ++ ext $tmp0.16b, $tmp1.16b, $tmp1.16b,#15 ++ ushr $tmp0.16b, $tmp0.16b, #7 ++ mul $tmp0.16b, $tmp0.16b, $magic.16b ++ eor $des.16b, $des.16b, $tmp0.16b ++___ ++ &rbit($des,$des); ++} ++ ++sub mov_en_to_enc(){ ++ my $en = shift; ++ my $enc = shift; ++ if ($en eq "en") { ++$code.=<<___; ++ mov $enc,1 ++___ ++ } else { ++$code.=<<___; ++ mov $enc,0 ++___ ++ } ++} ++ ++sub rbit() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++ if ($standard eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } else { ++$code.=<<___; ++ mov $dst.16b,$src.16b ++___ ++ } ++ } else { ++ if ($standard eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } ++ } ++} ++ ++sub rev32_armeb() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifdef __ARMEB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifdef __ARMEB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ + $code=<<___; + #include "arm_arch.h" + .arch armv8-a+crypto +@@ -595,6 +709,384 @@ $code.=<<___; + .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks + ___ + }}} ++ ++ ++{{{ ++my ($inp,$out,$len,$rk1,$rk2,$ivp)=map("x$_",(0..5)); ++my ($blocks)=("x2"); ++my ($enc)=("x6"); ++my ($remain)=("x7"); ++my @twx=map("x$_",(9..24)); ++my $lastBlk=("x25"); ++ ++my @tweak=map("v$_",(8..15)); ++my @dat=map("v$_",(16..23)); ++my $lastTweak=("v24"); ++ ++# x/w/v/q registers for compute tweak ++my ($magic)=("8"); ++my ($tmp0,$tmp1)=("26","27"); ++my ($qMagic,$vMagic)=("q25","v25"); ++my ($vTmp0,$vTmp1)=("v26","v27"); ++ ++sub gen_xts_do_cipher() { ++$code.=<<___; ++.globl ${prefix}_xts_do_cipher${standard} ++.type ${prefix}_xts_do_cipher${standard},%function ++.align 5 ++${prefix}_xts_do_cipher${standard}: ++ mov w$magic,0x87 ++ ldr $qMagic, =0x01010101010101010101010101010187 ++ // used to encrypt the XORed plaintext blocks ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk2],#64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk2] ++ ld1 {@tweak[0].4s}, [$ivp] ++___ ++ &rev32(@tweak[0],@tweak[0]); ++ &enc_blk(@tweak[0]); ++ &rev32(@tweak[0],@tweak[0]); 
++$code.=<<___; ++ // used to encrypt the initial vector to yield the initial tweak ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk1],#64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk1] ++ ++ and $remain,$len,#0x0F ++ // convert length into blocks ++ lsr $blocks,$len,4 ++ cmp $blocks,#1 // $len must be at least 16 ++ b.lt 99f ++ ++ cmp $remain,0 // if $len is a multiple of 16 ++ b.eq .xts_encrypt_blocks${standard} ++ // if $len is not a multiple of 16 ++ subs $blocks,$blocks,#1 ++ b.eq .only_2blks_tweak${standard} // if $len is less than 32 ++ ++.xts_encrypt_blocks${standard}: ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rev32_armeb(@tweak[0],@tweak[0]); ++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic); ++$code.=<<___; ++1: ++ cmp $blocks,#8 ++___ ++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); ++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic); ++$code.=<<___; ++ b.lt 2f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++ &rbit(@tweak[2],@tweak[2]); ++ &rbit(@tweak[3],@tweak[3]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b ++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 ++___ ++ &rbit(@tweak[4],@tweak[4]); ++ &rbit(@tweak[5],@tweak[5]); ++ &rbit(@tweak[6],@tweak[6]); ++ &rbit(@tweak[7],@tweak[7]); ++$code.=<<___; ++ eor @dat[4].16b, @dat[4].16b, @tweak[4].16b ++ eor @dat[5].16b, @dat[5].16b, @tweak[5].16b ++ eor @dat[6].16b, @dat[6].16b, @tweak[6].16b ++ eor @dat[7].16b, @dat[7].16b, @tweak[7].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ 
&rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b ++ eor @dat[4].16b, @dat[4].16b, @tweak[4].16b ++ eor @dat[5].16b, @dat[5].16b, @tweak[5].16b ++ eor @dat[6].16b, @dat[6].16b, @tweak[6].16b ++ eor @dat[7].16b, @dat[7].16b, @tweak[7].16b ++ ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[7].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $blocks,$blocks,#8 ++ b.eq 100f ++ b 1b ++2: ++ // process 4 blocks ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++ &rbit(@tweak[2],@tweak[2]); ++ &rbit(@tweak[3],@tweak[3]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ sub $blocks,$blocks,#4 ++ mov @tweak[0].16b,@tweak[4].16b ++ mov @tweak[1].16b,@tweak[5].16b ++ mov @tweak[2].16b,@tweak[6].16b ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[3].16b ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@dat[0].4s},[$inp],#16 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ st1 {@dat[0].4s},[$out],#16 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[0].16b ++ b 100f ++1: // process last 2 blocks ++ cmp $blocks,#2 ++ b.gt 1f ++ ld1 {@dat[0].4s,@dat[1].4s},[$inp],#32 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ st1 {@dat[0].4s,@dat[1].4s},[$out],#32 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[1].16b ++ b 100f ++1: // process last 3 blocks ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$inp],#48 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++ &rbit(@tweak[2],@tweak[2]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ 
&rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++$code.=<<___;
++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$out],#48
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[2].16b
++100:
++ cmp $remain,0
++ b.eq 99f
++
++// This branch calculates the last two tweaks,
++// used when the encryption/decryption length is larger than 32 bytes
++.last_2blks_tweak${standard}:
++___
++ &rev32_armeb($lastTweak,$lastTweak);
++ &compute_tweak_vec($lastTweak,@tweak[1],$vTmp0,$vTmp1,$vMagic);
++ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic);
++$code.=<<___;
++ b .check_dec${standard}
++
++
++// This branch calculates the last two tweaks,
++// used when the encryption/decryption length is less than 32 bytes, where only two tweaks are needed
++.only_2blks_tweak${standard}:
++ mov @tweak[1].16b,@tweak[0].16b
++___
++ &rev32_armeb(@tweak[1],@tweak[1]);
++ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic);
++$code.=<<___;
++ b .check_dec${standard}
++
++
++// Determine whether encryption or decryption is required.
++// The last two tweaks need to be swapped for decryption.
++.check_dec${standard}:
++ // encryption:1 decryption:0
++ cmp $enc,1
++ b.eq .process_last_2blks${standard}
++ mov $vTmp0.16b,@tweak[1].16b
++ mov @tweak[1].16b,@tweak[2].16b
++ mov @tweak[2].16b,$vTmp0.16b
++
++.process_last_2blks${standard}:
++___
++ &rev32_armeb(@tweak[1],@tweak[1]);
++ &rev32_armeb(@tweak[2],@tweak[2]);
++$code.=<<___;
++ ld1 {@dat[0].4s},[$inp],#16
++ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b
++___
++ &rev32(@dat[0],@dat[0]);
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b
++ st1 {@dat[0].4s},[$out],#16
++
++ sub $lastBlk,$out,16
++ .loop${standard}:
++ subs $remain,$remain,1
++ ldrb w$tmp0,[$lastBlk,$remain]
++ ldrb w$tmp1,[$inp,$remain]
++ strb w$tmp1,[$lastBlk,$remain]
++ strb w$tmp0,[$out,$remain]
++ b.gt .loop${standard}
++ ld1 {@dat[0].4s}, [$lastBlk]
++ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b
++___
++ &rev32(@dat[0],@dat[0]);
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b
++ st1 {@dat[0].4s}, [$lastBlk]
++99:
++ ret
++.size ${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard}
++___
++} #end of gen_xts_do_cipher
++
++}}}
++
++{{{
++my ($enc)=("w6");
++
++sub gen_xts_cipher() {
++ my $en = shift;
++$code.=<<___;
++.globl ${prefix}_xts_${en}crypt${standard}
++.type ${prefix}_xts_${en}crypt${standard},%function
++.align 5
++${prefix}_xts_${en}crypt${standard}:
++ stp x15, x16, [sp, #-0x10]!
++ stp x17, x18, [sp, #-0x10]!
++ stp x19, x20, [sp, #-0x10]!
++ stp x21, x22, [sp, #-0x10]!
++ stp x23, x24, [sp, #-0x10]!
++ stp x25, x26, [sp, #-0x10]!
++ stp x27, x28, [sp, #-0x10]!
++ stp x29, x30, [sp, #-0x10]!
++ stp d8, d9, [sp, #-0x10]!
++ stp d10, d11, [sp, #-0x10]!
++ stp d12, d13, [sp, #-0x10]!
++ stp d14, d15, [sp, #-0x10]!
++___ ++ &mov_en_to_enc($en,$enc); ++$code.=<<___; ++ bl ${prefix}_xts_do_cipher${standard} ++ ldp d14, d15, [sp], #0x10 ++ ldp d12, d13, [sp], #0x10 ++ ldp d10, d11, [sp], #0x10 ++ ldp d8, d9, [sp], #0x10 ++ ldp x29, x30, [sp], #0x10 ++ ldp x27, x28, [sp], #0x10 ++ ldp x25, x26, [sp], #0x10 ++ ldp x23, x24, [sp], #0x10 ++ ldp x21, x22, [sp], #0x10 ++ ldp x19, x20, [sp], #0x10 ++ ldp x17, x18, [sp], #0x10 ++ ldp x15, x16, [sp], #0x10 ++ ret ++.size ${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard} ++___ ++ ++} # end of gen_xts_cipher ++$standard="_gb"; ++&gen_xts_do_cipher(); ++&gen_xts_cipher("en"); ++&gen_xts_cipher("de"); ++$standard=""; ++&gen_xts_do_cipher(); ++&gen_xts_cipher("en"); ++&gen_xts_cipher("de"); ++}}} + ######################################## + { my %opcode = ( + "sm4e" => 0xcec08400, +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +index 2f5a6cf..0bde96f 100644 +--- a/include/crypto/sm4_platform.h ++++ b/include/crypto/sm4_platform.h +@@ -26,6 +26,10 @@ + # define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt + # define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt + # define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks ++# define HWSM4_xts_encrypt_gb sm4_v8_xts_encrypt_gb ++# define HWSM4_xts_decrypt_gb sm4_v8_xts_decrypt_gb ++# define HWSM4_xts_encrypt sm4_v8_xts_encrypt ++# define HWSM4_xts_decrypt sm4_v8_xts_decrypt + # endif + # endif + # endif /* OPENSSL_CPUID_OBJ */ +@@ -46,6 +50,16 @@ void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out, + void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + const unsigned char ivec[16]); ++/* xts mode in GB/T 17964-2021 */ ++void HWSM4_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++void HWSM4_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++/* xts mode in IEEE Std 1619-2007 */ ++void HWSM4_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++void HWSM4_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); + # endif /* HWSM4_CAPABLE */ + + #ifdef VPSM4_EX_CAPABLE +-- +2.36.1 + diff --git a/openssl.spec b/openssl.spec index 00beec3..427817b 100644 --- a/openssl.spec +++ b/openssl.spec @@ -2,7 +2,7 @@ Name: openssl Epoch: 1 Version: 1.1.1m -Release: 11 +Release: 12 Summary: Cryptography and SSL/TLS Toolkit License: OpenSSL and SSLeay URL: https://www.openssl.org/ @@ -34,7 +34,10 @@ Patch23: CVE-2022-2068-Fix-file-operations-in-c_rehash.patch Patch24: CVE-2022-2097-Fix-AES-OCB-encrypt-decrypt-for-x86-AES-NI.patch Patch25: Feature-add-ARMv8-implementations-of-SM4-in-ECB-and-XTS.patch Patch26: Fix-reported-performance-degradation-on-aarch64.patch -Patch27: Feature-PKCS7-sign-and-verify-support-SM2-algorithm.patch +Patch27: Feature-PKCS7-sign-and-verify-support-SM2-algorithm.patch +Patch28: Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch +Patch29: Backport-SM4-optimization-for-ARM-by-HW-instruction.patch +Patch30: Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch BuildRequires: gcc perl make lksctp-tools-devel coreutils util-linux zlib-devel Requires: coreutils %{name}-libs%{?_isa} = %{epoch}:%{version}-%{release} @@ -237,6 
+240,11 @@ make test || : %ldconfig_scriptlets libs %changelog +* Wed Nov 2 2022 Xu Yizhou - 1:1.1.1m-12 +- SM3 acceleration with SM3 hardware instruction on aarch64 +- SM4 optimization for ARM by HW instruction +- SM4 XTS optimization for ARM by HW instruction + * Wed Oct 26 2022 luhuaxin - 1:1.1.1m-11 - fix cms testcase -- Gitee