diff --git a/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch new file mode 100644 index 0000000000000000000000000000000000000000..722d5061beb145474c72e3731cdfdf6f0eee823f --- /dev/null +++ b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch @@ -0,0 +1,492 @@ +From 4d2e328357ac4b468d4762a5a5f615d7e7bf46a6 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou +Date: Thu, 27 Oct 2022 20:49:34 +0800 +Subject: [PATCH 1/3] SM3 acceleration with SM3 hardware instruction on aarch64 + +This patch contains the following two PRs, + +1. SM3 acceleration with SM3 hardware instruction on aarch64 + +SM3 hardware instruction is optional feature of crypto extension for +aarch64. This implementation accelerates SM3 via SM3 instructions. For +the platform not supporting SM3 instruction, the original C +implementation still works. Thanks to AliBaba for testing and reporting +the following perf numbers for Yitian710: + +Benchmark on T-Head Yitian-710 2.75GHz: + +Before: +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k + +After (33% - 74% faster): +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/17454) + +2. Fix sm3ss1 translation issue in sm3-armv8.pl + +Reviewed-by: Tomas Mraz +Reviewed-by: Matt Caswell +Reviewed-by: Paul Dale +(Merged from https://github.com/openssl/openssl/pull/17542) + +Signed-off-by: Xu Yizhou +--- + Configurations/00-base-templates.conf | 1 + + Configure | 4 + + crypto/arm64cpuid.pl | 7 + + crypto/arm_arch.h | 1 + + crypto/armcap.c | 10 + + crypto/sm3/asm/sm3-armv8.pl | 280 ++++++++++++++++++++++++++ + crypto/sm3/build.info | 15 +- + crypto/sm3/sm3_local.h | 16 +- + 8 files changed, 332 insertions(+), 2 deletions(-) + create mode 100644 crypto/sm3/asm/sm3-armv8.pl + +diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf +index 1d35012..a67ae65 100644 +--- a/Configurations/00-base-templates.conf ++++ b/Configurations/00-base-templates.conf +@@ -322,6 +322,7 @@ my %targets=( + poly1305_asm_src=> "poly1305-armv8.S", + keccak1600_asm_src => "keccak1600-armv8.S", + sm4_asm_src => "vpsm4_ex-armv8.S", ++ sm3_asm_src => "sm3-armv8.S", + }, + parisc11_asm => { + template => 1, +diff --git a/Configure b/Configure +index 3bfe360..fce460d 100755 +--- a/Configure ++++ b/Configure +@@ -1423,6 +1423,9 @@ unless ($disabled{asm}) { + if ($target{sm4_asm_src} ne "") { + push @{$config{lib_defines}}, "VPSM4_EX_ASM"; + } ++ if ($target{sm3_asm_src} ne "") { ++ push @{$config{lib_defines}}, "SM3_ASM"; ++ } + } + + my %predefined_C = compiler_predefined($config{CROSS_COMPILE}.$config{CC}); +@@ -3379,6 +3382,7 @@ sub print_table_entry + "multilib", + "build_scheme", + "sm4_asm_src", ++ "sm3_asm_src", + ); + + if ($type eq "TABLE") { +diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl +index 319927e..1e9b167 100755 +--- a/crypto/arm64cpuid.pl ++++ b/crypto/arm64cpuid.pl +@@ -78,6 +78,13 @@ _armv8_sha512_probe: + ret + .size _armv8_sha512_probe,.-_armv8_sha512_probe + ++.globl _armv8_sm3_probe ++.type _armv8_sm3_probe,%function ++_armv8_sm3_probe: ++ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s ++ ret ++.size _armv8_sm3_probe,.-_armv8_sm3_probe ++ + .globl OPENSSL_cleanse + .type 
OPENSSL_cleanse,%function + .align 5 +diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h +index 8b71055..8839b21 100644 +--- a/crypto/arm_arch.h ++++ b/crypto/arm_arch.h +@@ -80,5 +80,6 @@ extern unsigned int OPENSSL_armcap_P; + # define ARMV8_SHA256 (1<<4) + # define ARMV8_PMULL (1<<5) + # define ARMV8_SHA512 (1<<6) ++# define ARMV8_SM3 (1<<9) + + #endif +diff --git a/crypto/armcap.c b/crypto/armcap.c +index 48c5d4d..8b2f4a5 100644 +--- a/crypto/armcap.c ++++ b/crypto/armcap.c +@@ -47,6 +47,7 @@ void _armv8_sha1_probe(void); + void _armv8_sha256_probe(void); + void _armv8_pmull_probe(void); + # ifdef __aarch64__ ++void _armv8_sm3_probe(void); + void _armv8_sha512_probe(void); + # endif + uint32_t _armv7_tick(void); +@@ -130,6 +131,7 @@ static unsigned long getauxval(unsigned long key) + # define HWCAP_CE_PMULL (1 << 4) + # define HWCAP_CE_SHA1 (1 << 5) + # define HWCAP_CE_SHA256 (1 << 6) ++# define HWCAP_CE_SM3 (1 << 18) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -190,6 +192,9 @@ void OPENSSL_cpuid_setup(void) + # ifdef __aarch64__ + if (hwcap & HWCAP_CE_SHA512) + OPENSSL_armcap_P |= ARMV8_SHA512; ++ ++ if (hwcap & HWCAP_CE_SM3) ++ OPENSSL_armcap_P |= ARMV8_SM3; + # endif + } + # endif +@@ -233,6 +238,11 @@ void OPENSSL_cpuid_setup(void) + _armv8_sha512_probe(); + OPENSSL_armcap_P |= ARMV8_SHA512; + } ++ ++ if (sigsetjmp(ill_jmp, 1) == 0) { ++ _armv8_sm3_probe(); ++ OPENSSL_armcap_P |= ARMV8_SM3; ++ } + # endif + } + # endif +diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl +new file mode 100644 +index 0000000..677ca52 +--- /dev/null ++++ b/crypto/sm3/asm/sm3-armv8.pl +@@ -0,0 +1,280 @@ ++#! /usr/bin/env perl ++# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++# This module implements support for Armv8 SM3 instructions ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++# Message expanding: ++# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6] ++# Input: s0, s1, s2, s3 ++# s0 = w0 | w1 | w2 | w3 ++# s1 = w4 | w5 | w6 | w7 ++# s2 = w8 | w9 | w10 | w11 ++# s3 = w12 | w13 | w14 | w15 ++# Output: s4 ++sub msg_exp () { ++my $s0 = shift; ++my $s1 = shift; ++my $s2 = shift; ++my $s3 = shift; ++my $s4 = shift; ++my $vtmp1 = shift; ++my $vtmp2 = shift; ++$code.=<<___; ++ // s4 = w7 | w8 | w9 | w10 ++ ext $s4.16b, $s1.16b, $s2.16b, #12 ++ // vtmp1 = w3 | w4 | w5 | w6 ++ ext $vtmp1.16b, $s0.16b, $s1.16b, #12 ++ // vtmp2 = w10 | w11 | w12 | w13 ++ ext $vtmp2.16b, $s2.16b, $s3.16b, #8 ++ sm3partw1 $s4.4s, $s0.4s, $s3.4s ++ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s ++___ ++} ++ ++# A round of compresson function ++# Input: ++# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b ++# vstate0 - vstate1, store digest status(A - H) ++# vconst0 - vconst1, interleaved used to store Tj <<< j ++# vtmp - temporary register ++# vw - for sm3tt1ab, vw = s0 eor s1 ++# s0 - for sm3tt2ab, just be s0 ++# i, choose wj' or wj from vw ++sub round () { ++my $ab = shift; ++my $vstate0 = shift; ++my $vstate1 = shift; ++my $vconst0 = shift; ++my $vconst1 = shift; ++my $vtmp = shift; ++my $vw = shift; ++my $s0 = shift; ++my $i = shift; ++$code.=<<___; ++ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s ++ shl $vconst1.4s, $vconst0.4s, #1 ++ sri $vconst1.4s, $vconst0.4s, #31 ++ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i] ++ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i] ++___ ++} ++ ++sub qround () { ++my $ab = shift; ++my $vstate0 = shift; ++my $vstate1 = shift; ++my $vconst0 = shift; ++my $vconst1 = shift; ++my $vtmp1 = shift; ++my $vtmp2 = shift; ++my $s0 = shift; ++my $s1 = shift; ++my $s2 = shift; ++my $s3 = shift; ++my $s4 = shift; ++ if($s4) { ++ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2); ++ } ++$code.=<<___; ++ eor $vtmp1.16b, $s0.16b, $s1.16b ++___ ++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2, ++ $vtmp1, $s0, 0); ++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2, ++ $vtmp1, $s0, 1); ++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2, ++ $vtmp1, $s0, 2); ++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2, ++ $vtmp1, $s0, 3); ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8.2-a ++.text ++___ ++ ++{{{ ++my ($pstate,$pdata,$num)=("x0","x1","w2"); ++my ($state1,$state2)=("v5","v6"); ++my ($sconst1, $sconst2)=("s16","s17"); ++my ($vconst1, $vconst2)=("v16","v17"); ++my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4)); ++my ($bkstate1,$bkstate2)=("v18","v19"); ++my ($vconst_tmp1,$vconst_tmp2)=("v20","v21"); ++my ($vtmp1,$vtmp2)=("v22","v23"); ++my $constaddr="x8"; ++# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num) ++$code.=<<___; ++.globl ossl_hwsm3_block_data_order ++.type ossl_hwsm3_block_data_order,%function ++.align 5 ++ossl_hwsm3_block_data_order: ++ // load state ++ ld1 {$state1.4s-$state2.4s}, [$pstate] ++ rev64 $state1.4s, $state1.4s ++ rev64 $state2.4s, $state2.4s ++ ext $state1.16b, $state1.16b, $state1.16b, #8 ++ ext $state2.16b, $state2.16b, $state2.16b, #8 ++ ++ adr $constaddr, .Tj ++ ldp $sconst1, $sconst2, [$constaddr] ++ ++.Loop: ++ // load input ++ ld1 
{$s0.16b-$s3.16b}, [$pdata], #64 ++ sub $num, $num, #1 ++ ++ mov $bkstate1.16b, $state1.16b ++ mov $bkstate2.16b, $state2.16b ++ ++#ifndef __ARMEB__ ++ rev32 $s0.16b, $s0.16b ++ rev32 $s1.16b, $s1.16b ++ rev32 $s2.16b, $s2.16b ++ rev32 $s3.16b, $s3.16b ++#endif ++ ++ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4 ++___ ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4,$s0,$s1,$s2); ++ ++$code.=<<___; ++ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4 ++___ ++ ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0,$s1,$s2,$s3); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4,$s0,$s1,$s2); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0,$s1,$s2,$s3); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1); ++ ++$code.=<<___; ++ eor $state1.16b, $state1.16b, $bkstate1.16b ++ eor $state2.16b, $state2.16b, $bkstate2.16b ++ ++ // any remained blocks? 
++ cbnz $num, .Loop ++ ++ // save state ++ rev64 $state1.4s, $state1.4s ++ rev64 $state2.4s, $state2.4s ++ ext $state1.16b, $state1.16b, $state1.16b, #8 ++ ext $state2.16b, $state2.16b, $state2.16b, #8 ++ st1 {$state1.4s-$state2.4s}, [$pstate] ++ ret ++.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order ++ ++.align 3 ++.Tj: ++.word 0x79cc4519, 0x9d8a7a87 ++___ ++}}} ++ ++######################################### ++my %sm3partopcode = ( ++ "sm3partw1" => 0xce60C000, ++ "sm3partw2" => 0xce60C400); ++ ++my %sm3ss1opcode = ( ++ "sm3ss1" => 0xce400000); ++ ++my %sm3ttopcode = ( ++ "sm3tt1a" => 0xce408000, ++ "sm3tt1b" => 0xce408400, ++ "sm3tt2a" => 0xce408800, ++ "sm3tt2b" => 0xce408C00); ++ ++sub unsm3part { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16), ++ $mnemonic,$arg; ++} ++ ++sub unsm3ss1 { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10), ++ $mnemonic,$arg; ++} ++ ++sub unsm3tt { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12), ++ $mnemonic,$arg; ++} ++ ++open SELF,$0; ++while() { ++ next if (/^#!/); ++ last if (!s/^#/\/\// and !/^$/); ++ print; ++} ++close SELF; ++ ++foreach(split("\n",$code)) { ++ s/\`([^\`]*)\`/eval($1)/ge; ++ ++ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge; ++ s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge; ++ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge; ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info +index 6009b19..e113729 100644 +--- a/crypto/sm3/build.info ++++ b/crypto/sm3/build.info +@@ -1,2 +1,15 @@ + LIBS=../../libcrypto +-SOURCE[../../libcrypto]=sm3.c m_sm3.c ++SOURCE[../../libcrypto]=\ ++ sm3.c m_sm3.c {- $target{sm3_asm_src} -} ++ ++GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl $(PERLASM_SCHEME) ++INCLUDE[sm3-armv8.o]=.. ++ ++BEGINRAW[Makefile] ++##### SM3 assembler implementations ++ ++# GNU make "catch all" ++{- $builddir -}/sm3-%.S: {- $sourcedir -}/asm/sm3-%.pl ++ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ ++ ++ENDRAW[Makefile] +\ No newline at end of file +diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h +index 7171de5..aafff63 100644 +--- a/crypto/sm3/sm3_local.h ++++ b/crypto/sm3/sm3_local.h +@@ -32,7 +32,21 @@ + ll=(c)->G; (void)HOST_l2c(ll, (s)); \ + ll=(c)->H; (void)HOST_l2c(ll, (s)); \ + } while (0) +-#define HASH_BLOCK_DATA_ORDER sm3_block_data_order ++ ++#if defined(SM3_ASM) ++# if defined(__aarch64__) ++# include "crypto/arm_arch.h" ++# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3) ++void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num); ++# endif ++#endif ++ ++#if defined(HWSM3_CAPABLE) ++# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? 
ossl_hwsm3_block_data_order \ ++ : sm3_block_data_order) ++#else ++# define HASH_BLOCK_DATA_ORDER sm3_block_data_order ++#endif + + void sm3_transform(SM3_CTX *c, const unsigned char *data); + +-- +2.36.1 + diff --git a/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch new file mode 100644 index 0000000000000000000000000000000000000000..f525e708169046c40aebbcf61924f116e5ee2038 --- /dev/null +++ b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch @@ -0,0 +1,1032 @@ +From 4f7e522f7fda2c55c4915396d08f8c9cf3b3fba8 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou +Date: Fri, 28 Oct 2022 11:24:28 +0800 +Subject: [PATCH 2/3] SM4 optimization for ARM by HW instruction + +This patch is a copy of the following PR, with +some extra supporting code. + +1. SM4 optimization for ARM by HW instruction + +This patch implements the SM4 optimization for ARM processor, +using SM4 HW instruction, which is an optional feature of +crypto extension for aarch64 V8. + +Tested on some modern ARM micro-architectures with SM4 support, the +performance uplift can be observed around 8X~40X over existing +C implementation in openssl. Algorithms that can be parallelized +(like CTR, ECB, CBC decryption) are on higher end, with algorithm +like CBC encryption on lower end (due to inter-block dependency) + +Perf data on Yitian-710 2.75GHz hardware, before and after optimization: + +Before: +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +SM4-CTR 105787.80k 107837.87k 108380.84k 108462.08k 108549.46k 108554.92k +SM4-ECB 111924.58k 118173.76k 119776.00k 120093.70k 120264.02k 120274.94k +SM4-CBC 106428.09k 109190.98k 109674.33k 109774.51k 109827.41k 109827.41k + +After (7.4x - 36.6x faster): +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +SM4-CTR 781979.02k 2432994.28k 3437753.86k 3834177.88k 3963715.58k 3974556.33k +SM4-ECB 937590.69k 2941689.02k 3945751.81k 4328655.87k 4459181.40k 4468692.31k +SM4-CBC 890639.88k 1027746.58k 1050621.78k 1056696.66k 1058613.93k 1058701.31k + +Signed-off-by: Daniel Hu + +Reviewed-by: Paul Dale +Reviewed-by: Tomas Mraz +(Merged from https://github.com/openssl/openssl/pull/17455\) + +Signed-off-by: Xu Yizhou +--- + Configurations/00-base-templates.conf | 2 +- + Configure | 3 +- + crypto/arm64cpuid.pl | 7 + + crypto/arm_arch.h | 1 + + crypto/armcap.c | 10 + + crypto/evp/e_sm4.c | 88 ++-- + crypto/sm4/asm/sm4-armv8.pl | 629 ++++++++++++++++++++++++++ + crypto/sm4/build.info | 13 +- + include/crypto/sm4_platform.h | 70 +++ + 9 files changed, 788 insertions(+), 35 deletions(-) + create mode 100644 crypto/sm4/asm/sm4-armv8.pl + create mode 100644 include/crypto/sm4_platform.h + +diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf +index a67ae65..a26d081 100644 +--- a/Configurations/00-base-templates.conf ++++ b/Configurations/00-base-templates.conf +@@ -321,7 +321,7 @@ my %targets=( + chacha_asm_src => "chacha-armv8.S", + poly1305_asm_src=> "poly1305-armv8.S", + keccak1600_asm_src => "keccak1600-armv8.S", +- sm4_asm_src => "vpsm4_ex-armv8.S", ++ sm4_asm_src => "sm4-armv8.S vpsm4_ex-armv8.S", + sm3_asm_src => "sm3-armv8.S", + }, + parisc11_asm => { +diff --git a/Configure b/Configure +index fce460d..d013204 100755 +--- a/Configure ++++ b/Configure +@@ -1421,7 +1421,8 @@ unless ($disabled{asm}) { + push @{$config{lib_defines}}, "POLY1305_ASM"; + } + if ($target{sm4_asm_src} ne "") { +- push @{$config{lib_defines}}, "VPSM4_EX_ASM"; ++ push 
@{$config{lib_defines}}, "SM4_ASM" if ($target{sm4_asm_src} =~ m/sm4/); ++ push @{$config{lib_defines}}, "VPSM4_EX_ASM" if ($target{sm4_asm_src} =~ m/vpsm4_ex/); + } + if ($target{sm3_asm_src} ne "") { + push @{$config{lib_defines}}, "SM3_ASM"; +diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl +index 1e9b167..341167b 100755 +--- a/crypto/arm64cpuid.pl ++++ b/crypto/arm64cpuid.pl +@@ -71,6 +71,13 @@ _armv8_pmull_probe: + ret + .size _armv8_pmull_probe,.-_armv8_pmull_probe + ++.globl _armv8_sm4_probe ++.type _armv8_sm4_probe,%function ++_armv8_sm4_probe: ++ .long 0xcec08400 // sm4e v0.4s, v0.4s ++ ret ++.size _armv8_sm4_probe,.-_armv8_sm4_probe ++ + .globl _armv8_sha512_probe + .type _armv8_sha512_probe,%function + _armv8_sha512_probe: +diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h +index 8839b21..0f6f7ca 100644 +--- a/crypto/arm_arch.h ++++ b/crypto/arm_arch.h +@@ -81,5 +81,6 @@ extern unsigned int OPENSSL_armcap_P; + # define ARMV8_PMULL (1<<5) + # define ARMV8_SHA512 (1<<6) + # define ARMV8_SM3 (1<<9) ++# define ARMV8_SM4 (1<<10) + + #endif +diff --git a/crypto/armcap.c b/crypto/armcap.c +index 8b2f4a5..73bcad1 100644 +--- a/crypto/armcap.c ++++ b/crypto/armcap.c +@@ -48,6 +48,7 @@ void _armv8_sha256_probe(void); + void _armv8_pmull_probe(void); + # ifdef __aarch64__ + void _armv8_sm3_probe(void); ++void _armv8_sm4_probe(void); + void _armv8_sha512_probe(void); + # endif + uint32_t _armv7_tick(void); +@@ -132,6 +133,7 @@ static unsigned long getauxval(unsigned long key) + # define HWCAP_CE_SHA1 (1 << 5) + # define HWCAP_CE_SHA256 (1 << 6) + # define HWCAP_CE_SM3 (1 << 18) ++# define HWCAP_CE_SM4 (1 << 19) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -190,6 +192,9 @@ void OPENSSL_cpuid_setup(void) + OPENSSL_armcap_P |= ARMV8_SHA256; + + # ifdef __aarch64__ ++ if (hwcap & HWCAP_CE_SM4) ++ OPENSSL_armcap_P |= ARMV8_SM4; ++ + if (hwcap & HWCAP_CE_SHA512) + OPENSSL_armcap_P |= ARMV8_SHA512; + +@@ -234,6 +239,11 @@ void OPENSSL_cpuid_setup(void) + OPENSSL_armcap_P |= ARMV8_SHA256; + } + # if defined(__aarch64__) && !defined(__APPLE__) ++ if (sigsetjmp(ill_jmp, 1) == 0) { ++ _armv8_sm4_probe(); ++ OPENSSL_armcap_P |= ARMV8_SM4; ++ } ++ + if (sigsetjmp(ill_jmp, 1) == 0) { + _armv8_sha512_probe(); + OPENSSL_armcap_P |= ARMV8_SHA512; +diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c +index 169d6c7..eaa5ba0 100644 +--- a/crypto/evp/e_sm4.c ++++ b/crypto/evp/e_sm4.c +@@ -15,17 +15,11 @@ + # include + # include "crypto/sm4.h" + # include "crypto/evp.h" ++# include "crypto/sm4_platform.h" + # include "evp_local.h" + # include "modes_local.h" + +-#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__)) +-# include "arm_arch.h" +-# if __ARM_MAX_ARCH__>=7 +-# if defined(VPSM4_EX_ASM) +-# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) +-# endif +-# endif +-#endif ++ + + typedef struct { + union { +@@ -35,28 +29,11 @@ typedef struct { + block128_f block; + union { + ecb128_f ecb; ++ cbc128_f cbc; ++ ctr128_f ctr; + } stream; + } EVP_SM4_KEY; + +-#ifdef VPSM4_EX_CAPABLE +-void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +-void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +-#define vpsm4_ex_encrypt SM4_encrypt +-#define vpsm4_ex_decrypt SM4_encrypt +-void vpsm4_ex_ecb_encrypt( +- const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key, const int enc); +-/* xts mode in GB/T 17964-2021 */ +-void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, 
size_t length, const SM4_KEY *key1, +- const SM4_KEY *key2, const uint8_t iv[16]); +-void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, +- const SM4_KEY *key2, const uint8_t iv[16]); +-/* xts mode in IEEE Std 1619-2007 */ +-void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, +- const SM4_KEY *key2, const uint8_t iv[16]); +-void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, +- const SM4_KEY *key2, const uint8_t iv[16]); +-#endif +- + # define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \ + static const EVP_CIPHER sm4_##mode = { \ + nid##_##nmode,blocksize,128/8,ivlen, \ +@@ -84,6 +61,21 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + + mode = EVP_CIPHER_CTX_mode(ctx); + if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) { ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ HWSM4_set_decrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) HWSM4_decrypt; ++ dat->stream.cbc = NULL; ++# ifdef HWSM4_cbc_encrypt ++ if (mode == EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; ++# endif ++# ifdef HWSM4_ecb_encrypt ++ if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; ++# endif ++ } else ++#endif + #ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_decrypt_key(key, &dat->ks.ks); +@@ -97,6 +89,29 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + } + } else { ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ HWSM4_set_encrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) HWSM4_encrypt; ++ dat->stream.cbc = NULL; ++# ifdef HWSM4_cbc_encrypt ++ if (mode == EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; ++ else ++# endif ++# ifdef HWSM4_ecb_encrypt ++ if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; ++ else ++# endif ++# ifdef HWSM4_ctr32_encrypt_blocks ++ if (mode == EVP_CIPH_CTR_MODE) ++ dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks; ++ else ++# endif ++ (void)0; /* terminate potentially open 'else' */ ++ } else ++#endif + #ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_encrypt_key(key, &dat->ks.ks); +@@ -118,7 +133,10 @@ static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + { + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + +- if (EVP_CIPHER_CTX_encrypting(ctx)) ++ if (dat->stream.cbc) ++ (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv, ++ EVP_CIPHER_CTX_encrypting(ctx)); ++ else if (EVP_CIPHER_CTX_encrypting(ctx)) + CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); + else +@@ -183,10 +201,16 @@ static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + return 0; + num = (unsigned int)n; + +- CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, +- ctx->iv, +- EVP_CIPHER_CTX_buf_noconst(ctx), &num, +- dat->block); ++ if (dat->stream.ctr) ++ CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks, ++ ctx->iv, ++ EVP_CIPHER_CTX_buf_noconst(ctx), ++ &num, dat->stream.ctr); ++ else ++ CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, ++ ctx->iv, ++ EVP_CIPHER_CTX_buf_noconst(ctx), &num, ++ dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + return 1; + } +diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl +new file mode 100644 +index 0000000..dbacad2 
+--- /dev/null ++++ b/crypto/sm4/asm/sm4-armv8.pl +@@ -0,0 +1,629 @@ ++#! /usr/bin/env perl ++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# This module implements support for SM4 hw support on aarch64 ++# Oct 2021 ++# ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++$prefix="sm4_v8"; ++my @rks=map("v$_",(0..7)); ++ ++sub rev32() { ++my $dst = shift; ++my $src = shift; ++$code.=<<___; ++#ifndef __ARMEB__ ++ rev32 $dst.16b,$src.16b ++#endif ++___ ++} ++ ++sub enc_blk () { ++my $data = shift; ++$code.=<<___; ++ sm4e $data.4s,@rks[0].4s ++ sm4e $data.4s,@rks[1].4s ++ sm4e $data.4s,@rks[2].4s ++ sm4e $data.4s,@rks[3].4s ++ sm4e $data.4s,@rks[4].4s ++ sm4e $data.4s,@rks[5].4s ++ sm4e $data.4s,@rks[6].4s ++ sm4e $data.4s,@rks[7].4s ++ rev64 $data.4S,$data.4S ++ ext $data.16b,$data.16b,$data.16b,#8 ++___ ++} ++ ++sub enc_4blks () { ++my $data0 = shift; ++my $data1 = shift; ++my $data2 = shift; ++my $data3 = shift; ++$code.=<<___; ++ sm4e $data0.4s,@rks[0].4s ++ sm4e $data1.4s,@rks[0].4s ++ sm4e $data2.4s,@rks[0].4s ++ sm4e $data3.4s,@rks[0].4s ++ ++ sm4e $data0.4s,@rks[1].4s ++ sm4e $data1.4s,@rks[1].4s ++ sm4e $data2.4s,@rks[1].4s ++ sm4e $data3.4s,@rks[1].4s ++ ++ sm4e $data0.4s,@rks[2].4s ++ sm4e $data1.4s,@rks[2].4s ++ sm4e $data2.4s,@rks[2].4s ++ sm4e $data3.4s,@rks[2].4s ++ ++ sm4e $data0.4s,@rks[3].4s ++ sm4e $data1.4s,@rks[3].4s ++ sm4e $data2.4s,@rks[3].4s ++ sm4e $data3.4s,@rks[3].4s ++ ++ sm4e $data0.4s,@rks[4].4s ++ sm4e $data1.4s,@rks[4].4s ++ sm4e $data2.4s,@rks[4].4s ++ sm4e $data3.4s,@rks[4].4s ++ ++ sm4e $data0.4s,@rks[5].4s ++ sm4e $data1.4s,@rks[5].4s ++ sm4e $data2.4s,@rks[5].4s ++ sm4e $data3.4s,@rks[5].4s ++ ++ sm4e $data0.4s,@rks[6].4s ++ sm4e $data1.4s,@rks[6].4s ++ sm4e $data2.4s,@rks[6].4s ++ sm4e $data3.4s,@rks[6].4s ++ ++ sm4e $data0.4s,@rks[7].4s ++ rev64 $data0.4S,$data0.4S ++ sm4e $data1.4s,@rks[7].4s ++ ext $data0.16b,$data0.16b,$data0.16b,#8 ++ rev64 $data1.4S,$data1.4S ++ sm4e $data2.4s,@rks[7].4s ++ ext $data1.16b,$data1.16b,$data1.16b,#8 ++ rev64 $data2.4S,$data2.4S ++ sm4e $data3.4s,@rks[7].4s ++ ext $data2.16b,$data2.16b,$data2.16b,#8 ++ rev64 $data3.4S,$data3.4S ++ ext $data3.16b,$data3.16b,$data3.16b,#8 ++___ ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8-a+crypto ++.text ++___ ++ ++{{{ ++$code.=<<___; ++.align 6 ++.Lck: ++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 ++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 ++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 ++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 ++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 ++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 ++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 ++ .long 0x10171E25, 
0x2C333A41, 0x484F565D, 0x646B7279 ++.Lfk: ++ .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++my ($tmp)=("x2"); ++my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7)); ++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); ++my ($fkconst) = ("v24"); ++$code.=<<___; ++.globl ${prefix}_set_encrypt_key ++.type ${prefix}_set_encrypt_key,%function ++.align 5 ++${prefix}_set_encrypt_key: ++ ld1 {$key0.4s},[$key] ++ adr $tmp,.Lfk ++ ld1 {$fkconst.4s},[$tmp] ++ adr $tmp,.Lck ++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 ++___ ++ &rev32($key0, $key0); ++$code.=<<___; ++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] ++ eor $key0.16b,$key0.16b,$fkconst.16b; ++ sm4ekey $key0.4S,$key0.4S,$const0.4S ++ sm4ekey $key1.4S,$key0.4S,$const1.4S ++ sm4ekey $key2.4S,$key1.4S,$const2.4S ++ sm4ekey $key3.4S,$key2.4S,$const3.4S ++ sm4ekey $key4.4S,$key3.4S,$const4.4S ++ st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64 ++ sm4ekey $key5.4S,$key4.4S,$const5.4S ++ sm4ekey $key6.4S,$key5.4S,$const6.4S ++ sm4ekey $key7.4S,$key6.4S,$const7.4S ++ st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys] ++ ret ++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++my ($tmp)=("x2"); ++my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7)); ++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); ++my ($fkconst) = ("v24"); ++$code.=<<___; ++.globl ${prefix}_set_decrypt_key ++.type ${prefix}_set_decrypt_key,%function ++.align 5 ++${prefix}_set_decrypt_key: ++ ld1 {$key0.4s},[$key] ++ adr $tmp,.Lfk ++ ld1 {$fkconst.4s},[$tmp] ++ adr $tmp, .Lck ++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 ++___ ++ &rev32($key0, $key0); ++$code.=<<___; ++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] ++ eor $key0.16b, $key0.16b,$fkconst.16b; ++ sm4ekey $key0.4S,$key0.4S,$const0.4S ++ sm4ekey $key1.4S,$key0.4S,$const1.4S ++ sm4ekey $key2.4S,$key1.4S,$const2.4S ++ rev64 $key0.4s,$key0.4s ++ rev64 $key1.4s,$key1.4s ++ ext $key0.16b,$key0.16b,$key0.16b,#8 ++ ext $key1.16b,$key1.16b,$key1.16b,#8 ++ sm4ekey $key3.4S,$key2.4S,$const3.4S ++ sm4ekey $key4.4S,$key3.4S,$const4.4S ++ rev64 $key2.4s,$key2.4s ++ rev64 $key3.4s,$key3.4s ++ ext $key2.16b,$key2.16b,$key2.16b,#8 ++ ext $key3.16b,$key3.16b,$key3.16b,#8 ++ sm4ekey $key5.4S,$key4.4S,$const5.4S ++ sm4ekey $key6.4S,$key5.4S,$const6.4S ++ rev64 $key4.4s,$key4.4s ++ rev64 $key5.4s,$key5.4s ++ ext $key4.16b,$key4.16b,$key4.16b,#8 ++ ext $key5.16b,$key5.16b,$key5.16b,#8 ++ sm4ekey $key7.4S,$key6.4S,$const7.4S ++ rev64 $key6.4s, $key6.4s ++ rev64 $key7.4s, $key7.4s ++ ext $key6.16b,$key6.16b,$key6.16b,#8 ++ ext $key7.16b,$key7.16b,$key7.16b,#8 ++ st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64 ++ st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys] ++ ret ++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key ++___ ++}}} ++ ++{{{ ++sub gen_block () { ++my $dir = shift; ++my ($inp,$out,$rk)=map("x$_",(0..2)); ++my ($data)=("v16"); ++$code.=<<___; ++.globl ${prefix}_${dir}crypt ++.type ${prefix}_${dir}crypt,%function ++.align 5 ++${prefix}_${dir}crypt: ++ ld1 {$data.4s},[$inp] ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++___ ++ &rev32($data,$data); ++ &enc_blk($data); ++ &rev32($data,$data); ++$code.=<<___; ++ st1 {$data.4s},[$out] ++ ret ++.size 
${prefix}_${dir}crypt,.-${prefix}_${dir}crypt ++___ ++} ++ ++&gen_block("en"); ++&gen_block("de"); ++}}} ++ ++{{{ ++my ($inp,$out,$len,$rk)=map("x$_",(0..3)); ++my ($enc) = ("w4"); ++my @dat=map("v$_",(16..23)); ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++1: ++ cmp $len,#64 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 ++ cmp $len,#128 ++ b.lt 2f ++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 ++ // 8 blocks ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++$code.=<<___; ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++___ ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $len,$len,#128 ++ b.gt 1b ++ ret ++ // 4 blocks ++2: ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#64 ++ b.gt 1b ++1: ++ subs $len,$len,#16 ++ b.lt 1f ++ ld1 {@dat[0].4s},[$inp],#16 ++___ ++ &rev32(@dat[0],@dat[0]); ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ st1 {@dat[0].4s},[$out],#16 ++ b.ne 1b ++1: ++ ret ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ ++}}} ++ ++{{{ ++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); ++my ($enc) = ("w5"); ++my @dat=map("v$_",(16..23)); ++my @in=map("v$_",(24..31)); ++my ($ivec) = ("v8"); ++$code.=<<___; ++.globl ${prefix}_cbc_encrypt ++.type ${prefix}_cbc_encrypt,%function ++.align 5 ++${prefix}_cbc_encrypt: ++ stp d8,d9,[sp, #-16]! 
++ ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++ ld1 {$ivec.4s},[$ivp] ++ cmp $enc,#0 ++ b.eq .Ldec ++1: ++ cmp $len, #64 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++___ ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &enc_blk(@dat[0]); ++$code.=<<___; ++ eor @dat[1].16b,@dat[1].16b,@dat[0].16b ++___ ++ &enc_blk(@dat[1]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor @dat[2].16b,@dat[2].16b,@dat[1].16b ++___ ++ &enc_blk(@dat[2]); ++ &rev32(@dat[1],@dat[1]); ++$code.=<<___; ++ eor @dat[3].16b,@dat[3].16b,@dat[2].16b ++___ ++ &enc_blk(@dat[3]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ mov $ivec.16b,@dat[3].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#64 ++ b.ne 1b ++1: ++ subs $len,$len,#16 ++ b.lt 3f ++ ld1 {@dat[0].4s},[$inp],#16 ++ eor $ivec.16b,$ivec.16b,@dat[0].16b ++___ ++ &rev32($ivec,$ivec); ++ &enc_blk($ivec); ++ &rev32($ivec,$ivec); ++$code.=<<___; ++ st1 {$ivec.16b},[$out],#16 ++ b.ne 1b ++ b 3f ++.Ldec: ++1: ++ cmp $len, #64 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp] ++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 ++ cmp $len,#128 ++ b.lt 2f ++ // 8 blocks mode ++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp] ++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],$dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],$dat[7]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++ eor @dat[1].16b,@dat[1].16b,@in[0].16b ++ eor @dat[2].16b,@dat[2].16b,@in[1].16b ++ mov $ivec.16b,@in[7].16b ++ eor @dat[3].16b,$dat[3].16b,@in[2].16b ++ eor @dat[4].16b,$dat[4].16b,@in[3].16b ++ eor @dat[5].16b,$dat[5].16b,@in[4].16b ++ eor @dat[6].16b,$dat[6].16b,@in[5].16b ++ eor @dat[7].16b,$dat[7].16b,@in[6].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $len,$len,128 ++ b.gt 1b ++ b 3f ++ // 4 blocks mode ++2: ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],$dat[3]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++ eor @dat[1].16b,@dat[1].16b,@in[0].16b ++ mov $ivec.16b,@in[3].16b ++ eor @dat[2].16b,@dat[2].16b,@in[1].16b ++ eor @dat[3].16b,$dat[3].16b,@in[2].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#64 ++ b.gt 1b ++1: ++ subs $len,$len,#16 ++ b.lt 3f ++ ld1 {@dat[0].4s},[$inp],#16 ++ mov @in[0].16b,@dat[0].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++ mov $ivec.16b,@in[0].16b ++ st1 
{@dat[0].16b},[$out],#16 ++ b.ne 1b ++3: ++ // save back IV ++ st1 {$ivec.16b},[$ivp] ++ ldp d8,d9,[sp],#16 ++ ret ++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt ++___ ++}}} ++ ++{{{ ++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); ++my ($ctr)=("w5"); ++my @dat=map("v$_",(16..23)); ++my @in=map("v$_",(24..31)); ++my ($ivec)=("v8"); ++$code.=<<___; ++.globl ${prefix}_ctr32_encrypt_blocks ++.type ${prefix}_ctr32_encrypt_blocks,%function ++.align 5 ++${prefix}_ctr32_encrypt_blocks: ++ stp d8,d9,[sp, #-16]! ++ ++ ld1 {$ivec.4s},[$ivp] ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++___ ++ &rev32($ivec,$ivec); ++$code.=<<___; ++ mov $ctr,$ivec.s[3] ++1: ++ cmp $len,#4 ++ b.lt 1f ++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 ++ mov @dat[0].16b,$ivec.16b ++ mov @dat[1].16b,$ivec.16b ++ mov @dat[2].16b,$ivec.16b ++ mov @dat[3].16b,$ivec.16b ++ add $ctr,$ctr,#1 ++ mov $dat[1].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[2].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[3].s[3],$ctr ++ cmp $len,#8 ++ b.lt 2f ++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 ++ mov @dat[4].16b,$ivec.16b ++ mov @dat[5].16b,$ivec.16b ++ mov @dat[6].16b,$ivec.16b ++ mov @dat[7].16b,$ivec.16b ++ add $ctr,$ctr,#1 ++ mov $dat[4].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[5].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[6].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[7].s[3],$ctr ++___ ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,@in[0].16b ++ eor @dat[1].16b,@dat[1].16b,@in[1].16b ++ eor @dat[2].16b,@dat[2].16b,@in[2].16b ++ eor @dat[3].16b,@dat[3].16b,@in[3].16b ++ eor @dat[4].16b,@dat[4].16b,@in[4].16b ++ eor @dat[5].16b,@dat[5].16b,@in[5].16b ++ eor @dat[6].16b,@dat[6].16b,@in[6].16b ++ eor @dat[7].16b,@dat[7].16b,@in[7].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $len,$len,#8 ++ b.eq 3f ++ add $ctr,$ctr,#1 ++ mov $ivec.s[3],$ctr ++ b 1b ++2: ++___ ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,@in[0].16b ++ eor @dat[1].16b,@dat[1].16b,@in[1].16b ++ eor @dat[2].16b,@dat[2].16b,@in[2].16b ++ eor @dat[3].16b,@dat[3].16b,@in[3].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#4 ++ b.eq 3f ++ add $ctr,$ctr,#1 ++ mov $ivec.s[3],$ctr ++ b 1b ++1: ++ subs $len,$len,#1 ++ b.lt 3f ++ mov $dat[0].16b,$ivec.16b ++ ld1 {@in[0].4s},[$inp],#16 ++___ ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor $dat[0].16b,$dat[0].16b,@in[0].16b ++ st1 {$dat[0].4s},[$out],#16 ++ b.eq 3f ++ add $ctr,$ctr,#1 ++ mov $ivec.s[3],$ctr ++ b 1b ++3: ++ ldp d8,d9,[sp],#16 ++ ret ++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ++___ ++}}} ++######################################## ++{ my %opcode = ( ++ "sm4e" => 0xcec08400, ++ "sm4ekey" => 0xce60c800); ++ ++ sub unsm4 { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", 
++ $opcode{$mnemonic}|$1|($2<<5)|($3<<16), ++ $mnemonic,$arg; ++ } ++} ++ ++open SELF,$0; ++while() { ++ next if (/^#!/); ++ last if (!s/^#/\/\// and !/^$/); ++ print; ++} ++close SELF; ++ ++foreach(split("\n",$code)) { ++ s/\`([^\`]*)\`/eval($1)/ge; ++ ++ s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge; ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info +index bb042c5..4d26ede 100644 +--- a/crypto/sm4/build.info ++++ b/crypto/sm4/build.info +@@ -2,6 +2,17 @@ LIBS=../../libcrypto + SOURCE[../../libcrypto]=\ + sm4.c {- $target{sm4_asm_src} -} + ++GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl $(PERLASM_SCHEME) ++INCLUDE[sm4-armv8.o]=.. + + GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl $(PERLASM_SCHEME) +-INCLUDE[vpsm4_ex-armv8.o]=.. +\ No newline at end of file ++INCLUDE[vpsm4_ex-armv8.o]=.. ++ ++BEGINRAW[Makefile] ++##### SM4 assembler implementations ++ ++# GNU make "catch all" ++{- $builddir -}/sm4-%.S: {- $sourcedir -}/asm/sm4-%.pl ++ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ ++ ++ENDRAW[Makefile] +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +new file mode 100644 +index 0000000..2f5a6cf +--- /dev/null ++++ b/include/crypto/sm4_platform.h +@@ -0,0 +1,70 @@ ++/* ++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++#ifndef OSSL_SM4_PLATFORM_H ++# define OSSL_SM4_PLATFORM_H ++# pragma once ++ ++# if defined(OPENSSL_CPUID_OBJ) ++# if (defined(__arm__) || defined(__arm) || defined(__aarch64__)) ++# include "arm_arch.h" ++# if __ARM_MAX_ARCH__>=7 ++# if defined(VPSM4_EX_ASM) ++# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) ++# endif ++# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4) ++# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key ++# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key ++# define HWSM4_encrypt sm4_v8_encrypt ++# define HWSM4_decrypt sm4_v8_decrypt ++# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt ++# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt ++# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks ++# endif ++# endif ++# endif /* OPENSSL_CPUID_OBJ */ ++ ++# if defined(HWSM4_CAPABLE) ++int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); ++int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); ++void HWSM4_encrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void HWSM4_decrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ unsigned char *ivec, const int enc); ++void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ const int enc); ++void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ const unsigned char ivec[16]); ++# endif /* HWSM4_CAPABLE */ ++ ++#ifdef VPSM4_EX_CAPABLE ++void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); ++void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); ++#define vpsm4_ex_encrypt SM4_encrypt ++#define vpsm4_ex_decrypt SM4_encrypt ++void vpsm4_ex_ecb_encrypt( ++ const unsigned char *in, 
unsigned char *out, size_t length, const SM4_KEY *key, const int enc); ++/* xts mode in GB/T 17964-2021 */ ++void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++/* xts mode in IEEE Std 1619-2007 */ ++void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++#endif /* VPSM4_EX_CAPABLE */ ++ ++#endif /* OSSL_SM4_PLATFORM_H */ +\ No newline at end of file +-- +2.36.1 + diff --git a/Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch b/Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch new file mode 100644 index 0000000000000000000000000000000000000000..d7294be2e80da776deb871265b166426e5bb630f --- /dev/null +++ b/Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch @@ -0,0 +1,620 @@ +From 9ce1668b573f089546ef7b846a67cabed2d5203a Mon Sep 17 00:00:00 2001 +From: Xu Yizhou +Date: Fri, 28 Oct 2022 16:04:51 +0800 +Subject: [PATCH 3/3] SM4 XTS optimization for ARM by HW instruction + +This patch implements the SM4 XTSoptimization for ARM processor, +using SM4 HW instruction, which is an optional feature of +crypto extension for aarch64 V8. + +Signed-off-by: Xu Yizhou +--- + crypto/evp/e_sm4.c | 28 ++ + crypto/sm4/asm/sm4-armv8.pl | 497 +++++++++++++++++++++++++++++++++- + include/crypto/sm4_platform.h | 14 + + 3 files changed, 536 insertions(+), 3 deletions(-) + +diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c +index eaa5ba0..da4dbd3 100644 +--- a/crypto/evp/e_sm4.c ++++ b/crypto/evp/e_sm4.c +@@ -281,6 +281,34 @@ static int sm4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2; + xctx->stream_gb = NULL; + xctx->stream = NULL; ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ if (enc) { ++ HWSM4_set_encrypt_key(key, &xctx->ks1.ks); ++ xctx->xts.block1 = (block128_f) HWSM4_encrypt; ++# ifdef HWSM4_xts_encrypt_gb ++ xctx->stream_gb = HWSM4_xts_encrypt_gb; ++# endif ++# ifdef HWSM4_xts_encrypt ++ xctx->stream = HWSM4_xts_encrypt; ++# endif ++ } else { ++ HWSM4_set_decrypt_key(key, &xctx->ks1.ks); ++ xctx->xts.block1 = (block128_f) HWSM4_decrypt; ++# ifdef HWSM4_xts_decrypt_gb ++ xctx->stream_gb = HWSM4_xts_decrypt_gb; ++# endif ++# ifdef HWSM4_xts_decrypt ++ xctx->stream = HWSM4_xts_decrypt; ++# endif ++ } ++ HWSM4_set_encrypt_key(key + bytes, &xctx->ks2.ks); ++ xctx->xts.block2 = (block128_f) HWSM4_encrypt; ++ ++ xctx->xts.key1 = &xctx->ks1; ++ break; ++ } else ++#endif + #ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + if (enc) { +diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl +index dbacad2..0faa890 100644 +--- a/crypto/sm4/asm/sm4-armv8.pl ++++ b/crypto/sm4/asm/sm4-armv8.pl +@@ -11,9 +11,9 @@ + # Oct 2021 + # + +-# $output is the last argument if it looks like a file (it has an extension) ++# $outut is the last argument if it looks like a file (it has an extension) + # $flavour is the first argument if it doesn't look like a file +-$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$outut = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? 
pop : undef; + $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +@@ -21,7 +21,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + +-open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++open OUT,"| \"$^X\" $xlate $flavour \"$outut\"" + or die "can't call $xlate: $!"; + *STDOUT=*OUT; + +@@ -110,6 +110,120 @@ $code.=<<___; + ___ + } + ++sub mov_reg_to_vec() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $desv = shift; ++$code.=<<___; ++ mov $desv.d[0],$src0 ++ mov $desv.d[1],$src1 ++#ifdef __ARMEB__ ++ rev32 $desv.16b,$desv.16b ++#endif ++___ ++} ++ ++sub mov_vec_to_reg() { ++ my $srcv = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $des0,$srcv.d[0] ++ mov $des1,$srcv.d[1] ++___ ++} ++ ++sub compute_tweak() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++ my $tmp0 = shift; ++ my $tmp1 = shift; ++ my $magic = shift; ++$code.=<<___; ++ extr x$tmp1,$src1,$src1,#32 ++ extr $des1,$src1,$src0,#63 ++ and w$tmp0,w$magic,w$tmp1,asr#31 ++ eor $des0,x$tmp0,$src0,lsl#1 ++___ ++} ++ ++sub compute_tweak_vec() { ++ my $src = shift; ++ my $des = shift; ++ my $tmp0 = shift; ++ my $tmp1 = shift; ++ my $magic = shift; ++ &rbit($tmp1,$src); ++$code.=<<___; ++ shl $des.16b, $tmp1.16b, #1 ++ ext $tmp0.16b, $tmp1.16b, $tmp1.16b,#15 ++ ushr $tmp0.16b, $tmp0.16b, #7 ++ mul $tmp0.16b, $tmp0.16b, $magic.16b ++ eor $des.16b, $des.16b, $tmp0.16b ++___ ++ &rbit($des,$des); ++} ++ ++sub mov_en_to_enc(){ ++ my $en = shift; ++ my $enc = shift; ++ if ($en eq "en") { ++$code.=<<___; ++ mov $enc,1 ++___ ++ } else { ++$code.=<<___; ++ mov $enc,0 ++___ ++ } ++} ++ ++sub rbit() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++ if ($standard eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } else { ++$code.=<<___; ++ mov $dst.16b,$src.16b ++___ ++ } ++ } else { ++ if ($standard eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } ++ } ++} ++ ++sub rev32_armeb() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifdef __ARMEB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifdef __ARMEB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ + $code=<<___; + #include "arm_arch.h" + .arch armv8-a+crypto +@@ -595,6 +709,383 @@ $code.=<<___; + .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks + ___ + }}} ++ ++ ++{{{ ++my ($inp,$out,$len,$rk1,$rk2,$ivp)=map("x$_",(0..5)); ++my ($blocks)=("x2"); ++my ($enc)=("x6"); ++my ($remain)=("x7"); ++my @twx=map("x$_",(9..24)); ++my $lastBlk=("x25"); ++ ++my @tweak=map("v$_",(8..15)); ++my @dat=map("v$_",(16..23)); ++my $lastTweak=("v24"); ++ ++# x/w/v/q registers for compute tweak ++my ($magic)=("8"); ++my ($tmp0,$tmp1)=("26","27"); ++my ($qMagic,$vMagic)=("q25","v25"); ++my ($vTmp0,$vTmp1)=("v26","v27"); ++ ++sub gen_xts_do_cipher() { ++$code.=<<___; ++.globl ${prefix}_xts_do_cipher${standard} ++.type ${prefix}_xts_do_cipher${standard},%function ++.align 5 ++${prefix}_xts_do_cipher${standard}: ++ mov w$magic,0x87 ++ ldr $qMagic, =0x01010101010101010101010101010187 ++ // used to encrypt the XORed plaintext blocks ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk2],#64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk2] ++ ld1 {@tweak[0].4s}, [$ivp] ++___ ++ 
&rev32(@tweak[0],@tweak[0]); ++ &enc_blk(@tweak[0]); ++ &rev32(@tweak[0],@tweak[0]); ++$code.=<<___; ++ // used to encrypt the initial vector to yield the initial tweak ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk1],#64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk1] ++ ++ and $remain,$len,#0x0F ++ // convert length into blocks ++ lsr $blocks,$len,4 ++ cmp $blocks,#1 // $len must be at least 16 ++ b.lt 99f ++ ++ cmp $remain,0 // if $len is a multiple of 16 ++ b.eq .xts_encrypt_blocks${standard} ++ // if $len is not a multiple of 16 ++ subs $blocks,$blocks,#1 ++ b.eq .only_2blks_tweak${standard} // if $len is less than 32 ++ ++.xts_encrypt_blocks${standard}: ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic); ++$code.=<<___; ++1: ++ cmp $blocks,#8 ++___ ++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); ++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic); ++ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic); ++$code.=<<___; ++ b.lt 2f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++ &rbit(@tweak[2],@tweak[2]); ++ &rbit(@tweak[3],@tweak[3]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b ++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 ++___ ++ &rbit(@tweak[4],@tweak[4]); ++ &rbit(@tweak[5],@tweak[5]); ++ &rbit(@tweak[6],@tweak[6]); ++ &rbit(@tweak[7],@tweak[7]); ++$code.=<<___; ++ eor @dat[4].16b, @dat[4].16b, @tweak[4].16b ++ eor @dat[5].16b, @dat[5].16b, @tweak[5].16b ++ eor @dat[6].16b, @dat[6].16b, @tweak[6].16b ++ eor @dat[7].16b, @dat[7].16b, @tweak[7].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ 
&rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b ++ eor @dat[4].16b, @dat[4].16b, @tweak[4].16b ++ eor @dat[5].16b, @dat[5].16b, @tweak[5].16b ++ eor @dat[6].16b, @dat[6].16b, @tweak[6].16b ++ eor @dat[7].16b, @dat[7].16b, @tweak[7].16b ++ ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[7].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $blocks,$blocks,#8 ++ b.eq 100f ++ b 1b ++2: ++ // process 4 blocks ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++ &rbit(@tweak[2],@tweak[2]); ++ &rbit(@tweak[3],@tweak[3]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++ eor @dat[3].16b, @dat[3].16b, @tweak[3].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ sub $blocks,$blocks,#4 ++ mov @tweak[0].16b,@tweak[4].16b ++ mov @tweak[1].16b,@tweak[5].16b ++ mov @tweak[2].16b,@tweak[6].16b ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[3].16b ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@dat[0].4s},[$inp],#16 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ st1 {@dat[0].4s},[$out],#16 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[0].16b ++ b 100f ++1: // process last 2 blocks ++ cmp $blocks,#2 ++ b.gt 1f ++ ld1 {@dat[0].4s,@dat[1].4s},[$inp],#32 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ st1 {@dat[0].4s,@dat[1].4s},[$out],#32 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[1].16b ++ b 100f ++1: // process last 3 blocks ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$inp],#48 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++ &rbit(@tweak[2],@tweak[2]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ 
&rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b ++ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b ++ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$out],#48 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[2].16b ++100: ++ cmp $remain,0 ++ b.eq 99f ++ ++// This brance calculates the last two tweaks, ++// while the encryption/decryption length is larger than 32 ++.last_2blks_tweak${standard}: ++___ ++ &rev32_armeb($lastTweak,$lastTweak); ++ &compute_tweak_vec($lastTweak,@tweak[1],$vTmp0,$vTmp1,$vMagic); ++ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic); ++$code.=<<___; ++ b .check_dec${standard} ++ ++ ++// This brance calculates the last two tweaks, ++// while the encryption/decryption length is less than 32, who only need two tweaks ++.only_2blks_tweak${standard}: ++ mov @tweak[1].16b,@tweak[0].16b ++___ ++ &rev32_armeb(@tweak[1],@tweak[1]); ++ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic); ++$code.=<<___; ++ b .check_dec${standard} ++ ++ ++// Determine whether encryption or decryption is required. ++// The last two tweaks need to be swapped for decryption. ++.check_dec${standard}: ++ // encryption:1 decryption:0 ++ cmp $enc,1 ++ b.eq .prcess_last_2blks${standard} ++ mov $vTmp0.16B,@tweak[1].16b ++ mov @tweak[1].16B,@tweak[2].16b ++ mov @tweak[2].16B,$vTmp0.16b ++ ++.prcess_last_2blks${standard}: ++___ ++ &rev32_armeb(@tweak[1],@tweak[1]); ++ &rev32_armeb(@tweak[2],@tweak[2]); ++$code.=<<___; ++ ld1 {@dat[0].4s},[$inp],#16 ++ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b ++ st1 {@dat[0].4s},[$out],#16 ++ ++ sub $lastBlk,$out,16 ++ .loop${standard}: ++ subs $remain,$remain,1 ++ ldrb w$tmp0,[$lastBlk,$remain] ++ ldrb w$tmp1,[$inp,$remain] ++ strb w$tmp1,[$lastBlk,$remain] ++ strb w$tmp0,[$out,$remain] ++ b.gt .loop${standard} ++ ld1 {@dat[0].4s}, [$lastBlk] ++ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b ++ st1 {@dat[0].4s}, [$lastBlk] ++99: ++ ret ++.size ${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard} ++___ ++} #end of gen_xts_do_cipher ++ ++}}} ++ ++{{{ ++my ($enc)=("w6"); ++ ++sub gen_xts_cipher() { ++ my $en = shift; ++$code.=<<___; ++.globl ${prefix}_xts_${en}crypt${standard} ++.type ${prefix}_xts_${en}crypt${standard},%function ++.align 5 ++${prefix}_xts_${en}crypt${standard}: ++ stp x15, x16, [sp, #-0x10]! ++ stp x17, x18, [sp, #-0x10]! ++ stp x19, x20, [sp, #-0x10]! ++ stp x21, x22, [sp, #-0x10]! ++ stp x23, x24, [sp, #-0x10]! ++ stp x25, x26, [sp, #-0x10]! ++ stp x27, x28, [sp, #-0x10]! ++ stp x29, x30, [sp, #-0x10]! ++ stp d8, d9, [sp, #-0x10]! ++ stp d10, d11, [sp, #-0x10]! ++ stp d12, d13, [sp, #-0x10]! ++ stp d14, d15, [sp, #-0x10]! 
++___ ++ &mov_en_to_enc($en,$enc); ++$code.=<<___; ++ bl ${prefix}_xts_do_cipher${standard} ++ ldp d14, d15, [sp], #0x10 ++ ldp d12, d13, [sp], #0x10 ++ ldp d10, d11, [sp], #0x10 ++ ldp d8, d9, [sp], #0x10 ++ ldp x29, x30, [sp], #0x10 ++ ldp x27, x28, [sp], #0x10 ++ ldp x25, x26, [sp], #0x10 ++ ldp x23, x24, [sp], #0x10 ++ ldp x21, x22, [sp], #0x10 ++ ldp x19, x20, [sp], #0x10 ++ ldp x17, x18, [sp], #0x10 ++ ldp x15, x16, [sp], #0x10 ++ ret ++.size ${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard} ++___ ++ ++} # end of gen_xts_cipher ++$standard="_gb"; ++&gen_xts_do_cipher(); ++&gen_xts_cipher("en"); ++&gen_xts_cipher("de"); ++$standard=""; ++&gen_xts_do_cipher(); ++&gen_xts_cipher("en"); ++&gen_xts_cipher("de"); ++}}} + ######################################## + { my %opcode = ( + "sm4e" => 0xcec08400, +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +index 2f5a6cf..0bde96f 100644 +--- a/include/crypto/sm4_platform.h ++++ b/include/crypto/sm4_platform.h +@@ -26,6 +26,10 @@ + # define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt + # define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt + # define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks ++# define HWSM4_xts_encrypt_gb sm4_v8_xts_encrypt_gb ++# define HWSM4_xts_decrypt_gb sm4_v8_xts_decrypt_gb ++# define HWSM4_xts_encrypt sm4_v8_xts_encrypt ++# define HWSM4_xts_decrypt sm4_v8_xts_decrypt + # endif + # endif + # endif /* OPENSSL_CPUID_OBJ */ +@@ -46,6 +50,16 @@ void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out, + void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + const unsigned char ivec[16]); ++/* xts mode in GB/T 17964-2021 */ ++void HWSM4_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++void HWSM4_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++/* xts mode in IEEE Std 1619-2007 */ ++void HWSM4_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++void HWSM4_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); + # endif /* HWSM4_CAPABLE */ + + #ifdef VPSM4_EX_CAPABLE +-- +2.36.1 + diff --git a/Feature-add-ARMv8-implementations-of-SM4-in-ECB-and-XTS.patch b/Feature-add-ARMv8-implementations-of-SM4-in-ECB-and-XTS.patch new file mode 100644 index 0000000000000000000000000000000000000000..1deb0abab8b714390966fc5c9bf2c95c2e990067 --- /dev/null +++ b/Feature-add-ARMv8-implementations-of-SM4-in-ECB-and-XTS.patch @@ -0,0 +1,2225 @@ +From df56c1da16d705fb6f471651feb77a69171af9e3 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou +Date: Wed, 19 Oct 2022 13:28:58 +0800 +Subject: [PATCH] add ARMv8 implementations of SM4 in ECB and XTS + +--- + Configurations/00-base-templates.conf | 1 + + Configure | 4 + + crypto/evp/c_allc.c | 1 + + crypto/evp/e_sm4.c | 352 ++++++- + crypto/modes/build.info | 2 +- + crypto/modes/xts128gb.c | 204 ++++ + crypto/objects/obj_dat.h | 15 +- + crypto/objects/obj_mac.num | 1 + + crypto/objects/objects.txt | 1 + + crypto/sm4/asm/vpsm4_ex-armv8.pl | 1173 +++++++++++++++++++++ + crypto/sm4/build.info | 5 +- + doc/man3/EVP_sm4_xts.pod | 67 ++ + fuzz/oids.txt | 1 + + include/openssl/evp.h | 4 + + include/openssl/modes.h | 9 + + include/openssl/obj_mac.h | 5 
+ + test/evp_test.c | 17 +- + test/recipes/30-test_evp_data/evpciph.txt | 22 + + util/libcrypto.num | 2 + + 19 files changed, 1832 insertions(+), 54 deletions(-) + create mode 100644 crypto/modes/xts128gb.c + create mode 100644 crypto/sm4/asm/vpsm4_ex-armv8.pl + create mode 100644 doc/man3/EVP_sm4_xts.pod + +diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf +index e01dc63..1d35012 100644 +--- a/Configurations/00-base-templates.conf ++++ b/Configurations/00-base-templates.conf +@@ -321,6 +321,7 @@ my %targets=( + chacha_asm_src => "chacha-armv8.S", + poly1305_asm_src=> "poly1305-armv8.S", + keccak1600_asm_src => "keccak1600-armv8.S", ++ sm4_asm_src => "vpsm4_ex-armv8.S", + }, + parisc11_asm => { + template => 1, +diff --git a/Configure b/Configure +index a41c897..3bfe360 100755 +--- a/Configure ++++ b/Configure +@@ -1420,6 +1420,9 @@ unless ($disabled{asm}) { + if ($target{poly1305_asm_src} ne "") { + push @{$config{lib_defines}}, "POLY1305_ASM"; + } ++ if ($target{sm4_asm_src} ne "") { ++ push @{$config{lib_defines}}, "VPSM4_EX_ASM"; ++ } + } + + my %predefined_C = compiler_predefined($config{CROSS_COMPILE}.$config{CC}); +@@ -3375,6 +3378,7 @@ sub print_table_entry + "mtoutflag", + "multilib", + "build_scheme", ++ "sm4_asm_src", + ); + + if ($type eq "TABLE") { +diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c +index 22fdcc4..01b0d1f 100644 +--- a/crypto/evp/c_allc.c ++++ b/crypto/evp/c_allc.c +@@ -85,6 +85,7 @@ void openssl_add_all_ciphers_int(void) + EVP_add_cipher(EVP_sm4_cfb()); + EVP_add_cipher(EVP_sm4_ofb()); + EVP_add_cipher(EVP_sm4_ctr()); ++ EVP_add_cipher(EVP_sm4_xts()); + EVP_add_cipher_alias(SN_sm4_cbc, "SM4"); + EVP_add_cipher_alias(SN_sm4_cbc, "sm4"); + #endif +diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c +index fce3279..169d6c7 100644 +--- a/crypto/evp/e_sm4.c ++++ b/crypto/evp/e_sm4.c +@@ -15,86 +15,346 @@ + # include + # include "crypto/sm4.h" + # include "crypto/evp.h" ++# include "evp_local.h" ++# include "modes_local.h" ++ ++#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__)) ++# include "arm_arch.h" ++# if __ARM_MAX_ARCH__>=7 ++# if defined(VPSM4_EX_ASM) ++# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) ++# endif ++# endif ++#endif + + typedef struct { +- SM4_KEY ks; ++ union { ++ double align; ++ SM4_KEY ks; ++ } ks; ++ block128_f block; ++ union { ++ ecb128_f ecb; ++ } stream; + } EVP_SM4_KEY; + ++#ifdef VPSM4_EX_CAPABLE ++void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); ++void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); ++#define vpsm4_ex_encrypt SM4_encrypt ++#define vpsm4_ex_decrypt SM4_encrypt ++void vpsm4_ex_ecb_encrypt( ++ const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key, const int enc); ++/* xts mode in GB/T 17964-2021 */ ++void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++/* xts mode in IEEE Std 1619-2007 */ ++void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t iv[16]); ++void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, ++ const SM4_KEY *key2, const uint8_t 
iv[16]); ++#endif ++ ++# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \ ++static const EVP_CIPHER sm4_##mode = { \ ++ nid##_##nmode,blocksize,128/8,ivlen, \ ++ flags|EVP_CIPH_##MODE##_MODE, \ ++ sm4_init_key, \ ++ sm4_##mode##_cipher, \ ++ NULL, \ ++ sizeof(EVP_SM4_KEY), \ ++ NULL,NULL,NULL,NULL }; \ ++const EVP_CIPHER *EVP_sm4_##mode(void) \ ++{ return &sm4_##mode; } ++ ++#define BLOCK_CIPHER_generic_pack(nid,flags) \ ++ BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ ++ BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ ++ BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ ++ BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ ++ BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags) ++ + static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) + { +- SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); ++ int mode; ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx); ++ ++ mode = EVP_CIPHER_CTX_mode(ctx); ++ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) { ++#ifdef VPSM4_EX_CAPABLE ++ if (VPSM4_EX_CAPABLE) { ++ vpsm4_ex_set_decrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) vpsm4_ex_decrypt; ++ if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) vpsm4_ex_ecb_encrypt; ++ } else ++#endif ++ { ++ dat->block = (block128_f)SM4_decrypt; ++ SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); ++ } ++ } else { ++#ifdef VPSM4_EX_CAPABLE ++ if (VPSM4_EX_CAPABLE) { ++ vpsm4_ex_set_encrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) vpsm4_ex_encrypt; ++ if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) vpsm4_ex_ecb_encrypt; ++ } else ++#endif ++ { ++ dat->block = (block128_f)SM4_encrypt; ++ SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); ++ } ++ } + return 1; + } + +-static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out, +- size_t len, const SM4_KEY *key, +- unsigned char *ivec, const int enc) ++static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len) + { +- if (enc) +- CRYPTO_cbc128_encrypt(in, out, len, key, ivec, +- (block128_f)SM4_encrypt); ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); ++ ++ if (EVP_CIPHER_CTX_encrypting(ctx)) ++ CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, ++ EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); + else +- CRYPTO_cbc128_decrypt(in, out, len, key, ivec, +- (block128_f)SM4_decrypt); ++ CRYPTO_cbc128_decrypt(in, out, len, &dat->ks.ks, ++ EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); ++ return 1; + } + +-static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out, +- size_t length, const SM4_KEY *key, +- unsigned char *ivec, int *num, const int enc) ++static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len) + { +- CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc, +- (block128_f)SM4_encrypt); ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); ++ int num = EVP_CIPHER_CTX_num(ctx); ++ ++ CRYPTO_cfb128_encrypt(in, out, len, &dat->ks.ks, ++ ctx->iv, &num, ++ EVP_CIPHER_CTX_encrypting(ctx), dat->block); ++ EVP_CIPHER_CTX_set_num(ctx, num); ++ ++ return 1; + } + +-static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, +- const SM4_KEY *key, const int enc) ++static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t 
len) + { +- if (enc) +- SM4_encrypt(in, out, key); ++ size_t bl = EVP_CIPHER_CTX_block_size(ctx); ++ size_t i; ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); ++ ++ if (len < bl){ ++ return 1; ++ } ++ if (dat->stream.ecb != NULL) ++ (*dat->stream.ecb) (in, out, len, &dat->ks.ks, ++ EVP_CIPHER_CTX_encrypting(ctx)); + else +- SM4_decrypt(in, out, key); ++ for (i = 0, len -= bl; i <= len; i += bl) ++ (*dat->block) (in + i, out + i, &dat->ks.ks); ++ return 1; + } + +-static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out, +- size_t length, const SM4_KEY *key, +- unsigned char *ivec, int *num) ++static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len) + { +- CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num, +- (block128_f)SM4_encrypt); +-} ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); ++ int num = EVP_CIPHER_CTX_num(ctx); + +-IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4, +- 16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1, +- sm4_init_key, 0, 0, 0, 0) ++ CRYPTO_ofb128_encrypt(in, out, len, &dat->ks.ks, ++ ctx->iv, &num, dat->block); ++ EVP_CIPHER_CTX_set_num(ctx, num); ++ return 1; ++} + + static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { +- unsigned int num = EVP_CIPHER_CTX_num(ctx); +- EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx); ++ int n = EVP_CIPHER_CTX_num(ctx); ++ unsigned int num; ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); ++ ++ if (n < 0) ++ return 0; ++ num = (unsigned int)n; + +- CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, +- EVP_CIPHER_CTX_iv_noconst(ctx), +- EVP_CIPHER_CTX_buf_noconst(ctx), &num, +- (block128_f)SM4_encrypt); ++ CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, ++ ctx->iv, ++ EVP_CIPHER_CTX_buf_noconst(ctx), &num, ++ dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + return 1; + } + +-static const EVP_CIPHER sm4_ctr_mode = { +- NID_sm4_ctr, 1, 16, 16, +- EVP_CIPH_CTR_MODE, +- sm4_init_key, +- sm4_ctr_cipher, +- NULL, +- sizeof(EVP_SM4_KEY), +- NULL, NULL, NULL, NULL +-}; ++BLOCK_CIPHER_generic_pack(NID_sm4, 0) + +-const EVP_CIPHER *EVP_sm4_ctr(void) ++typedef struct { ++ union { ++ double align; ++ SM4_KEY ks; ++ } ks1, ks2; /* sm4 key schedules to use */ ++ XTS128_CONTEXT xts; ++ int std; /* 0 for xts mode in GB/T 17964-2021 */ ++ /* 1 for xts mode in IEEE Std 1619-2007 */ ++ void (*stream_gb) (const unsigned char *in, ++ unsigned char *out, size_t length, ++ const SM4_KEY *key1, const SM4_KEY *key2, ++ const unsigned char iv[16]); /* stream for xts mode in GB/T 17964-2021 */ ++ void (*stream) (const unsigned char *in, ++ unsigned char *out, size_t length, ++ const SM4_KEY *key1, const SM4_KEY *key2, ++ const unsigned char iv[16]); /* stream for xts mode in IEEE Std 1619-2007 */ ++} EVP_SM4_XTS_CTX; ++ ++static int sm4_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) ++{ ++ EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX, c); ++ ++ if (type == EVP_CTRL_COPY) { ++ EVP_CIPHER_CTX *out = ptr; ++ EVP_SM4_XTS_CTX *xctx_out = EVP_C_DATA(EVP_SM4_XTS_CTX,out); ++ ++ if (xctx->xts.key1) { ++ if (xctx->xts.key1 != &xctx->ks1) ++ return 0; ++ xctx_out->xts.key1 = &xctx_out->ks1; ++ } ++ if (xctx->xts.key2) { ++ if (xctx->xts.key2 != &xctx->ks2) ++ return 0; ++ xctx_out->xts.key2 = &xctx_out->ks2; ++ } ++ return 1; ++ } else if (type == EVP_CTRL_XTS_STANDARD) { ++ if ((arg < 0) || (arg > 1)) ++ return 0; ++ xctx->std = arg; ++ return 1; ++ } else if (type != EVP_CTRL_INIT) ++ return -1; ++ /* key1 and 
key2 are used as an indicator both key and IV are set */ ++ xctx->xts.key1 = NULL; ++ xctx->xts.key2 = NULL; ++ return 1; ++} ++ ++static int sm4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, ++ const unsigned char *iv, int enc) ++{ ++ EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX,ctx); ++ ++ if (!iv && !key) ++ return 1; ++ ++ if (key) ++ do { ++ /* The key is two half length keys in reality */ ++ const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2; ++ xctx->stream_gb = NULL; ++ xctx->stream = NULL; ++#ifdef VPSM4_EX_CAPABLE ++ if (VPSM4_EX_CAPABLE) { ++ if (enc) { ++ vpsm4_ex_set_encrypt_key(key, &xctx->ks1.ks); ++ xctx->xts.block1 = (block128_f) vpsm4_ex_encrypt; ++ xctx->stream_gb = vpsm4_ex_xts_encrypt_gb; ++ xctx->stream = vpsm4_ex_xts_encrypt; ++ } else { ++ vpsm4_ex_set_decrypt_key(key, &xctx->ks1.ks); ++ xctx->xts.block1 = (block128_f) vpsm4_ex_decrypt; ++ xctx->stream_gb = vpsm4_ex_xts_decrypt_gb; ++ xctx->stream = vpsm4_ex_xts_decrypt; ++ } ++ vpsm4_ex_set_encrypt_key(key + bytes, &xctx->ks2.ks); ++ xctx->xts.block2 = (block128_f) vpsm4_ex_encrypt; ++ ++ xctx->xts.key1 = &xctx->ks1; ++ break; ++ } else ++#endif ++ (void)0; /* terminate potentially open 'else' */ ++ ++ if (enc) { ++ SM4_set_key(key, &xctx->ks1.ks); ++ xctx->xts.block1 = (block128_f) SM4_encrypt; ++ } else { ++ SM4_set_key(key, &xctx->ks1.ks); ++ xctx->xts.block1 = (block128_f) SM4_decrypt; ++ } ++ ++ SM4_set_key(key + bytes, &xctx->ks2.ks); ++ xctx->xts.block2 = (block128_f) SM4_encrypt; ++ ++ xctx->xts.key1 = &xctx->ks1; ++ } while (0); ++ ++ if (iv) { ++ xctx->xts.key2 = &xctx->ks2; ++ memcpy(EVP_CIPHER_CTX_iv_noconst(ctx), iv, 16); ++ } ++ ++ return 1; ++} ++ ++static int sm4_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len) ++{ ++ EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX,ctx); ++ if (!xctx->xts.key1 || !xctx->xts.key2) ++ return 0; ++ if (!out || !in || len < SM4_BLOCK_SIZE) ++ return 0; ++ if (xctx->std) { ++ if (xctx->stream) ++ (*xctx->stream) (in, out, len, ++ xctx->xts.key1, xctx->xts.key2, ++ EVP_CIPHER_CTX_iv_noconst(ctx)); ++ else if (CRYPTO_xts128_encrypt(&xctx->xts, EVP_CIPHER_CTX_iv_noconst(ctx), ++ in, out, len, ++ EVP_CIPHER_CTX_encrypting(ctx))) ++ return 0; ++ } else { ++ if (xctx->stream_gb) ++ (*xctx->stream_gb) (in, out, len, ++ xctx->xts.key1, xctx->xts.key2, ++ EVP_CIPHER_CTX_iv_noconst(ctx)); ++ else if (CRYPTO_xts128gb_encrypt(&xctx->xts, EVP_CIPHER_CTX_iv_noconst(ctx), ++ in, out, len, ++ EVP_CIPHER_CTX_encrypting(ctx))) ++ return 0; ++ } ++ return 1; ++} ++ ++#define SM4_XTS_BLOCK_SIZE 1 ++#define SM4_XTS_IV_LENGTH 16 ++#define SM4_XTS_KEY_LENGTH 32 ++ ++#define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \ ++ | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \ ++ | EVP_CIPH_CUSTOM_COPY | EVP_CIPH_XTS_MODE) ++ ++static const EVP_CIPHER sm4_xts_mode = { ++ NID_sm4_xts, ++ SM4_XTS_BLOCK_SIZE, ++ SM4_XTS_KEY_LENGTH, ++ SM4_XTS_IV_LENGTH, ++ XTS_FLAGS, ++ sm4_xts_init_key, ++ sm4_xts_cipher, ++ NULL, ++ sizeof(EVP_SM4_XTS_CTX), ++ NULL, NULL, sm4_xts_ctrl, NULL ++}; ++ ++const EVP_CIPHER *EVP_sm4_xts(void) + { +- return &sm4_ctr_mode; ++ return &sm4_xts_mode; + } + + #endif +diff --git a/crypto/modes/build.info b/crypto/modes/build.info +index 821340e..f974b04 100644 +--- a/crypto/modes/build.info ++++ b/crypto/modes/build.info +@@ -1,7 +1,7 @@ + LIBS=../../libcrypto + SOURCE[../../libcrypto]=\ + cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \ +- ccm128.c xts128.c wrap128.c ocb128.c \ ++ ccm128.c 
xts128.c xts128gb.c wrap128.c ocb128.c \ + {- $target{modes_asm_src} -} + + INCLUDE[gcm128.o]=.. +diff --git a/crypto/modes/xts128gb.c b/crypto/modes/xts128gb.c +new file mode 100644 +index 0000000..8f57cc5 +--- /dev/null ++++ b/crypto/modes/xts128gb.c +@@ -0,0 +1,204 @@ ++/* ++ * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the OpenSSL license (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++// This is the xts mode in GB/T 17964-2021 ++#include ++#include "modes_local.h" ++#include ++ ++#ifndef STRICT_ALIGNMENT ++# ifdef __GNUC__ ++typedef u64 u64_a1 __attribute((__aligned__(1))); ++# else ++typedef u64 u64_a1; ++# endif ++#endif ++ ++int CRYPTO_xts128gb_encrypt(const XTS128_CONTEXT *ctx, ++ const unsigned char iv[16], ++ const unsigned char *inp, unsigned char *out, ++ size_t len, int enc) ++{ ++ const union { ++ long one; ++ char little; ++ } is_endian = { ++ 1 ++ }; ++ union { ++ u64 u[2]; ++ u32 d[4]; ++ u8 c[16]; ++ } tweak, scratch; ++ unsigned int i; ++ ++ if (len < 16) ++ return -1; ++ ++ memcpy(tweak.c, iv, 16); ++ ++ (*ctx->block2) (tweak.c, tweak.c, ctx->key2); ++ ++ if (!enc && (len % 16)) ++ len -= 16; ++ ++ while (len >= 16) { ++#if defined(STRICT_ALIGNMENT) ++ memcpy(scratch.c, inp, 16); ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++#else ++ scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0]; ++ scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1]; ++#endif ++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1); ++#if defined(STRICT_ALIGNMENT) ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++ memcpy(out, scratch.c, 16); ++#else ++ ((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0]; ++ ((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1]; ++#endif ++ inp += 16; ++ out += 16; ++ len -= 16; ++ ++ if (len == 0) ++ return 0; ++ ++ if (is_endian.little) { ++ u8 res; ++ u64 hi, lo; ++#ifdef BSWAP8 ++ hi = BSWAP8(tweak.u[0]); ++ lo = BSWAP8(tweak.u[1]); ++#else ++ u8 *p = tweak.c; ++ ++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4); ++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); ++#endif ++ res = (u8)lo & 1; ++ tweak.u[0] = (lo >> 1) | (hi << 63); ++ tweak.u[1] = hi >> 1; ++ if (res) ++ tweak.c[15] ^= 0xe1; ++#ifdef BSWAP8 ++ hi = BSWAP8(tweak.u[0]); ++ lo = BSWAP8(tweak.u[1]); ++#else ++ p = tweak.c; ++ ++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4); ++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); ++#endif ++ tweak.u[0] = lo; ++ tweak.u[1] = hi; ++ } else { ++ u8 Cin, Cout; ++ Cin = 0; ++ for (i = 0; i < 16; ++i) { ++ Cout = (tweak.c[i] << 7) & 0x80; ++ tweak.c[i] = ((tweak.c[i] >> 1) + Cin) & 0xff; ++ Cin = Cout; ++ } ++ if (Cout) ++ tweak.c[0] ^= 0xe1; ++ } ++ } ++ if (enc) { ++ for (i = 0; i < len; ++i) { ++ u8 c = inp[i]; ++ out[i] = scratch.c[i]; ++ scratch.c[i] = c; ++ } ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1); ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++ memcpy(out - 16, scratch.c, 16); ++ } else { ++ union { ++ u64 u[2]; ++ u8 c[16]; ++ } tweak1; ++ ++ if (is_endian.little) { ++ u8 res; ++ u64 hi, lo; ++#ifdef BSWAP8 ++ hi = BSWAP8(tweak.u[0]); ++ lo = BSWAP8(tweak.u[1]); ++#else ++ u8 *p = tweak.c; ++ ++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4); ++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); ++#endif ++ res = (u8)lo & 1; ++ tweak1.u[0] = 
(lo >> 1) | (hi << 63); ++ tweak1.u[1] = hi >> 1; ++ if (res) ++ tweak1.c[15] ^= 0xe1; ++#ifdef BSWAP8 ++ hi = BSWAP8(tweak1.u[0]); ++ lo = BSWAP8(tweak1.u[1]); ++#else ++ p = tweak1.c; ++ ++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4); ++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); ++#endif ++ tweak1.u[0] = lo; ++ tweak1.u[1] = hi; ++ } else { ++ u8 Cin, Cout; ++ Cin = 0; ++ for ( i = 0; i < 16; ++i ) { ++ Cout = (tweak.c[i] << 7) & 0x80; ++ tweak1.c[i] = ((tweak.c[i] >> 1) + Cin) & 0xff; ++ Cin = Cout; ++ } ++ if (Cout) ++ tweak1.c[0] ^= 0xe1; ++ } ++#if defined(STRICT_ALIGNMENT) ++ memcpy(scratch.c, inp, 16); ++ scratch.u[0] ^= tweak1.u[0]; ++ scratch.u[1] ^= tweak1.u[1]; ++#else ++ scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0]; ++ scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1]; ++#endif ++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1); ++ scratch.u[0] ^= tweak1.u[0]; ++ scratch.u[1] ^= tweak1.u[1]; ++ ++ for (i = 0; i < len; ++i) { ++ u8 c = inp[16 + i]; ++ out[16 + i] = scratch.c[i]; ++ scratch.c[i] = c; ++ } ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1); ++#if defined(STRICT_ALIGNMENT) ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++ memcpy(out, scratch.c, 16); ++#else ++ ((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0]; ++ ((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1]; ++#endif ++ } ++ ++ return 0; ++} +diff --git a/crypto/objects/obj_dat.h b/crypto/objects/obj_dat.h +index eb4cce4..6d60f87 100644 +--- a/crypto/objects/obj_dat.h ++++ b/crypto/objects/obj_dat.h +@@ -10,7 +10,7 @@ + */ + + /* Serialized OID's */ +-static const unsigned char so[7770] = { ++static const unsigned char so[7778] = { + 0x2A,0x86,0x48,0x86,0xF7,0x0D, /* [ 0] OBJ_rsadsi */ + 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01, /* [ 6] OBJ_pkcs */ + 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x02, /* [ 13] OBJ_md2 */ +@@ -1077,9 +1077,10 @@ static const unsigned char so[7770] = { + 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x0C, /* [ 7745] OBJ_hmacWithSHA512_224 */ + 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x0D, /* [ 7753] OBJ_hmacWithSHA512_256 */ + 0x2A,0x81,0x1C,0xCF,0x55,0x01,0x83,0x75, /* [ 7761] OBJ_SM2_with_SM3 */ ++ 0x2A,0x81,0x1C,0xCF,0x55,0x01,0x68,0x0A, /* [ 7769] OBJ_sm4_xts */ + }; + +-#define NUM_NID 1196 ++#define NUM_NID 1197 + static const ASN1_OBJECT nid_objs[NUM_NID] = { + {"UNDEF", "undefined", NID_undef}, + {"rsadsi", "RSA Data Security, Inc.", NID_rsadsi, 6, &so[0]}, +@@ -2277,9 +2278,10 @@ static const ASN1_OBJECT nid_objs[NUM_NID] = { + {"hmacWithSHA512-224", "hmacWithSHA512-224", NID_hmacWithSHA512_224, 8, &so[7745]}, + {"hmacWithSHA512-256", "hmacWithSHA512-256", NID_hmacWithSHA512_256, 8, &so[7753]}, + {"SM2-SM3", "SM2-with-SM3", NID_SM2_with_SM3, 8, &so[7761]}, ++ {"SM4-XTS", "sm4-xts", NID_sm4_xts, 8, &so[7769]}, + }; + +-#define NUM_SN 1187 ++#define NUM_SN 1188 + static const unsigned int sn_objs[NUM_SN] = { + 364, /* "AD_DVCS" */ + 419, /* "AES-128-CBC" */ +@@ -2554,6 +2556,7 @@ static const unsigned int sn_objs[NUM_SN] = { + 1139, /* "SM4-CTR" */ + 1133, /* "SM4-ECB" */ + 1135, /* "SM4-OFB" */ ++ 1196, /* "SM4-XTS" */ + 188, /* "SMIME" */ + 167, /* "SMIME-CAPS" */ + 100, /* "SN" */ +@@ -3470,7 +3473,7 @@ static const unsigned int sn_objs[NUM_SN] = { + 1093, /* "x509ExtAdmission" */ + }; + +-#define NUM_LN 1187 ++#define NUM_LN 1188 + static const unsigned int ln_objs[NUM_LN] = { + 363, /* "AD Time Stamping" */ + 405, /* "ANSI X9.62" */ +@@ -4613,6 +4616,7 @@ static const unsigned int ln_objs[NUM_LN] = { + 1139, /* "sm4-ctr" 
*/ + 1133, /* "sm4-ecb" */ + 1135, /* "sm4-ofb" */ ++ 1196, /* "sm4-xts" */ + 16, /* "stateOrProvinceName" */ + 660, /* "streetAddress" */ + 498, /* "subtreeMaximumQuality" */ +@@ -4661,7 +4665,7 @@ static const unsigned int ln_objs[NUM_LN] = { + 125, /* "zlib compression" */ + }; + +-#define NUM_OBJ 1072 ++#define NUM_OBJ 1073 + static const unsigned int obj_objs[NUM_OBJ] = { + 0, /* OBJ_undef 0 */ + 181, /* OBJ_iso 1 */ +@@ -5128,6 +5132,7 @@ static const unsigned int obj_objs[NUM_OBJ] = { + 1136, /* OBJ_sm4_cfb1 1 2 156 10197 1 104 5 */ + 1138, /* OBJ_sm4_cfb8 1 2 156 10197 1 104 6 */ + 1139, /* OBJ_sm4_ctr 1 2 156 10197 1 104 7 */ ++ 1196, /* OBJ_sm4_xts 1 2 156 10197 1 104 10 */ + 1172, /* OBJ_sm2 1 2 156 10197 1 301 */ + 1143, /* OBJ_sm3 1 2 156 10197 1 401 */ + 1195, /* OBJ_SM2_with_SM3 1 2 156 10197 1 501 */ +diff --git a/crypto/objects/obj_mac.num b/crypto/objects/obj_mac.num +index 8b797b0..77ad385 100644 +--- a/crypto/objects/obj_mac.num ++++ b/crypto/objects/obj_mac.num +@@ -1193,3 +1193,4 @@ magma_mac 1192 + hmacWithSHA512_224 1193 + hmacWithSHA512_256 1194 + SM2_with_SM3 1195 ++sm4_xts 1196 +diff --git a/crypto/objects/objects.txt b/crypto/objects/objects.txt +index be9da47..5713fae 100644 +--- a/crypto/objects/objects.txt ++++ b/crypto/objects/objects.txt +@@ -1520,6 +1520,7 @@ sm-scheme 104 4 : SM4-CFB : sm4-cfb + sm-scheme 104 5 : SM4-CFB1 : sm4-cfb1 + sm-scheme 104 6 : SM4-CFB8 : sm4-cfb8 + sm-scheme 104 7 : SM4-CTR : sm4-ctr ++sm-scheme 104 10 : SM4-XTS : sm4-xts + + # There is no OID that just denotes "HMAC" oddly enough... + +diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl +new file mode 100644 +index 0000000..86a6f89 +--- /dev/null ++++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl +@@ -0,0 +1,1173 @@ ++#! /usr/bin/env perl ++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# This module implements SM4 with ASIMD and AESE on AARCH64 ++# ++# Feb 2022 ++# ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++$prefix="vpsm4_ex"; ++my ($inp,$outp,$rks1,$rks2,$ivp,$enc)=("x0","x1","x3","x4","x5","x6"); ++my ($blocks,$len)=("x2","x2"); ++my $remain=("x7"); ++my ($ptr,$counter)=("x12","w13"); ++my ($wtmp0,$wtmp1,$wtmp2,$wtmp3)=("w8","w9","w10","w11"); ++my ($xtmp0,$xtmp1,$xtmp2,$xtmp3)=("x8","x9","x10","x11"); ++my ($word0,$word1,$word2,$word3)=("w14","w15","w16","w17"); ++my @twx=map("x$_",(14..29)); ++my $lastBlk=("x26"); ++ ++my @tweak=map("v$_",(0..7)); ++my @qtmp=map("q$_",(8..11)); ++my @vtmp=map("v$_",(8..11)); ++my ($rk0,$rk1)=("v12","v13"); ++my ($rka,$rkb)=("v14","v15"); ++my @data=map("v$_",(16..19)); ++my @datax=map("v$_",(20..23)); ++my ($vtmp4,$vtmp5)=("v24","v25"); ++my $lastTweak=("v25"); ++my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31"); ++my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31"); ++ ++sub rev32() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifndef __ARMEB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifndef __ARMEB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ ++sub rev32_armeb() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifdef __ARMEB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifdef __ARMEB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ ++sub transpose() { ++ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; ++ ++$code.=<<___; ++ zip1 $vt0.4s,$dat0.4s,$dat1.4s ++ zip2 $vt1.4s,$dat0.4s,$dat1.4s ++ zip1 $vt2.4s,$dat2.4s,$dat3.4s ++ zip2 $vt3.4s,$dat2.4s,$dat3.4s ++ zip1 $dat0.2d,$vt0.2d,$vt2.2d ++ zip2 $dat1.2d,$vt0.2d,$vt2.2d ++ zip1 $dat2.2d,$vt1.2d,$vt3.2d ++ zip2 $dat3.2d,$vt1.2d,$vt3.2d ++___ ++} ++ ++sub load_sbox_matrix () { ++$code.=<<___; ++ ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00 ++ ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00 ++ ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923 ++ ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300 ++ ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b ++ ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f ++___ ++} ++# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) ++sub mul_matrix() { ++ my $x = shift; ++ my $higherMat = shift; ++ my $lowerMat = shift; ++ my $tmp = shift; ++$code.=<<___; ++ ushr $tmp.16b, $x.16b, 4 ++ and $x.16b, $x.16b, $ANDMaskV.16b ++ tbl $x.16b, {$lowerMat.16b}, $x.16b ++ tbl $tmp.16b, {$higherMat.16b}, $tmp.16b ++ eor $x.16b, $x.16b, $tmp.16b ++___ ++} ++ ++# sbox operation for one single word ++sub sbox_1word () { ++ my $word = shift; ++ ++$code.=<<___; ++ mov @vtmp[3].s[0],$word ++ // optimize sbox using AESE instruction ++ tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); ++$code.=<<___; ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b ++ aese @vtmp[0].16b,@vtmp[1].16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); ++$code.=<<___; ++ ++ mov $wtmp0,@vtmp[0].s[0] ++ eor $word,$wtmp0,$wtmp0,ror #32-2 ++ eor $word,$word,$wtmp0,ror #32-10 ++ 
eor $word,$word,$wtmp0,ror #32-18 ++ eor $word,$word,$wtmp0,ror #32-24 ++___ ++} ++ ++# sbox operation for 4-lane of words ++sub sbox() { ++ my $dat = shift; ++ ++$code.=<<___; ++ // optimize sbox using AESE instruction ++ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); ++$code.=<<___; ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b ++ aese @vtmp[0].16b,@vtmp[1].16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4); ++$code.=<<___; ++ mov $dat.16b,@vtmp[0].16b ++ ++ // linear transformation ++ ushr @vtmp[0].4s,$dat.4s,32-2 ++ ushr @vtmp[1].4s,$dat.4s,32-10 ++ ushr @vtmp[2].4s,$dat.4s,32-18 ++ ushr @vtmp[3].4s,$dat.4s,32-24 ++ sli @vtmp[0].4s,$dat.4s,2 ++ sli @vtmp[1].4s,$dat.4s,10 ++ sli @vtmp[2].4s,$dat.4s,18 ++ sli @vtmp[3].4s,$dat.4s,24 ++ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b ++ eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b ++ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b ++ eor $dat.16b,$dat.16b,$vtmp4.16b ++___ ++} ++ ++# sbox operation for 8-lane of words ++sub sbox_double() { ++ my $dat = shift; ++ my $datx = shift; ++ ++$code.=<<___; ++ // optimize sbox using AESE instruction ++ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b ++ tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); ++ &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4); ++$code.=<<___; ++ eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b ++ aese @vtmp[0].16b,$vtmp5.16b ++ aese @vtmp[1].16b,$vtmp5.16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4); ++ &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4); ++$code.=<<___; ++ mov $dat.16b,@vtmp[0].16b ++ mov $datx.16b,@vtmp[1].16b ++ ++ // linear transformation ++ ushr @vtmp[0].4s,$dat.4s,32-2 ++ ushr $vtmp5.4s,$datx.4s,32-2 ++ ushr @vtmp[1].4s,$dat.4s,32-10 ++ ushr @vtmp[2].4s,$dat.4s,32-18 ++ ushr @vtmp[3].4s,$dat.4s,32-24 ++ sli @vtmp[0].4s,$dat.4s,2 ++ sli $vtmp5.4s,$datx.4s,2 ++ sli @vtmp[1].4s,$dat.4s,10 ++ sli @vtmp[2].4s,$dat.4s,18 ++ sli @vtmp[3].4s,$dat.4s,24 ++ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b ++ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b ++ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b ++ eor $dat.16b,$dat.16b,$vtmp4.16b ++ ushr @vtmp[1].4s,$datx.4s,32-10 ++ ushr @vtmp[2].4s,$datx.4s,32-18 ++ ushr @vtmp[3].4s,$datx.4s,32-24 ++ sli @vtmp[1].4s,$datx.4s,10 ++ sli @vtmp[2].4s,$datx.4s,18 ++ sli @vtmp[3].4s,$datx.4s,24 ++ eor $vtmp4.16b,$vtmp5.16b,$datx.16b ++ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b ++ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b ++ eor $datx.16b,$datx.16b,$vtmp4.16b ++___ ++} ++ ++# sm4 for one block of data, in scalar registers word0/word1/word2/word3 ++sub sm4_1blk () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ ++ eor $wtmp3,$word2,$word3 ++ eor $wtmp2,$wtmp0,$word1 ++ eor $wtmp3,$wtmp3,$wtmp2 ++___ ++ &sbox_1word($wtmp3); ++$code.=<<___; ++ eor $word0,$word0,$wtmp3 ++ /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ ++ eor $wtmp3,$word2,$word3 ++ eor $wtmp2,$word0,$wtmp1 ++ eor $wtmp3,$wtmp3,$wtmp2 ++___ ++ &sbox_1word($wtmp3); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor $word1,$word1,$wtmp3 ++ /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ ++ eor $wtmp3,$word0,$word1 ++ eor $wtmp2,$wtmp0,$word3 ++ eor $wtmp3,$wtmp3,$wtmp2 ++___ ++ &sbox_1word($wtmp3); ++$code.=<<___; ++ eor $word2,$word2,$wtmp3 ++ /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ ++ eor $wtmp3,$word0,$word1 ++ eor $wtmp2,$word2,$wtmp1 ++ eor $wtmp3,$wtmp3,$wtmp2 ++___ ++ &sbox_1word($wtmp3); ++$code.=<<___; ++ eor $word3,$word3,$wtmp3 ++___ ++} ++ ++# sm4 
for 4-lanes of data, in neon registers data0/data1/data2/data3 ++sub sm4_4blks () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ dup $rk0.4s,$wtmp0 ++ dup $rk1.4s,$wtmp1 ++ ++ /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ ++ eor $rka.16b,@data[2].16b,@data[3].16b ++ eor $rk0.16b,@data[1].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,$rk0.16b ++___ ++ &sbox($rk0); ++$code.=<<___; ++ eor @data[0].16b,@data[0].16b,$rk0.16b ++ ++ /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ ++ eor $rka.16b,$rka.16b,@data[0].16b ++ eor $rk1.16b,$rka.16b,$rk1.16b ++___ ++ &sbox($rk1); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor @data[1].16b,@data[1].16b,$rk1.16b ++ ++ dup $rk0.4s,$wtmp0 ++ dup $rk1.4s,$wtmp1 ++ ++ /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ ++ eor $rka.16b,@data[0].16b,@data[1].16b ++ eor $rk0.16b,@data[3].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,$rk0.16b ++___ ++ &sbox($rk0); ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,$rk0.16b ++ ++ /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ ++ eor $rka.16b,$rka.16b,@data[2].16b ++ eor $rk1.16b,$rka.16b,$rk1.16b ++___ ++ &sbox($rk1); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,$rk1.16b ++___ ++} ++ ++# sm4 for 8 lanes of data, in neon registers ++# data0/data1/data2/data3 datax0/datax1/datax2/datax3 ++sub sm4_8blks () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ ++ dup $rk0.4s,$wtmp0 ++ eor $rka.16b,@data[2].16b,@data[3].16b ++ eor $rkb.16b,@datax[2].16b,@datax[3].16b ++ eor @vtmp[0].16b,@data[1].16b,$rk0.16b ++ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,@vtmp[0].16b ++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[0].16b,@data[0].16b,$rk0.16b ++ eor @datax[0].16b,@datax[0].16b,$rk1.16b ++ ++ /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ ++ dup $rk1.4s,$wtmp1 ++ eor $rka.16b,$rka.16b,@data[0].16b ++ eor $rkb.16b,$rkb.16b,@datax[0].16b ++ eor $rk0.16b,$rka.16b,$rk1.16b ++ eor $rk1.16b,$rkb.16b,$rk1.16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor @data[1].16b,@data[1].16b,$rk0.16b ++ eor @datax[1].16b,@datax[1].16b,$rk1.16b ++ ++ /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ ++ dup $rk0.4s,$wtmp0 ++ eor $rka.16b,@data[0].16b,@data[1].16b ++ eor $rkb.16b,@datax[0].16b,@datax[1].16b ++ eor @vtmp[0].16b,@data[3].16b,$rk0.16b ++ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,@vtmp[0].16b ++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,$rk0.16b ++ eor @datax[2].16b,@datax[2].16b,$rk1.16b ++ ++ /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ ++ dup $rk1.4s,$wtmp1 ++ eor $rka.16b,$rka.16b,@data[2].16b ++ eor $rkb.16b,$rkb.16b,@datax[2].16b ++ eor $rk0.16b,$rka.16b,$rk1.16b ++ eor $rk1.16b,$rkb.16b,$rk1.16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,$rk0.16b ++ eor @datax[3].16b,@datax[3].16b,$rk1.16b ++___ ++} ++ ++sub encrypt_1blk_norev() { ++ my $dat = shift; ++ my $rks = shift; ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++ mov $word0,$dat.s[0] ++ mov $word1,$dat.s[1] ++ mov $word2,$dat.s[2] ++ mov $word3,$dat.s[3] ++10: ++___ ++ &sm4_1blk($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++ mov $dat.s[0],$word3 ++ mov $dat.s[1],$word2 ++ mov $dat.s[2],$word1 ++ mov $dat.s[3],$word0 ++___ ++} ++ ++sub encrypt_1blk() { ++ my $dat = shift; ++ my $rks = shift; ++ ++ &encrypt_1blk_norev($dat,$rks); ++ &rev32($dat,$dat); ++} ++ ++sub encrypt_4blks() { 
++$code.=<<___; ++ mov $ptr,$rks1 ++ mov $counter,#8 ++10: ++___ ++ &sm4_4blks($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++___ ++ &rev32(@vtmp[3],@data[0]); ++ &rev32(@vtmp[2],@data[1]); ++ &rev32(@vtmp[1],@data[2]); ++ &rev32(@vtmp[0],@data[3]); ++} ++ ++sub encrypt_8blks() { ++ my $rks = shift; ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++10: ++___ ++ &sm4_8blks($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++___ ++ &rev32(@vtmp[3],@data[0]); ++ &rev32(@vtmp[2],@data[1]); ++ &rev32(@vtmp[1],@data[2]); ++ &rev32(@vtmp[0],@data[3]); ++ &rev32(@data[3],@datax[0]); ++ &rev32(@data[2],@datax[1]); ++ &rev32(@data[1],@datax[2]); ++ &rev32(@data[0],@datax[3]); ++} ++ ++sub mov_reg_to_vec() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $desv = shift; ++$code.=<<___; ++ mov $desv.d[0],$src0 ++ mov $desv.d[1],$src1 ++#ifdef __ARMEB__ ++ rev32 $desv.16b,$desv.16b ++#endif ++___ ++} ++ ++sub mov_vec_to_reg() { ++ my $srcv = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $des0,$srcv.d[0] ++ mov $des1,$srcv.d[1] ++___ ++} ++ ++sub compute_tweak() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $wtmp0,0x87 ++ extr $xtmp2,$src1,$src1,#32 ++ extr $des1,$src1,$src0,#63 ++ and $wtmp1,$wtmp0,$wtmp2,asr#31 ++ eor $des0,$xtmp1,$src0,lsl#1 ++___ ++} ++ ++sub compute_tweak_vec() { ++ my $src = shift; ++ my $des = shift; ++ &rbit(@vtmp[2],$src); ++$code.=<<___; ++ ldr @qtmp[0], =0x01010101010101010101010101010187 ++ shl $des.16b, @vtmp[2].16b, #1 ++ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 ++ ushr @vtmp[1].16b, @vtmp[1].16b, #7 ++ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b ++ eor $des.16b, $des.16b, @vtmp[1].16b ++___ ++ &rbit($des,$des); ++} ++ ++sub mov_en_to_enc(){ ++ my $en = shift; ++ if ($en eq "en") { ++$code.=<<___; ++ mov $enc,1 ++___ ++ } else { ++$code.=<<___; ++ mov $enc,0 ++___ ++ } ++} ++ ++sub rbit() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++ if ($standard eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } else { ++$code.=<<___; ++ mov $dst.16b,$src.16b ++___ ++ } ++ } else { ++ if ($standard eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } ++ } ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8-a+crypto ++.text ++ ++.type ${prefix}_consts,%object ++.align 7 ++${prefix}_consts: ++.Lck: ++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 ++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 ++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 ++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 ++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 ++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 ++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 ++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 ++.Lfk: ++ .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc ++.Lshuffles: ++ .long 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x03020100 ++ ++.size ${prefix}_consts,.-${prefix}_consts ++___ ++ ++{{{ ++my ($userKey,$roundKey,$enc)=("x0","x1","w2"); ++my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); ++my ($vkey,$vfk,$vmap)=("v5","v6","v7"); ++$code.=<<___; ++.type ${prefix}_set_key,%function ++.align 4 ++${prefix}_set_key: ++ ld1 {$vkey.4s},[$userKey] ++___ ++ &load_sbox_matrix(); ++ &rev32($vkey,$vkey); ++$code.=<<___; ++ adr $pointer,.Lshuffles ++ ld1 {$vmap.4s},[$pointer] ++ adr $pointer,.Lfk ++ ld1 {$vfk.4s},[$pointer] ++ eor 
$vkey.16b,$vkey.16b,$vfk.16b ++ mov $schedules,#32 ++ adr $pointer,.Lck ++ movi @vtmp[0].16b,#64 ++ cbnz $enc,1f ++ add $roundKey,$roundKey,124 ++1: ++ mov $wtmp,$vkey.s[1] ++ ldr $roundkey,[$pointer],#4 ++ eor $roundkey,$roundkey,$wtmp ++ mov $wtmp,$vkey.s[2] ++ eor $roundkey,$roundkey,$wtmp ++ mov $wtmp,$vkey.s[3] ++ eor $roundkey,$roundkey,$wtmp ++ ++ // optimize sbox using AESE instruction ++ mov @data[0].s[0],$roundkey ++ tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); ++$code.=<<___; ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b ++ aese @vtmp[0].16b,@vtmp[1].16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); ++$code.=<<___; ++ mov $wtmp,@vtmp[0].s[0] ++ ++ // linear transformation ++ eor $roundkey,$wtmp,$wtmp,ror #19 ++ eor $roundkey,$roundkey,$wtmp,ror #9 ++ mov $wtmp,$vkey.s[0] ++ eor $roundkey,$roundkey,$wtmp ++ mov $vkey.s[0],$roundkey ++ cbz $enc,2f ++ str $roundkey,[$roundKey],#4 ++ b 3f ++2: ++ str $roundkey,[$roundKey],#-4 ++3: ++ tbl $vkey.16b,{$vkey.16b},$vmap.16b ++ subs $schedules,$schedules,#1 ++ b.ne 1b ++ ret ++.size ${prefix}_set_key,.-${prefix}_set_key ++___ ++}}} ++ ++ ++{{{ ++$code.=<<___; ++.type ${prefix}_enc_4blks,%function ++.align 4 ++${prefix}_enc_4blks: ++___ ++ &encrypt_4blks(); ++$code.=<<___; ++ ret ++.size ${prefix}_enc_4blks,.-${prefix}_enc_4blks ++___ ++}}} ++ ++{{{ ++$code.=<<___; ++.type ${prefix}_enc_8blks,%function ++.align 4 ++${prefix}_enc_8blks: ++___ ++ &encrypt_8blks($rks1); ++$code.=<<___; ++ ret ++.size ${prefix}_enc_8blks,.-${prefix}_enc_8blks ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++$code.=<<___; ++.globl ${prefix}_set_encrypt_key ++.type ${prefix}_set_encrypt_key,%function ++.align 5 ++${prefix}_set_encrypt_key: ++ stp x29,x30,[sp,#-16]! ++ mov w2,1 ++ bl ${prefix}_set_key ++ ldp x29,x30,[sp],#16 ++ ret ++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++$code.=<<___; ++.globl ${prefix}_set_decrypt_key ++.type ${prefix}_set_decrypt_key,%function ++.align 5 ++${prefix}_set_decrypt_key: ++ stp x29,x30,[sp,#-16]! ++ mov w2,0 ++ bl ${prefix}_set_key ++ ldp x29,x30,[sp],#16 ++ ret ++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key ++___ ++}}} ++ ++ ++{{{ ++ ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++ stp d8,d9,[sp,#-0x10]! ++ stp d10,d11,[sp,#-0x10]! ++ stp d12,d13,[sp,#-0x10]! ++ stp d14,d15,[sp,#-0x10]! ++ stp x16,x17,[sp,#-0x10]! ++ stp x29,x30,[sp,#-0x10]! 
++___ ++ &load_sbox_matrix(); ++$code.=<<___; ++ // convert length into blocks ++ lsr x2,x2,4 ++.Lecb_8_blocks_process: ++ cmp $blocks,#8 ++ b.lt .Lecb_4_blocks_process ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],@datax[3]); ++$code.=<<___; ++ bl ${prefix}_enc_8blks ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lecb_8_blocks_process ++ b 100f ++.Lecb_4_blocks_process: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl ${prefix}_enc_4blks ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ sub $blocks,$blocks,#4 ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp] ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0],$rks1); ++$code.=<<___; ++ st1 {@data[0].4s},[$outp] ++ b 100f ++1: // process last 2 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 ++ cmp $blocks,#2 ++ b.gt 1f ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl ${prefix}_enc_4blks ++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] ++ b 100f ++1: // process last 3 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl ${prefix}_enc_4blks ++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] ++100: ++ ldp x29,x30,[sp],#0x10 ++ ldp x16,x17,[sp],#0x10 ++ ldp d14,d15,[sp],#0x10 ++ ldp d12,d13,[sp],#0x10 ++ ldp d10,d11,[sp],#0x10 ++ ldp d8,d9,[sp],#0x10 ++ ret ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ ++}}} ++ ++{{{ ++sub gen_xts_do_cipher() { ++$code.=<<___; ++.globl ${prefix}_xts_do_cipher${standard} ++.type ${prefix}_xts_do_cipher${standard},%function ++.align 5 ++${prefix}_xts_do_cipher${standard}: ++ stp x29,x30,[sp,#-16]! 
++ ld1 {@tweak[0].4s}, [$ivp] ++___ ++ &load_sbox_matrix(); ++ &rev32(@tweak[0],@tweak[0]); ++ &encrypt_1blk(@tweak[0],$rks2); ++$code.=<<___; ++ and $remain,$len,#0x0F ++ // convert length into blocks ++ lsr $blocks,$len,4 ++ cmp $blocks,#1 ++ b.lt .return${standard} ++ ++ cmp $remain,0 ++ // If the encryption/decryption Length is N times of 16, ++ // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${standard} ++ b.eq .xts_encrypt_blocks${standard} ++ ++ // If the encryption/decryption length is not N times of 16, ++ // the last two blocks are encrypted/decrypted in .last_2blks_tweak${standard} or .only_2blks_tweak${standard} ++ // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${standard} ++ subs $blocks,$blocks,#1 ++ b.eq .only_2blks_tweak${standard} ++.xts_encrypt_blocks${standard}: ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rev32_armeb(@tweak[0],@tweak[0]); ++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); ++$code.=<<___; ++.Lxts_8_blocks_process${standard}: ++ cmp $blocks,#8 ++___ ++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); ++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); ++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); ++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); ++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); ++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); ++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); ++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); ++ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); ++$code.=<<___; ++ b.lt .Lxts_4_blocks_process${standard} ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++ &rbit(@tweak[2],@tweak[2]); ++ &rbit(@tweak[3],@tweak[3]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++ eor @data[3].16b, @data[3].16b, @tweak[3].16b ++ ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[4],@tweak[4]); ++ &rbit(@tweak[5],@tweak[5]); ++ &rbit(@tweak[6],@tweak[6]); ++ &rbit(@tweak[7],@tweak[7]); ++$code.=<<___; ++ eor @datax[0].16b, @datax[0].16b, @tweak[4].16b ++ eor @datax[1].16b, @datax[1].16b, @tweak[5].16b ++ eor @datax[2].16b, @datax[2].16b, @tweak[6].16b ++ eor @datax[3].16b, @datax[3].16b, @tweak[7].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],@datax[3]); ++ &transpose(@data,@vtmp); ++ &transpose(@datax,@vtmp); ++$code.=<<___; ++ bl ${prefix}_enc_8blks ++___ ++ &transpose(@vtmp,@datax); ++ &transpose(@data,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b, 
@vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b ++ eor @data[0].16b, @data[0].16b, @tweak[4].16b ++ eor @data[1].16b, @data[1].16b, @tweak[5].16b ++ eor @data[2].16b, @data[2].16b, @tweak[6].16b ++ eor @data[3].16b, @data[3].16b, @tweak[7].16b ++ ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[7].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lxts_8_blocks_process${standard} ++ b 100f ++.Lxts_4_blocks_process${standard}: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++ &rbit(@tweak[2],@tweak[2]); ++ &rbit(@tweak[3],@tweak[3]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++ eor @data[3].16b, @data[3].16b, @tweak[3].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl ${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ sub $blocks,$blocks,#4 ++ mov @tweak[0].16b,@tweak[4].16b ++ mov @tweak[1].16b,@tweak[5].16b ++ mov @tweak[2].16b,@tweak[6].16b ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[3].16b ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp],#16 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0],$rks1); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ st1 {@data[0].4s},[$outp],#16 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[0].16b ++ b 100f ++1: // process last 2 blocks ++ cmp $blocks,#2 ++ b.gt 1f ++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl ${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[1].16b ++ b 100f ++1: // process last 3 blocks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 ++___ ++ &rbit(@tweak[0],@tweak[0]); ++ &rbit(@tweak[1],@tweak[1]); ++ &rbit(@tweak[2],@tweak[2]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl ${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, 
@vtmp[0].16b, @tweak[0].16b
++	eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++	eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++	st1	{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
++	// save the last tweak
++	mov	$lastTweak.16b,@tweak[2].16b
++100:
++	cmp $remain,0
++	b.eq .return${standard}
++
++// This branch calculates the last two tweaks
++// when the encryption/decryption length is larger than 32
++.last_2blks_tweak${standard}:
++___
++	&rev32_armeb($lastTweak,$lastTweak);
++	&compute_tweak_vec($lastTweak,@tweak[1]);
++	&compute_tweak_vec(@tweak[1],@tweak[2]);
++$code.=<<___;
++	b .check_dec${standard}
++
++
++// This branch calculates the last two tweaks
++// when the encryption/decryption length is exactly 32, which only needs two tweaks
++.only_2blks_tweak${standard}:
++	mov @tweak[1].16b,@tweak[0].16b
++___
++	&rev32_armeb(@tweak[1],@tweak[1]);
++	&compute_tweak_vec(@tweak[1],@tweak[2]);
++$code.=<<___;
++	b .check_dec${standard}
++
++
++// Determine whether encryption or decryption is required.
++// The last two tweaks need to be swapped for decryption.
++.check_dec${standard}:
++	// encryption:1 decryption:0
++	cmp $enc,1
++	b.eq .prcess_last_2blks${standard}
++	mov @vtmp[0].16B,@tweak[1].16b
++	mov @tweak[1].16B,@tweak[2].16b
++	mov @tweak[2].16B,@vtmp[0].16b
++
++.prcess_last_2blks${standard}:
++___
++	&rev32_armeb(@tweak[1],@tweak[1]);
++	&rev32_armeb(@tweak[2],@tweak[2]);
++$code.=<<___;
++	ld1	{@data[0].4s},[$inp],#16
++	eor @data[0].16b, @data[0].16b, @tweak[1].16b
++___
++	&rev32(@data[0],@data[0]);
++	&encrypt_1blk(@data[0],$rks1);
++$code.=<<___;
++	eor @data[0].16b, @data[0].16b, @tweak[1].16b
++	st1	{@data[0].4s},[$outp],#16
++
++	sub $lastBlk,$outp,16
++	.loop${standard}:
++	subs $remain,$remain,1
++	ldrb	$wtmp0,[$lastBlk,$remain]
++	ldrb	$wtmp1,[$inp,$remain]
++	strb	$wtmp1,[$lastBlk,$remain]
++	strb	$wtmp0,[$outp,$remain]
++	b.gt .loop${standard}
++	ld1	{@data[0].4s}, [$lastBlk]
++	eor @data[0].16b, @data[0].16b, @tweak[2].16b
++___
++	&rev32(@data[0],@data[0]);
++	&encrypt_1blk(@data[0],$rks1);
++$code.=<<___;
++	eor @data[0].16b, @data[0].16b, @tweak[2].16b
++	st1	{@data[0].4s}, [$lastBlk]
++.return${standard}:
++	ldp	x29,x30,[sp],#16
++	ret
++.size	${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard}
++___
++} #end of gen_xts_do_cipher
++
++}}}
++
++{{{
++sub gen_xts_cipher() {
++	my $en = shift;
++
++$code.=<<___;
++.globl	${prefix}_xts_${en}crypt${standard}
++.type	${prefix}_xts_${en}crypt${standard},%function
++.align	5
++${prefix}_xts_${en}crypt${standard}:
++	stp	x15, x16, [sp, #-0x10]!
++	stp	x17, x18, [sp, #-0x10]!
++	stp	x19, x20, [sp, #-0x10]!
++	stp	x21, x22, [sp, #-0x10]!
++	stp	x23, x24, [sp, #-0x10]!
++	stp	x25, x26, [sp, #-0x10]!
++	stp	x27, x28, [sp, #-0x10]!
++	stp	x29, x30, [sp, #-0x10]!
++	stp	d8, d9, [sp, #-0x10]!
++	stp	d10, d11, [sp, #-0x10]!
++	stp	d12, d13, [sp, #-0x10]!
++	stp	d14, d15, [sp, #-0x10]!
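++	// x15-x30 and d8-d15 are saved here and restored in reverse order
++	// after the shared do_cipher routine below returns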
++___
++	&mov_en_to_enc($en);
++$code.=<<___;
++	bl	${prefix}_xts_do_cipher${standard}
++	ldp	d14, d15, [sp], #0x10
++	ldp	d12, d13, [sp], #0x10
++	ldp	d10, d11, [sp], #0x10
++	ldp	d8, d9, [sp], #0x10
++	ldp	x29, x30, [sp], #0x10
++	ldp	x27, x28, [sp], #0x10
++	ldp	x25, x26, [sp], #0x10
++	ldp	x23, x24, [sp], #0x10
++	ldp	x21, x22, [sp], #0x10
++	ldp	x19, x20, [sp], #0x10
++	ldp	x17, x18, [sp], #0x10
++	ldp	x15, x16, [sp], #0x10
++	ret
++.size	${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard}
++___
++
++} # end of gen_xts_cipher
++$standard="_gb";
++&gen_xts_do_cipher();
++&gen_xts_cipher("en");
++&gen_xts_cipher("de");
++$standard="";
++&gen_xts_do_cipher();
++&gen_xts_cipher("en");
++&gen_xts_cipher("de");
++}}}
++
++########################################
++open SELF,$0;
++while(<SELF>) {
++	next if (/^#!/);
++	last if (!s/^#/\/\// and !/^$/);
++	print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {
++	s/\`([^\`]*)\`/eval($1)/ge;
++	print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
+index b65a7d1..bb042c5 100644
+--- a/crypto/sm4/build.info
++++ b/crypto/sm4/build.info
+@@ -1,4 +1,7 @@
+ LIBS=../../libcrypto
+ SOURCE[../../libcrypto]=\
+-        sm4.c
++        sm4.c {- $target{sm4_asm_src} -}
+ 
++
++GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl $(PERLASM_SCHEME)
++INCLUDE[vpsm4_ex-armv8.o]=..
+\ No newline at end of file
+diff --git a/doc/man3/EVP_sm4_xts.pod b/doc/man3/EVP_sm4_xts.pod
+new file mode 100644
+index 0000000..09ca3fb
+--- /dev/null
++++ b/doc/man3/EVP_sm4_xts.pod
+@@ -0,0 +1,67 @@
++=pod
++
++=head1 NAME
++
++EVP_sm4_xts,
++- EVP SM4 cipher
++
++=head1 SYNOPSIS
++
++ #include <openssl/evp.h>
++
++ const EVP_CIPHER *EVP_sm4_xts(void);
++
++=head1 DESCRIPTION
++
++The XTS mode of operation (GB/T 17964-2021) for the SM4 block cipher.
++
++=over 4
++
++=item EVP_sm4_xts(),
++
++The SM4 blockcipher with a 256-bit key in XTS mode. This mode uses a key length of 256 bits and acts on blocks of 128 bits.
++
++The B<iv> parameter to L<EVP_EncryptInit_ex(3)> or L<EVP_DecryptInit_ex(3)> is the first XTS "tweak" value. XTS mode has two implementations for calculating the subsequent tweak values: one is standardized in IEEE Std 1619-2007 and is widely used (e.g., XTS-AES); the other was proposed more recently (GB/T 17964-2021, implemented in May 2022) and is currently only used with SM4.
++
++Assuming the same input data (key, IV, and plaintext), the two standards produce different subsequent tweak values. As a result, the first ciphertext block is identical, but the subsequent ciphertext blocks (if any) differ.
++
++By default, EVP_sm4_xts follows GB/T 17964-2021; this can be changed with EVP_CIPHER_CTX_ctrl(). The following control is supported in XTS mode for SM4.
++
++=over 4
++
++=item EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, std, NULL)
++
++Sets the standard of EVP_sm4_xts to B<std>. This must be 0 or 1: 0 selects XTS mode as specified in GB/T 17964-2021, 1 selects XTS mode as specified in IEEE Std 1619-2007.
++
++=back
++
++The XTS implementation in OpenSSL does not support streaming. That is, there must
++only be one L<EVP_EncryptUpdate(3)> call per L<EVP_EncryptInit_ex(3)> call (and
++similarly with the "Decrypt" functions).
++
++=back
++
++=head1 RETURN VALUES
++
++These functions return an B<EVP_CIPHER> structure that contains the
++implementation of the symmetric cipher. See L<EVP_CIPHER_meth_new(3)> for
++details of the B<EVP_CIPHER> structure.
++
++=head1 SEE ALSO
++
++L<evp(7)>,
++L<EVP_EncryptInit(3)>,
++L<EVP_CIPHER_meth_new(3)>
++
++=head1 COPYRIGHT
++
++Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++Copyright 2022 Ribose Inc. All Rights Reserved.
++ ++Licensed under the OpenSSL license (the "License"). You may not use ++this file except in compliance with the License. You can obtain a copy ++in the file LICENSE in the source distribution or at ++L. ++ ++=cut ++ +diff --git a/fuzz/oids.txt b/fuzz/oids.txt +index 8dfdea9..d1f98a8 100644 +--- a/fuzz/oids.txt ++++ b/fuzz/oids.txt +@@ -1064,3 +1064,4 @@ OBJ_id_tc26_gost_3410_2012_256_paramSetD="\x2A\x85\x03\x07\x01\x02\x01\x01\x04" + OBJ_hmacWithSHA512_224="\x2A\x86\x48\x86\xF7\x0D\x02\x0C" + OBJ_hmacWithSHA512_256="\x2A\x86\x48\x86\xF7\x0D\x02\x0D" + OBJ_SM2_with_SM3="\x2A\x81\x1C\xCF\x55\x01\x83\x75" ++OBJ_sm4_xts="\x2A\x81\x1C\xCF\x55\x01\x68\x0A" +diff --git a/include/openssl/evp.h b/include/openssl/evp.h +index 3116c1b..69326bc 100644 +--- a/include/openssl/evp.h ++++ b/include/openssl/evp.h +@@ -353,6 +353,9 @@ int (*EVP_CIPHER_meth_get_ctrl(const EVP_CIPHER *cipher))(EVP_CIPHER_CTX *, + + # define EVP_CTRL_GET_IVLEN 0x25 + ++/* Set the XTS mode standard, SM4 only */ ++# define EVP_CTRL_XTS_STANDARD 0x26 ++ + /* Padding modes */ + #define EVP_PADDING_PKCS7 1 + #define EVP_PADDING_ISO7816_4 2 +@@ -937,6 +940,7 @@ const EVP_CIPHER *EVP_sm4_cfb128(void); + # define EVP_sm4_cfb EVP_sm4_cfb128 + const EVP_CIPHER *EVP_sm4_ofb(void); + const EVP_CIPHER *EVP_sm4_ctr(void); ++const EVP_CIPHER *EVP_sm4_xts(void); + # endif + + # if OPENSSL_API_COMPAT < 0x10100000L +diff --git a/include/openssl/modes.h b/include/openssl/modes.h +index d544f98..dea324f 100644 +--- a/include/openssl/modes.h ++++ b/include/openssl/modes.h +@@ -22,6 +22,10 @@ typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], int enc); + ++typedef void (*ecb128_f) (const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ int enc); ++ + typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out, + size_t blocks, const void *key, + const unsigned char ivec[16]); +@@ -153,6 +157,11 @@ int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len, int enc); + ++int CRYPTO_xts128gb_encrypt(const XTS128_CONTEXT *ctx, ++ const unsigned char iv[16], ++ const unsigned char *inp, unsigned char *out, ++ size_t len, int enc); ++ + size_t CRYPTO_128_wrap(void *key, const unsigned char *iv, + unsigned char *out, + const unsigned char *in, size_t inlen, +diff --git a/include/openssl/obj_mac.h b/include/openssl/obj_mac.h +index 9b125c1..edfc87d 100644 +--- a/include/openssl/obj_mac.h ++++ b/include/openssl/obj_mac.h +@@ -4772,6 +4772,11 @@ + #define NID_sm4_ctr 1139 + #define OBJ_sm4_ctr OBJ_sm_scheme,104L,7L + ++#define SN_sm4_xts "SM4-XTS" ++#define LN_sm4_xts "sm4-xts" ++#define NID_sm4_xts 1196 ++#define OBJ_sm4_xts OBJ_sm_scheme,104L,10L ++ + #define SN_hmac "HMAC" + #define LN_hmac "hmac" + #define NID_hmac 855 +diff --git a/test/evp_test.c b/test/evp_test.c +index 62f20ec..3c65ce9 100644 +--- a/test/evp_test.c ++++ b/test/evp_test.c +@@ -485,6 +485,8 @@ typedef struct cipher_data_st { + unsigned char *tag; + size_t tag_len; + int tag_late; ++ /* SM4 XTS only */ ++ int std; + } CIPHER_DATA; + + static int cipher_test_init(EVP_TEST *t, const char *alg) +@@ -568,6 +570,15 @@ static int cipher_test_parse(EVP_TEST *t, const char *keyword, + return -1; + return 1; + } ++ if (strcmp(keyword, "Standard") == 0) { ++ if (strcmp(value, "GB") == 0) ++ cdat->std = 0; ++ else if (strcmp(value, "IEEE") == 0) ++ cdat->std = 1; ++ else ++ return -1; ++ return 1; ++ } + return 0; + } + +@@ 
-707,7 +718,11 @@ static int cipher_test_enc(EVP_TEST *t, int enc, + goto err; + } + } +- ++ if (expected->std) { ++ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, expected->std, NULL)) { ++ goto err; ++ }; ++ } + EVP_CIPHER_CTX_set_padding(ctx, 0); + t->err = "CIPHERUPDATE_ERROR"; + tmplen = 0; +diff --git a/test/recipes/30-test_evp_data/evpciph.txt b/test/recipes/30-test_evp_data/evpciph.txt +index 76c839b..a3687bc 100644 +--- a/test/recipes/30-test_evp_data/evpciph.txt ++++ b/test/recipes/30-test_evp_data/evpciph.txt +@@ -2132,6 +2132,28 @@ IV = 0123456789ABCDEFFEDCBA9876543210 + Plaintext = AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDEEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFEEEEEEEEEEEEEEEEAAAAAAAAAAAAAAAA + Ciphertext = C2B4759E78AC3CF43D0852F4E8D5F9FD7256E8A5FCB65A350EE00630912E44492A0B17E1B85B060D0FBA612D8A95831638B361FD5FFACD942F081485A83CA35D + ++Title = SM4 XTS test vectors, the XTS mode is standardized in GB/T 17964-2021 by default ++Cipher = SM4-XTS ++Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F ++IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF ++Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 ++Ciphertext = E9538251C71D7B80BBE4483FEF497BD12C5C581BD6242FC51E08964FB4F60FDB0BA42F63499279213D318D2C11F6886E903BE7F93A1B3479 ++ ++Title = SM4 test vectors for XTS mode in GB/T 17964-2021 and IEEE Std 1619-2007 ++Cipher = SM4-XTS ++Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F ++IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF ++Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 ++Ciphertext = E9538251C71D7B80BBE4483FEF497BD12C5C581BD6242FC51E08964FB4F60FDB0BA42F63499279213D318D2C11F6886E903BE7F93A1B3479 ++Standard = GB ++ ++Cipher = SM4-XTS ++Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F ++IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF ++Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 ++Ciphertext = E9538251C71D7B80BBE4483FEF497BD1B3DB1A3E60408C575D63FF7DB39F83260869F9E2585FEC9F0B863BF8FD784B8627D16C0DB6D2CFC7 ++Standard = IEEE ++ + Title = ARIA test vectors from RFC5794 (and others) + + Cipher = ARIA-128-ECB +diff --git a/util/libcrypto.num b/util/libcrypto.num +index 95bccf9..62e2ea2 100644 +--- a/util/libcrypto.num ++++ b/util/libcrypto.num +@@ -4632,3 +4632,5 @@ X509_REQ_get0_sm2_id 6385 1_1_1m EXIST::FUNCTION:SM2 + X509_REQ_set0_sm2_id 6386 1_1_1m EXIST::FUNCTION:SM2 + EVP_PKEY_is_sm2 6387 1_1_1m EXIST::FUNCTION:SM2 + SM2_compute_key 6388 1_1_1m EXIST::FUNCTION: ++EVP_sm4_xts 6389 1_1_1m EXIST::FUNCTION:SM4 ++CRYPTO_xts128gb_encrypt 6390 1_1_1m EXIST::FUNCTION: +\ No newline at end of file +-- +2.36.1 + diff --git a/openssl.spec b/openssl.spec index 70823d0f7cd4c0a994913359ecc26ab6e8f2c31f..02a8f138566d737e068b769311e48638943dd7fd 100644 --- a/openssl.spec +++ b/openssl.spec @@ -2,7 +2,7 @@ Name: openssl Epoch: 1 Version: 1.1.1m -Release: 13 +Release: 15 Summary: Cryptography and SSL/TLS Toolkit License: OpenSSL and SSLeay URL: https://www.openssl.org/ @@ -37,6 +37,10 @@ Patch26: Feature-Support-TLCP-protocol.patch Patch27: Feature-X509-command-supports-SM2-certificate-signing-with-default-sm2id.patch Patch28: Feature-PKCS7-sign-and-verify-support-SM2-algorithm.patch Patch29: backport-Update-further-expiring-certificates-that-affect-tes.patch +Patch30: 
Feature-add-ARMv8-implementations-of-SM4-in-ECB-and-XTS.patch +Patch31: Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch +Patch32: Backport-SM4-optimization-for-ARM-by-HW-instruction.patch +Patch33: Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch BuildRequires: gcc perl make lksctp-tools-devel coreutils util-linux zlib-devel Requires: coreutils %{name}-libs%{?_isa} = %{epoch}:%{version}-%{release} @@ -239,6 +243,14 @@ make test || : %ldconfig_scriptlets libs %changelog +* Sat Oct 29 2022 Xu Yizhou - 1:1.1.1m-15 +- SM3 acceleration with SM3 hardware instruction on aarch64 +- SM4 optimization for ARM by HW instruction +- SM4 XTS optimization for ARM by HW instruction + +* Sat Oct 29 2022 Xu Yizhou - 1:1.1.1m-14 +- add ARMv8 implementations of SM4 in ECB and XTS + * Fri Oct 28 2022 zhujianwei - 1:1.1.1m-13 - update further expiring certificates
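
The EVP_CTRL_XTS_STANDARD control added by the SM4-XTS patch above is only exercised through the evp_test harness in this series. The following stand-alone sketch shows how an application built against the patched libcrypto might select the tweak-calculation standard for EVP_sm4_xts(); the helper name, buffer handling, and error paths are illustrative assumptions, not code taken from the patches.

#include <openssl/evp.h>

#ifndef EVP_CTRL_XTS_STANDARD
# define EVP_CTRL_XTS_STANDARD 0x26   /* value defined by the evp.h hunk above */
#endif

/*
 * One-shot SM4-XTS encryption. key is 32 bytes (256 bits), iv is the 16-byte
 * first tweak. std is 0 for GB/T 17964-2021 (the default) or 1 for
 * IEEE Std 1619-2007. Returns the ciphertext length, or -1 on error.
 */
static int sm4_xts_encrypt(const unsigned char key[32],
                           const unsigned char iv[16],
                           const unsigned char *in, int inlen,
                           unsigned char *out, int std)
{
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    int outl = 0, tmplen = 0, ok = 0;

    if (ctx == NULL)
        return -1;
    if (!EVP_EncryptInit_ex(ctx, EVP_sm4_xts(), NULL, key, iv))
        goto done;
    /* Select the tweak-calculation standard, as documented in EVP_sm4_xts.pod */
    if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, std, NULL))
        goto done;
    /* XTS does not stream: exactly one update call per init call */
    if (!EVP_EncryptUpdate(ctx, out, &outl, in, inlen))
        goto done;
    if (!EVP_EncryptFinal_ex(ctx, out + outl, &tmplen))
        goto done;
    ok = 1;
done:
    EVP_CIPHER_CTX_free(ctx);
    return ok ? outl + tmplen : -1;
}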