From 0395a50058dbe3ae7110cf2736de26bd2d361014 Mon Sep 17 00:00:00 2001 From: dumbdog Date: Thu, 14 Sep 2023 11:30:06 +0800 Subject: [PATCH 1/5] [Backport] add ARMv8 implementations of SM4 in ECB and XTS reference: https://gitee.com/src-openeuler/openssl/blob/openEuler-22.03-LTS-Next/Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch Signed-off-by: dumbdog --- Configurations/00-base-templates.conf | 1 + Configure | 4 + crypto/evp/c_allc.c | 1 + crypto/evp/e_sm4.c | 352 ++++++- crypto/modes/build.info | 2 +- crypto/modes/xts128gb.c | 204 ++++ crypto/objects/obj_dat.h | 16 +- crypto/objects/obj_mac.num | 1 + crypto/objects/objects.txt | 1 + crypto/sm4/asm/vpsm4_ex-armv8.pl | 1173 +++++++++++++++++++++ crypto/sm4/build.info | 5 +- doc/man3/EVP_sm4_xts.pod | 67 ++ fuzz/oids.txt | 1 + include/openssl/evp.h | 4 + include/openssl/modes.h | 9 + include/openssl/obj_mac.h | 5 + test/evp_test.c | 17 +- test/recipes/30-test_evp_data/evpciph.txt | 22 + util/libcrypto.num | 2 + 19 files changed, 1833 insertions(+), 54 deletions(-) create mode 100644 crypto/modes/xts128gb.c create mode 100644 crypto/sm4/asm/vpsm4_ex-armv8.pl create mode 100644 doc/man3/EVP_sm4_xts.pod diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf index e01dc63a8b..1d3501242b 100644 --- a/Configurations/00-base-templates.conf +++ b/Configurations/00-base-templates.conf @@ -321,6 +321,7 @@ my %targets=( chacha_asm_src => "chacha-armv8.S", poly1305_asm_src=> "poly1305-armv8.S", keccak1600_asm_src => "keccak1600-armv8.S", + sm4_asm_src => "vpsm4_ex-armv8.S", }, parisc11_asm => { template => 1, diff --git a/Configure b/Configure index 78cc15d184..ae82beb67d 100755 --- a/Configure +++ b/Configure @@ -1417,6 +1417,9 @@ unless ($disabled{asm}) { if ($target{poly1305_asm_src} ne "") { push @{$config{lib_defines}}, "POLY1305_ASM"; } + if ($target{sm4_asm_src} ne "") { + push @{$config{lib_defines}}, "VPSM4_EX_ASM"; + } } my %predefined_C = compiler_predefined($config{CROSS_COMPILE}.$config{CC}); @@ -3372,6 +3375,7 @@ sub print_table_entry "mtoutflag", "multilib", "build_scheme", + "sm4_asm_src", ); if ($type eq "TABLE") { diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c index 22fdcc409c..01b0d1f8ca 100644 --- a/crypto/evp/c_allc.c +++ b/crypto/evp/c_allc.c @@ -85,6 +85,7 @@ void openssl_add_all_ciphers_int(void) EVP_add_cipher(EVP_sm4_cfb()); EVP_add_cipher(EVP_sm4_ofb()); EVP_add_cipher(EVP_sm4_ctr()); + EVP_add_cipher(EVP_sm4_xts()); EVP_add_cipher_alias(SN_sm4_cbc, "SM4"); EVP_add_cipher_alias(SN_sm4_cbc, "sm4"); #endif diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c index fce32794fc..169d6c71d1 100644 --- a/crypto/evp/e_sm4.c +++ b/crypto/evp/e_sm4.c @@ -15,86 +15,346 @@ # include # include "crypto/sm4.h" # include "crypto/evp.h" +# include "evp_local.h" +# include "modes_local.h" + +#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__)) +# include "arm_arch.h" +# if __ARM_MAX_ARCH__>=7 +# if defined(VPSM4_EX_ASM) +# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) +# endif +# endif +#endif typedef struct { - SM4_KEY ks; + union { + double align; + SM4_KEY ks; + } ks; + block128_f block; + union { + ecb128_f ecb; + } stream; } EVP_SM4_KEY; +#ifdef VPSM4_EX_CAPABLE +void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +#define vpsm4_ex_encrypt SM4_encrypt +#define vpsm4_ex_decrypt SM4_encrypt +void vpsm4_ex_ecb_encrypt( + const unsigned 
char *in, unsigned char *out, size_t length, const SM4_KEY *key, const int enc); +/* xts mode in GB/T 17964-2021 */ +void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +/* xts mode in IEEE Std 1619-2007 */ +void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +#endif + +# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \ +static const EVP_CIPHER sm4_##mode = { \ + nid##_##nmode,blocksize,128/8,ivlen, \ + flags|EVP_CIPH_##MODE##_MODE, \ + sm4_init_key, \ + sm4_##mode##_cipher, \ + NULL, \ + sizeof(EVP_SM4_KEY), \ + NULL,NULL,NULL,NULL }; \ +const EVP_CIPHER *EVP_sm4_##mode(void) \ +{ return &sm4_##mode; } + +#define BLOCK_CIPHER_generic_pack(nid,flags) \ + BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags) + static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, const unsigned char *iv, int enc) { - SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + int mode; + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx); + + mode = EVP_CIPHER_CTX_mode(ctx); + if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) { +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_decrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) vpsm4_ex_decrypt; + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) vpsm4_ex_ecb_encrypt; + } else +#endif + { + dat->block = (block128_f)SM4_decrypt; + SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + } + } else { +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_encrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) vpsm4_ex_encrypt; + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) vpsm4_ex_ecb_encrypt; + } else +#endif + { + dat->block = (block128_f)SM4_encrypt; + SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + } + } return 1; } -static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out, - size_t len, const SM4_KEY *key, - unsigned char *ivec, const int enc) +static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - if (enc) - CRYPTO_cbc128_encrypt(in, out, len, key, ivec, - (block128_f)SM4_encrypt); + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (EVP_CIPHER_CTX_encrypting(ctx)) + CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); else - CRYPTO_cbc128_decrypt(in, out, len, key, ivec, - (block128_f)SM4_decrypt); + CRYPTO_cbc128_decrypt(in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); + return 1; } -static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out, - size_t length, const SM4_KEY *key, - unsigned char *ivec, int *num, const int enc) +static int 
sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc, - (block128_f)SM4_encrypt); + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + int num = EVP_CIPHER_CTX_num(ctx); + + CRYPTO_cfb128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, &num, + EVP_CIPHER_CTX_encrypting(ctx), dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + + return 1; } -static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, - const SM4_KEY *key, const int enc) +static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - if (enc) - SM4_encrypt(in, out, key); + size_t bl = EVP_CIPHER_CTX_block_size(ctx); + size_t i; + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (len < bl){ + return 1; + } + if (dat->stream.ecb != NULL) + (*dat->stream.ecb) (in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_encrypting(ctx)); else - SM4_decrypt(in, out, key); + for (i = 0, len -= bl; i <= len; i += bl) + (*dat->block) (in + i, out + i, &dat->ks.ks); + return 1; } -static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out, - size_t length, const SM4_KEY *key, - unsigned char *ivec, int *num) +static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num, - (block128_f)SM4_encrypt); -} + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + int num = EVP_CIPHER_CTX_num(ctx); -IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4, - 16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1, - sm4_init_key, 0, 0, 0, 0) + CRYPTO_ofb128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, &num, dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + return 1; +} static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t len) { - unsigned int num = EVP_CIPHER_CTX_num(ctx); - EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx); + int n = EVP_CIPHER_CTX_num(ctx); + unsigned int num; + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (n < 0) + return 0; + num = (unsigned int)n; - CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, - EVP_CIPHER_CTX_iv_noconst(ctx), - EVP_CIPHER_CTX_buf_noconst(ctx), &num, - (block128_f)SM4_encrypt); + CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, + EVP_CIPHER_CTX_buf_noconst(ctx), &num, + dat->block); EVP_CIPHER_CTX_set_num(ctx, num); return 1; } -static const EVP_CIPHER sm4_ctr_mode = { - NID_sm4_ctr, 1, 16, 16, - EVP_CIPH_CTR_MODE, - sm4_init_key, - sm4_ctr_cipher, - NULL, - sizeof(EVP_SM4_KEY), - NULL, NULL, NULL, NULL -}; +BLOCK_CIPHER_generic_pack(NID_sm4, 0) -const EVP_CIPHER *EVP_sm4_ctr(void) +typedef struct { + union { + double align; + SM4_KEY ks; + } ks1, ks2; /* sm4 key schedules to use */ + XTS128_CONTEXT xts; + int std; /* 0 for xts mode in GB/T 17964-2021 */ + /* 1 for xts mode in IEEE Std 1619-2007 */ + void (*stream_gb) (const unsigned char *in, + unsigned char *out, size_t length, + const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char iv[16]); /* stream for xts mode in GB/T 17964-2021 */ + void (*stream) (const unsigned char *in, + unsigned char *out, size_t length, + const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char iv[16]); /* stream for xts mode in IEEE Std 1619-2007 */ +} EVP_SM4_XTS_CTX; + +static int sm4_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) +{ + EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX, c); + + if (type == EVP_CTRL_COPY) { + 
EVP_CIPHER_CTX *out = ptr; + EVP_SM4_XTS_CTX *xctx_out = EVP_C_DATA(EVP_SM4_XTS_CTX,out); + + if (xctx->xts.key1) { + if (xctx->xts.key1 != &xctx->ks1) + return 0; + xctx_out->xts.key1 = &xctx_out->ks1; + } + if (xctx->xts.key2) { + if (xctx->xts.key2 != &xctx->ks2) + return 0; + xctx_out->xts.key2 = &xctx_out->ks2; + } + return 1; + } else if (type == EVP_CTRL_XTS_STANDARD) { + if ((arg < 0) || (arg > 1)) + return 0; + xctx->std = arg; + return 1; + } else if (type != EVP_CTRL_INIT) + return -1; + /* key1 and key2 are used as an indicator both key and IV are set */ + xctx->xts.key1 = NULL; + xctx->xts.key2 = NULL; + return 1; +} + +static int sm4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) +{ + EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX,ctx); + + if (!iv && !key) + return 1; + + if (key) + do { + /* The key is two half length keys in reality */ + const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2; + xctx->stream_gb = NULL; + xctx->stream = NULL; +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + if (enc) { + vpsm4_ex_set_encrypt_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) vpsm4_ex_encrypt; + xctx->stream_gb = vpsm4_ex_xts_encrypt_gb; + xctx->stream = vpsm4_ex_xts_encrypt; + } else { + vpsm4_ex_set_decrypt_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) vpsm4_ex_decrypt; + xctx->stream_gb = vpsm4_ex_xts_decrypt_gb; + xctx->stream = vpsm4_ex_xts_decrypt; + } + vpsm4_ex_set_encrypt_key(key + bytes, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f) vpsm4_ex_encrypt; + + xctx->xts.key1 = &xctx->ks1; + break; + } else +#endif + (void)0; /* terminate potentially open 'else' */ + + if (enc) { + SM4_set_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) SM4_encrypt; + } else { + SM4_set_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) SM4_decrypt; + } + + SM4_set_key(key + bytes, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f) SM4_encrypt; + + xctx->xts.key1 = &xctx->ks1; + } while (0); + + if (iv) { + xctx->xts.key2 = &xctx->ks2; + memcpy(EVP_CIPHER_CTX_iv_noconst(ctx), iv, 16); + } + + return 1; +} + +static int sm4_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) +{ + EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX,ctx); + if (!xctx->xts.key1 || !xctx->xts.key2) + return 0; + if (!out || !in || len < SM4_BLOCK_SIZE) + return 0; + if (xctx->std) { + if (xctx->stream) + (*xctx->stream) (in, out, len, + xctx->xts.key1, xctx->xts.key2, + EVP_CIPHER_CTX_iv_noconst(ctx)); + else if (CRYPTO_xts128_encrypt(&xctx->xts, EVP_CIPHER_CTX_iv_noconst(ctx), + in, out, len, + EVP_CIPHER_CTX_encrypting(ctx))) + return 0; + } else { + if (xctx->stream_gb) + (*xctx->stream_gb) (in, out, len, + xctx->xts.key1, xctx->xts.key2, + EVP_CIPHER_CTX_iv_noconst(ctx)); + else if (CRYPTO_xts128gb_encrypt(&xctx->xts, EVP_CIPHER_CTX_iv_noconst(ctx), + in, out, len, + EVP_CIPHER_CTX_encrypting(ctx))) + return 0; + } + return 1; +} + +#define SM4_XTS_BLOCK_SIZE 1 +#define SM4_XTS_IV_LENGTH 16 +#define SM4_XTS_KEY_LENGTH 32 + +#define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \ + | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \ + | EVP_CIPH_CUSTOM_COPY | EVP_CIPH_XTS_MODE) + +static const EVP_CIPHER sm4_xts_mode = { + NID_sm4_xts, + SM4_XTS_BLOCK_SIZE, + SM4_XTS_KEY_LENGTH, + SM4_XTS_IV_LENGTH, + XTS_FLAGS, + sm4_xts_init_key, + sm4_xts_cipher, + NULL, + sizeof(EVP_SM4_XTS_CTX), + NULL, NULL, sm4_xts_ctrl, NULL +}; + +const EVP_CIPHER *EVP_sm4_xts(void) { - 
return &sm4_ctr_mode; + return &sm4_xts_mode; } #endif diff --git a/crypto/modes/build.info b/crypto/modes/build.info index 821340eb90..f974b04457 100644 --- a/crypto/modes/build.info +++ b/crypto/modes/build.info @@ -1,7 +1,7 @@ LIBS=../../libcrypto SOURCE[../../libcrypto]=\ cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \ - ccm128.c xts128.c wrap128.c ocb128.c \ + ccm128.c xts128.c xts128gb.c wrap128.c ocb128.c \ {- $target{modes_asm_src} -} INCLUDE[gcm128.o]=.. diff --git a/crypto/modes/xts128gb.c b/crypto/modes/xts128gb.c new file mode 100644 index 0000000000..370b975229 --- /dev/null +++ b/crypto/modes/xts128gb.c @@ -0,0 +1,204 @@ +/* + * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +// This is the xts mode in GB/T 17964-2021 +#include +#include "modes_local.h" +#include + +#ifndef STRICT_ALIGNMENT +# ifdef __GNUC__ +typedef u64 u64_a1 __attribute((__aligned__(1))); +# else +typedef u64 u64_a1; +# endif +#endif + +int CRYPTO_xts128gb_encrypt(const XTS128_CONTEXT *ctx, + const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, + size_t len, int enc) +{ + const union { + long one; + char little; + } is_endian = { + 1 + }; + union { + u64 u[2]; + u32 d[4]; + u8 c[16]; + } tweak, scratch; + unsigned int i; + + if (len < 16) + return -1; + + memcpy(tweak.c, iv, 16); + + (*ctx->block2) (tweak.c, tweak.c, ctx->key2); + + if (!enc && (len % 16)) + len -= 16; + + while (len >= 16) { +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c, inp, 16); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; +#else + scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0]; + scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1]; +#endif + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out, scratch.c, 16); +#else + ((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0]; + ((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1]; +#endif + inp += 16; + out += 16; + len -= 16; + + if (len == 0) + return 0; + + if (is_endian.little) { + u8 res; + u64 hi, lo; +#ifdef BSWAP8 + hi = BSWAP8(tweak.u[0]); + lo = BSWAP8(tweak.u[1]); +#else + u8 *p = tweak.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + res = (u8)lo & 1; + tweak.u[0] = (lo >> 1) | (hi << 63); + tweak.u[1] = hi >> 1; + if (res) + tweak.c[15] ^= 0xe1; +#ifdef BSWAP8 + hi = BSWAP8(tweak.u[0]); + lo = BSWAP8(tweak.u[1]); +#else + p = tweak.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + tweak.u[0] = lo; + tweak.u[1] = hi; + } else { + u8 Cin, Cout; + Cin = Cout = 0; + for (i = 0; i < 16; ++i) { + Cout = (tweak.c[i] << 7) & 0x80; + tweak.c[i] = ((tweak.c[i] >> 1) + Cin) & 0xff; + Cin = Cout; + } + if (Cout) + tweak.c[0] ^= 0xe1; + } + } + if (enc) { + for (i = 0; i < len; ++i) { + u8 c = inp[i]; + out[i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out - 16, scratch.c, 16); + } else { + union { + u64 u[2]; + u8 c[16]; + } tweak1; + + if (is_endian.little) { + u8 res; + u64 hi, 
lo; +#ifdef BSWAP8 + hi = BSWAP8(tweak.u[0]); + lo = BSWAP8(tweak.u[1]); +#else + u8 *p = tweak.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + res = (u8)lo & 1; + tweak1.u[0] = (lo >> 1) | (hi << 63); + tweak1.u[1] = hi >> 1; + if (res) + tweak1.c[15] ^= 0xe1; +#ifdef BSWAP8 + hi = BSWAP8(tweak1.u[0]); + lo = BSWAP8(tweak1.u[1]); +#else + p = tweak1.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + tweak1.u[0] = lo; + tweak1.u[1] = hi; + } else { + u8 Cin, Cout; + Cin = Cout = 0; + for ( i = 0; i < 16; ++i ) { + Cout = (tweak.c[i] << 7) & 0x80; + tweak1.c[i] = ((tweak.c[i] >> 1) + Cin) & 0xff; + Cin = Cout; + } + if (Cout) + tweak1.c[0] ^= 0xe1; + } +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c, inp, 16); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; +#else + scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0]; + scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1]; +#endif + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; + + for (i = 0; i < len; ++i) { + u8 c = inp[16 + i]; + out[16 + i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out, scratch.c, 16); +#else + ((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0]; + ((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1]; +#endif + } + + return 0; +} diff --git a/crypto/objects/obj_dat.h b/crypto/objects/obj_dat.h index 63bf69e443..36c38d0d22 100644 --- a/crypto/objects/obj_dat.h +++ b/crypto/objects/obj_dat.h @@ -10,7 +10,7 @@ */ /* Serialized OID's */ -static const unsigned char so[7762] = { +static const unsigned char so[7770] = { 0x2A,0x86,0x48,0x86,0xF7,0x0D, /* [ 0] OBJ_rsadsi */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01, /* [ 6] OBJ_pkcs */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x02, /* [ 13] OBJ_md2 */ @@ -1076,9 +1076,10 @@ static const unsigned char so[7762] = { 0x2A,0x85,0x03,0x07,0x01,0x02,0x01,0x01,0x04, /* [ 7736] OBJ_id_tc26_gost_3410_2012_256_paramSetD */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x0C, /* [ 7745] OBJ_hmacWithSHA512_224 */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x0D, /* [ 7753] OBJ_hmacWithSHA512_256 */ + 0x2A,0x81,0x1C,0xCF,0x55,0x01,0x68,0x0A, /* [ 7761] OBJ_sm4_xts */ }; -#define NUM_NID 1195 +#define NUM_NID 1197 static const ASN1_OBJECT nid_objs[NUM_NID] = { {"UNDEF", "undefined", NID_undef}, {"rsadsi", "RSA Data Security, Inc.", NID_rsadsi, 6, &so[0]}, @@ -2275,9 +2276,11 @@ static const ASN1_OBJECT nid_objs[NUM_NID] = { {"magma-mac", "magma-mac", NID_magma_mac}, {"hmacWithSHA512-224", "hmacWithSHA512-224", NID_hmacWithSHA512_224, 8, &so[7745]}, {"hmacWithSHA512-256", "hmacWithSHA512-256", NID_hmacWithSHA512_256, 8, &so[7753]}, + { NULL, NULL, NID_undef }, + {"SM4-XTS", "sm4-xts", NID_sm4_xts, 8, &so[7761]}, }; -#define NUM_SN 1186 +#define NUM_SN 1187 static const unsigned int sn_objs[NUM_SN] = { 364, /* "AD_DVCS" */ 419, /* "AES-128-CBC" */ @@ -2551,6 +2554,7 @@ static const unsigned int sn_objs[NUM_SN] = { 1139, /* "SM4-CTR" */ 1133, /* "SM4-ECB" */ 1135, /* "SM4-OFB" */ + 1196, /* "SM4-XTS" */ 188, /* "SMIME" */ 167, /* "SMIME-CAPS" */ 100, /* "SN" */ @@ -3467,7 +3471,7 @@ static const unsigned int sn_objs[NUM_SN] = { 1093, /* "x509ExtAdmission" */ }; -#define NUM_LN 1186 +#define NUM_LN 1187 static const unsigned int ln_objs[NUM_LN] 
= { 363, /* "AD Time Stamping" */ 405, /* "ANSI X9.62" */ @@ -4609,6 +4613,7 @@ static const unsigned int ln_objs[NUM_LN] = { 1139, /* "sm4-ctr" */ 1133, /* "sm4-ecb" */ 1135, /* "sm4-ofb" */ + 1196, /* "sm4-xts" */ 16, /* "stateOrProvinceName" */ 660, /* "streetAddress" */ 498, /* "subtreeMaximumQuality" */ @@ -4657,7 +4662,7 @@ static const unsigned int ln_objs[NUM_LN] = { 125, /* "zlib compression" */ }; -#define NUM_OBJ 1071 +#define NUM_OBJ 1072 static const unsigned int obj_objs[NUM_OBJ] = { 0, /* OBJ_undef 0 */ 181, /* OBJ_iso 1 */ @@ -5124,6 +5129,7 @@ static const unsigned int obj_objs[NUM_OBJ] = { 1136, /* OBJ_sm4_cfb1 1 2 156 10197 1 104 5 */ 1138, /* OBJ_sm4_cfb8 1 2 156 10197 1 104 6 */ 1139, /* OBJ_sm4_ctr 1 2 156 10197 1 104 7 */ + 1196, /* OBJ_sm4_xts 1 2 156 10197 1 104 10 */ 1172, /* OBJ_sm2 1 2 156 10197 1 301 */ 1143, /* OBJ_sm3 1 2 156 10197 1 401 */ 1144, /* OBJ_sm3WithRSAEncryption 1 2 156 10197 1 504 */ diff --git a/crypto/objects/obj_mac.num b/crypto/objects/obj_mac.num index 1b6a9c61a1..d1de6e1997 100644 --- a/crypto/objects/obj_mac.num +++ b/crypto/objects/obj_mac.num @@ -1192,3 +1192,4 @@ magma_cfb 1191 magma_mac 1192 hmacWithSHA512_224 1193 hmacWithSHA512_256 1194 +sm4_xts 1196 diff --git a/crypto/objects/objects.txt b/crypto/objects/objects.txt index c49d4c568b..14495f2ad8 100644 --- a/crypto/objects/objects.txt +++ b/crypto/objects/objects.txt @@ -1518,6 +1518,7 @@ sm-scheme 104 4 : SM4-CFB : sm4-cfb sm-scheme 104 5 : SM4-CFB1 : sm4-cfb1 sm-scheme 104 6 : SM4-CFB8 : sm4-cfb8 sm-scheme 104 7 : SM4-CTR : sm4-ctr +sm-scheme 104 10 : SM4-XTS : sm4-xts # There is no OID that just denotes "HMAC" oddly enough... diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl new file mode 100644 index 0000000000..86a6f89f52 --- /dev/null +++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl @@ -0,0 +1,1173 @@ +#! /usr/bin/env perl +# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# This module implements SM4 with ASIMD and AESE on AARCH64 +# +# Feb 2022 +# + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$prefix="vpsm4_ex"; +my ($inp,$outp,$rks1,$rks2,$ivp,$enc)=("x0","x1","x3","x4","x5","x6"); +my ($blocks,$len)=("x2","x2"); +my $remain=("x7"); +my ($ptr,$counter)=("x12","w13"); +my ($wtmp0,$wtmp1,$wtmp2,$wtmp3)=("w8","w9","w10","w11"); +my ($xtmp0,$xtmp1,$xtmp2,$xtmp3)=("x8","x9","x10","x11"); +my ($word0,$word1,$word2,$word3)=("w14","w15","w16","w17"); +my @twx=map("x$_",(14..29)); +my $lastBlk=("x26"); + +my @tweak=map("v$_",(0..7)); +my @qtmp=map("q$_",(8..11)); +my @vtmp=map("v$_",(8..11)); +my ($rk0,$rk1)=("v12","v13"); +my ($rka,$rkb)=("v14","v15"); +my @data=map("v$_",(16..19)); +my @datax=map("v$_",(20..23)); +my ($vtmp4,$vtmp5)=("v24","v25"); +my $lastTweak=("v25"); +my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31"); +my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31"); + +sub rev32() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifndef __ARMEB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifndef __ARMEB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub rev32_armeb() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifdef __ARMEB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifdef __ARMEB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub transpose() { + my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; + +$code.=<<___; + zip1 $vt0.4s,$dat0.4s,$dat1.4s + zip2 $vt1.4s,$dat0.4s,$dat1.4s + zip1 $vt2.4s,$dat2.4s,$dat3.4s + zip2 $vt3.4s,$dat2.4s,$dat3.4s + zip1 $dat0.2d,$vt0.2d,$vt2.2d + zip2 $dat1.2d,$vt0.2d,$vt2.2d + zip1 $dat2.2d,$vt1.2d,$vt3.2d + zip2 $dat3.2d,$vt1.2d,$vt3.2d +___ +} + +sub load_sbox_matrix () { +$code.=<<___; + ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00 + ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00 + ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923 + ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300 + ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b + ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +___ +} +# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) +sub mul_matrix() { + my $x = shift; + my $higherMat = shift; + my $lowerMat = shift; + my $tmp = shift; +$code.=<<___; + ushr $tmp.16b, $x.16b, 4 + and $x.16b, $x.16b, $ANDMaskV.16b + tbl $x.16b, {$lowerMat.16b}, $x.16b + tbl $tmp.16b, {$higherMat.16b}, $tmp.16b + eor $x.16b, $x.16b, $tmp.16b +___ +} + +# sbox operation for one single word +sub sbox_1word () { + my $word = shift; + +$code.=<<___; + mov @vtmp[3].s[0],$word + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); +$code.=<<___; + + mov $wtmp0,@vtmp[0].s[0] + eor $word,$wtmp0,$wtmp0,ror #32-2 + eor $word,$word,$wtmp0,ror #32-10 + eor $word,$word,$wtmp0,ror #32-18 + eor $word,$word,$wtmp0,ror #32-24 +___ +} + +# sbox operation for 4-lane of words +sub sbox() { + my 
$dat = shift; + +$code.=<<___; + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4); +$code.=<<___; + mov $dat.16b,@vtmp[0].16b + + // linear transformation + ushr @vtmp[0].4s,$dat.4s,32-2 + ushr @vtmp[1].4s,$dat.4s,32-10 + ushr @vtmp[2].4s,$dat.4s,32-18 + ushr @vtmp[3].4s,$dat.4s,32-24 + sli @vtmp[0].4s,$dat.4s,2 + sli @vtmp[1].4s,$dat.4s,10 + sli @vtmp[2].4s,$dat.4s,18 + sli @vtmp[3].4s,$dat.4s,24 + eor $vtmp4.16b,@vtmp[0].16b,$dat.16b + eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b + eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b + eor $dat.16b,$dat.16b,$vtmp4.16b +___ +} + +# sbox operation for 8-lane of words +sub sbox_double() { + my $dat = shift; + my $datx = shift; + +$code.=<<___; + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b + tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); + &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4); +$code.=<<___; + eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b + aese @vtmp[0].16b,$vtmp5.16b + aese @vtmp[1].16b,$vtmp5.16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4); + &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4); +$code.=<<___; + mov $dat.16b,@vtmp[0].16b + mov $datx.16b,@vtmp[1].16b + + // linear transformation + ushr @vtmp[0].4s,$dat.4s,32-2 + ushr $vtmp5.4s,$datx.4s,32-2 + ushr @vtmp[1].4s,$dat.4s,32-10 + ushr @vtmp[2].4s,$dat.4s,32-18 + ushr @vtmp[3].4s,$dat.4s,32-24 + sli @vtmp[0].4s,$dat.4s,2 + sli $vtmp5.4s,$datx.4s,2 + sli @vtmp[1].4s,$dat.4s,10 + sli @vtmp[2].4s,$dat.4s,18 + sli @vtmp[3].4s,$dat.4s,24 + eor $vtmp4.16b,@vtmp[0].16b,$dat.16b + eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b + eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b + eor $dat.16b,$dat.16b,$vtmp4.16b + ushr @vtmp[1].4s,$datx.4s,32-10 + ushr @vtmp[2].4s,$datx.4s,32-18 + ushr @vtmp[3].4s,$datx.4s,32-24 + sli @vtmp[1].4s,$datx.4s,10 + sli @vtmp[2].4s,$datx.4s,18 + sli @vtmp[3].4s,$datx.4s,24 + eor $vtmp4.16b,$vtmp5.16b,$datx.16b + eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b + eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b + eor $datx.16b,$datx.16b,$vtmp4.16b +___ +} + +# sm4 for one block of data, in scalar registers word0/word1/word2/word3 +sub sm4_1blk () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ + eor $wtmp3,$word2,$word3 + eor $wtmp2,$wtmp0,$word1 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + eor $word0,$word0,$wtmp3 + /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ + eor $wtmp3,$word2,$word3 + eor $wtmp2,$word0,$wtmp1 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor $word1,$word1,$wtmp3 + /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ + eor $wtmp3,$word0,$word1 + eor $wtmp2,$wtmp0,$word3 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + eor $word2,$word2,$wtmp3 + /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ + eor $wtmp3,$word0,$word1 + eor $wtmp2,$word2,$wtmp1 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + eor $word3,$word3,$wtmp3 +___ +} + +# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 +sub sm4_4blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ + eor 
$rka.16b,@data[2].16b,@data[3].16b + eor $rk0.16b,@data[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + + /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ + eor $rka.16b,$rka.16b,@data[0].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk1.16b + + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rk0.16b,@data[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + + /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ + eor $rka.16b,$rka.16b,@data[2].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk1.16b +___ +} + +# sm4 for 8 lanes of data, in neon registers +# data0/data1/data2/data3 datax0/datax1/datax2/datax3 +sub sm4_8blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[2].16b,@data[3].16b + eor $rkb.16b,@datax[2].16b,@datax[3].16b + eor @vtmp[0].16b,@data[1].16b,$rk0.16b + eor @vtmp[1].16b,@datax[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + eor @datax[0].16b,@datax[0].16b,$rk1.16b + + /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[0].16b + eor $rkb.16b,$rkb.16b,@datax[0].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk0.16b + eor @datax[1].16b,@datax[1].16b,$rk1.16b + + /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rkb.16b,@datax[0].16b,@datax[1].16b + eor @vtmp[0].16b,@data[3].16b,$rk0.16b + eor @vtmp[1].16b,@datax[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + eor @datax[2].16b,@datax[2].16b,$rk1.16b + + /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[2].16b + eor $rkb.16b,$rkb.16b,@datax[2].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk0.16b + eor @datax[3].16b,@datax[3].16b,$rk1.16b +___ +} + +sub encrypt_1blk_norev() { + my $dat = shift; + my $rks = shift; +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 + mov $word0,$dat.s[0] + mov $word1,$dat.s[1] + mov $word2,$dat.s[2] + mov $word3,$dat.s[3] +10: +___ + &sm4_1blk($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b + mov $dat.s[0],$word3 + mov $dat.s[1],$word2 + mov $dat.s[2],$word1 + mov $dat.s[3],$word0 +___ +} + +sub encrypt_1blk() { + my $dat = shift; + my $rks = shift; + + &encrypt_1blk_norev($dat,$rks); + &rev32($dat,$dat); +} + +sub encrypt_4blks() { +$code.=<<___; + mov $ptr,$rks1 + mov $counter,#8 +10: +___ + &sm4_4blks($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); +} + +sub encrypt_8blks() { + my $rks = shift; +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 +10: +___ + &sm4_8blks($ptr); 
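+# Each sm4_8blks() call consumes four round keys (two "ldp" loads of two words
+# each), so the counter of 8 set above walks all 32 SM4 rounds for both
+# four-block groups.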
+$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); + &rev32(@data[3],@datax[0]); + &rev32(@data[2],@datax[1]); + &rev32(@data[1],@datax[2]); + &rev32(@data[0],@datax[3]); +} + +sub mov_reg_to_vec() { + my $src0 = shift; + my $src1 = shift; + my $desv = shift; +$code.=<<___; + mov $desv.d[0],$src0 + mov $desv.d[1],$src1 +#ifdef __ARMEB__ + rev32 $desv.16b,$desv.16b +#endif +___ +} + +sub mov_vec_to_reg() { + my $srcv = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $des0,$srcv.d[0] + mov $des1,$srcv.d[1] +___ +} + +sub compute_tweak() { + my $src0 = shift; + my $src1 = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $wtmp0,0x87 + extr $xtmp2,$src1,$src1,#32 + extr $des1,$src1,$src0,#63 + and $wtmp1,$wtmp0,$wtmp2,asr#31 + eor $des0,$xtmp1,$src0,lsl#1 +___ +} + +sub compute_tweak_vec() { + my $src = shift; + my $des = shift; + &rbit(@vtmp[2],$src); +$code.=<<___; + ldr @qtmp[0], =0x01010101010101010101010101010187 + shl $des.16b, @vtmp[2].16b, #1 + ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 + ushr @vtmp[1].16b, @vtmp[1].16b, #7 + mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b + eor $des.16b, $des.16b, @vtmp[1].16b +___ + &rbit($des,$des); +} + +sub mov_en_to_enc(){ + my $en = shift; + if ($en eq "en") { +$code.=<<___; + mov $enc,1 +___ + } else { +$code.=<<___; + mov $enc,0 +___ + } +} + +sub rbit() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { + if ($standard eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } else { +$code.=<<___; + mov $dst.16b,$src.16b +___ + } + } else { + if ($standard eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } + } +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8-a+crypto +.text + +.type ${prefix}_consts,%object +.align 7 +${prefix}_consts: +.Lck: + .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 + .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 + .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 + .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 + .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 + .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 + .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: + .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +.Lshuffles: + .long 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x03020100 + +.size ${prefix}_consts,.-${prefix}_consts +___ + +{{{ +my ($userKey,$roundKey,$enc)=("x0","x1","w2"); +my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); +my ($vkey,$vfk,$vmap)=("v5","v6","v7"); +$code.=<<___; +.type ${prefix}_set_key,%function +.align 4 +${prefix}_set_key: + ld1 {$vkey.4s},[$userKey] +___ + &load_sbox_matrix(); + &rev32($vkey,$vkey); +$code.=<<___; + adr $pointer,.Lshuffles + ld1 {$vmap.4s},[$pointer] + adr $pointer,.Lfk + ld1 {$vfk.4s},[$pointer] + eor $vkey.16b,$vkey.16b,$vfk.16b + mov $schedules,#32 + adr $pointer,.Lck + movi @vtmp[0].16b,#64 + cbnz $enc,1f + add $roundKey,$roundKey,124 +1: + mov $wtmp,$vkey.s[1] + ldr $roundkey,[$pointer],#4 + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[2] + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[3] + eor $roundkey,$roundkey,$wtmp + + // optimize sbox using AESE instruction + mov @data[0].s[0],$roundkey + tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); +$code.=<<___; + eor @vtmp[1].16b, 
@vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); +$code.=<<___; + mov $wtmp,@vtmp[0].s[0] + + // linear transformation + eor $roundkey,$wtmp,$wtmp,ror #19 + eor $roundkey,$roundkey,$wtmp,ror #9 + mov $wtmp,$vkey.s[0] + eor $roundkey,$roundkey,$wtmp + mov $vkey.s[0],$roundkey + cbz $enc,2f + str $roundkey,[$roundKey],#4 + b 3f +2: + str $roundkey,[$roundKey],#-4 +3: + tbl $vkey.16b,{$vkey.16b},$vmap.16b + subs $schedules,$schedules,#1 + b.ne 1b + ret +.size ${prefix}_set_key,.-${prefix}_set_key +___ +}}} + + +{{{ +$code.=<<___; +.type ${prefix}_enc_4blks,%function +.align 4 +${prefix}_enc_4blks: +___ + &encrypt_4blks(); +$code.=<<___; + ret +.size ${prefix}_enc_4blks,.-${prefix}_enc_4blks +___ +}}} + +{{{ +$code.=<<___; +.type ${prefix}_enc_8blks,%function +.align 4 +${prefix}_enc_8blks: +___ + &encrypt_8blks($rks1); +$code.=<<___; + ret +.size ${prefix}_enc_8blks,.-${prefix}_enc_8blks +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: + stp x29,x30,[sp,#-16]! + mov w2,1 + bl ${prefix}_set_key + ldp x29,x30,[sp],#16 + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: + stp x29,x30,[sp,#-16]! + mov w2,0 + bl ${prefix}_set_key + ldp x29,x30,[sp],#16 + ret +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} + + +{{{ + +$code.=<<___; +.globl ${prefix}_ecb_encrypt +.type ${prefix}_ecb_encrypt,%function +.align 5 +${prefix}_ecb_encrypt: + stp d8,d9,[sp,#-0x10]! + stp d10,d11,[sp,#-0x10]! + stp d12,d13,[sp,#-0x10]! + stp d14,d15,[sp,#-0x10]! + stp x16,x17,[sp,#-0x10]! + stp x29,x30,[sp,#-0x10]! 
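+    // The constants loaded next (v26-v31) are the byte-shuffle mask, the
+    // affine-transform matrices and the nibble mask that let the SM4 S-box be
+    // evaluated through the AES S-box (AESE with a zero round key), as in the
+    // sbox helpers above.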
+___ + &load_sbox_matrix(); +$code.=<<___; + // convert length into blocks + lsr x2,x2,4 +.Lecb_8_blocks_process: + cmp $blocks,#8 + b.lt .Lecb_4_blocks_process + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 + ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],@datax[3]); +$code.=<<___; + bl ${prefix}_enc_8blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lecb_8_blocks_process + b 100f +.Lecb_4_blocks_process: + cmp $blocks,#4 + b.lt 1f + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl ${prefix}_enc_4blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp] +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0],$rks1); +$code.=<<___; + st1 {@data[0].4s},[$outp] + b 100f +1: // process last 2 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 + cmp $blocks,#2 + b.gt 1f +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl ${prefix}_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] + b 100f +1: // process last 3 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl ${prefix}_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] +100: + ldp x29,x30,[sp],#0x10 + ldp x16,x17,[sp],#0x10 + ldp d14,d15,[sp],#0x10 + ldp d12,d13,[sp],#0x10 + ldp d10,d11,[sp],#0x10 + ldp d8,d9,[sp],#0x10 + ret +.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt +___ +}}} + +{{{ +sub gen_xts_do_cipher() { +$code.=<<___; +.globl ${prefix}_xts_do_cipher${standard} +.type ${prefix}_xts_do_cipher${standard},%function +.align 5 +${prefix}_xts_do_cipher${standard}: + stp x29,x30,[sp,#-16]! 
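+    // The IV loaded below is encrypted with the second key schedule (key2) to
+    // form the initial tweak; subsequent tweaks are multiplications by x in
+    // GF(2^128).  For the GB/T 17964-2021 variant the tweak is bit-reversed
+    // (rbit) around that multiplication, matching the 0xe1-style reduction
+    // used by the C fallback in xts128gb.c.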
+ ld1 {@tweak[0].4s}, [$ivp] +___ + &load_sbox_matrix(); + &rev32(@tweak[0],@tweak[0]); + &encrypt_1blk(@tweak[0],$rks2); +$code.=<<___; + and $remain,$len,#0x0F + // convert length into blocks + lsr $blocks,$len,4 + cmp $blocks,#1 + b.lt .return${standard} + + cmp $remain,0 + // If the encryption/decryption Length is N times of 16, + // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${standard} + b.eq .xts_encrypt_blocks${standard} + + // If the encryption/decryption length is not N times of 16, + // the last two blocks are encrypted/decrypted in .last_2blks_tweak${standard} or .only_2blks_tweak${standard} + // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${standard} + subs $blocks,$blocks,#1 + b.eq .only_2blks_tweak${standard} +.xts_encrypt_blocks${standard}: +___ + &rbit(@tweak[0],@tweak[0]); + &rev32_armeb(@tweak[0],@tweak[0]); + &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; +.Lxts_8_blocks_process${standard}: + cmp $blocks,#8 +___ + &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); + &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); + &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); + &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); + &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); + &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); + &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); + &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); + &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; + b.lt .Lxts_4_blocks_process${standard} + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); + &rbit(@tweak[3],@tweak[3]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b + ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ + &rbit(@tweak[4],@tweak[4]); + &rbit(@tweak[5],@tweak[5]); + &rbit(@tweak[6],@tweak[6]); + &rbit(@tweak[7],@tweak[7]); +$code.=<<___; + eor @datax[0].16b, @datax[0].16b, @tweak[4].16b + eor @datax[1].16b, @datax[1].16b, @tweak[5].16b + eor @datax[2].16b, @datax[2].16b, @tweak[6].16b + eor @datax[3].16b, @datax[3].16b, @tweak[7].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],@datax[3]); + &transpose(@data,@vtmp); + &transpose(@datax,@vtmp); +$code.=<<___; + bl ${prefix}_enc_8blks +___ + &transpose(@vtmp,@datax); + &transpose(@data,@datax); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, 
@vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + eor @data[0].16b, @data[0].16b, @tweak[4].16b + eor @data[1].16b, @data[1].16b, @tweak[5].16b + eor @data[2].16b, @data[2].16b, @tweak[6].16b + eor @data[3].16b, @data[3].16b, @tweak[7].16b + + // save the last tweak + mov $lastTweak.16b,@tweak[7].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lxts_8_blocks_process${standard} + b 100f +.Lxts_4_blocks_process${standard}: + cmp $blocks,#4 + b.lt 1f + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); + &rbit(@tweak[3],@tweak[3]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &transpose(@data,@vtmp); +$code.=<<___; + bl ${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 + mov @tweak[0].16b,@tweak[4].16b + mov @tweak[1].16b,@tweak[5].16b + mov @tweak[2].16b,@tweak[6].16b + // save the last tweak + mov $lastTweak.16b,@tweak[3].16b +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp],#16 +___ + &rbit(@tweak[0],@tweak[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0],$rks1); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + st1 {@data[0].4s},[$outp],#16 + // save the last tweak + mov $lastTweak.16b,@tweak[0].16b + b 100f +1: // process last 2 blocks + cmp $blocks,#2 + b.gt 1f + ld1 {@data[0].4s,@data[1].4s},[$inp],#32 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &transpose(@data,@vtmp); +$code.=<<___; + bl ${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save the last tweak + mov $lastTweak.16b,@tweak[1].16b + b 100f +1: // process last 3 blocks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &transpose(@data,@vtmp); +$code.=<<___; + bl ${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 + // save the last tweak + mov 
$lastTweak.16b,@tweak[2].16b +100: + cmp $remain,0 + b.eq .return${standard} + +// This brance calculates the last two tweaks, +// while the encryption/decryption length is larger than 32 +.last_2blks_tweak${standard}: +___ + &rev32_armeb($lastTweak,$lastTweak); + &compute_tweak_vec($lastTweak,@tweak[1]); + &compute_tweak_vec(@tweak[1],@tweak[2]); +$code.=<<___; + b .check_dec${standard} + + +// This brance calculates the last two tweaks, +// while the encryption/decryption length is equal to 32, who only need two tweaks +.only_2blks_tweak${standard}: + mov @tweak[1].16b,@tweak[0].16b +___ + &rev32_armeb(@tweak[1],@tweak[1]); + &compute_tweak_vec(@tweak[1],@tweak[2]); +$code.=<<___; + b .check_dec${standard} + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. +.check_dec${standard}: + // encryption:1 decryption:0 + cmp $enc,1 + b.eq .prcess_last_2blks${standard} + mov @vtmp[0].16B,@tweak[1].16b + mov @tweak[1].16B,@tweak[2].16b + mov @tweak[2].16B,@vtmp[0].16b + +.prcess_last_2blks${standard}: +___ + &rev32_armeb(@tweak[1],@tweak[1]); + &rev32_armeb(@tweak[2],@tweak[2]); +$code.=<<___; + ld1 {@data[0].4s},[$inp],#16 + eor @data[0].16b, @data[0].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0],$rks1); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[1].16b + st1 {@data[0].4s},[$outp],#16 + + sub $lastBlk,$outp,16 + .loop${standard}: + subs $remain,$remain,1 + ldrb $wtmp0,[$lastBlk,$remain] + ldrb $wtmp1,[$inp,$remain] + strb $wtmp1,[$lastBlk,$remain] + strb $wtmp0,[$outp,$remain] + b.gt .loop${standard} + ld1 {@data[0].4s}, [$lastBlk] + eor @data[0].16b, @data[0].16b, @tweak[2].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0],$rks1); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[2].16b + st1 {@data[0].4s}, [$lastBlk] +.return${standard}: + ldp x29,x30,[sp],#16 + ret +.size ${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard} +___ +} #end of gen_xts_do_cipher + +}}} + +{{{ +sub gen_xts_cipher() { + my $en = shift; + +$code.=<<___; +.globl ${prefix}_xts_${en}crypt${standard} +.type ${prefix}_xts_${en}crypt${standard},%function +.align 5 +${prefix}_xts_${en}crypt${standard}: + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! 
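+    // Thin wrapper: spill the registers clobbered by the tweak pipeline
+    // (x15-x30, d8-d15), set the enc flag (1 = encrypt, 0 = decrypt) and
+    // branch to the shared ${prefix}_xts_do_cipher${standard} body.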
+___
+    &mov_en_to_enc($en);
+$code.=<<___;
+    bl ${prefix}_xts_do_cipher${standard}
+    ldp d14, d15, [sp], #0x10
+    ldp d12, d13, [sp], #0x10
+    ldp d10, d11, [sp], #0x10
+    ldp d8, d9, [sp], #0x10
+    ldp x29, x30, [sp], #0x10
+    ldp x27, x28, [sp], #0x10
+    ldp x25, x26, [sp], #0x10
+    ldp x23, x24, [sp], #0x10
+    ldp x21, x22, [sp], #0x10
+    ldp x19, x20, [sp], #0x10
+    ldp x17, x18, [sp], #0x10
+    ldp x15, x16, [sp], #0x10
+    ret
+.size ${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard}
+___
+
+} # end of gen_xts_cipher
+$standard="_gb";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+$standard="";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+}}}
+
+########################################
+open SELF,$0;
+while() {
+    next if (/^#!/);
+    last if (!s/^#/\/\// and !/^$/);
+    print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+    s/\`([^\`]*)\`/eval($1)/ge;
+    print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
index b65a7d149e..bb042c5792 100644
--- a/crypto/sm4/build.info
+++ b/crypto/sm4/build.info
@@ -1,4 +1,7 @@
 LIBS=../../libcrypto
 SOURCE[../../libcrypto]=\
-        sm4.c
+        sm4.c {- $target{sm4_asm_src} -}
+
+GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl $(PERLASM_SCHEME)
+INCLUDE[vpsm4_ex-armv8.o]=..
\ No newline at end of file
diff --git a/doc/man3/EVP_sm4_xts.pod b/doc/man3/EVP_sm4_xts.pod
new file mode 100644
index 0000000000..09ca3fb341
--- /dev/null
+++ b/doc/man3/EVP_sm4_xts.pod
@@ -0,0 +1,67 @@
+=pod
+
+=head1 NAME
+
+EVP_sm4_xts,
+- EVP SM4 cipher
+
+=head1 SYNOPSIS
+
+ #include 
+
+ const EVP_CIPHER *EVP_sm4_xts(void);
+
+=head1 DESCRIPTION
+
+The XTS mode of operation (GB/T 17964-2021) for the SM4 block cipher.
+
+=over 4
+
+=item EVP_sm4_xts(),
+
+The SM4 block cipher in XTS mode. This mode uses a key length of 256 bits and acts on blocks of 128 bits.
+
+The B parameter to L or L is the first XTS "tweak" value. XTS mode has two ways of calculating the subsequent tweak values: one is standardized in IEEE Std 1619-2007 and has been widely used (e.g., XTS-AES); the other was standardized more recently (GB/T 17964-2021, in effect since May 2022) and is currently used only with SM4.
+
+Given identical inputs (B, B, and B), the two standards derive different subsequent tweak values. As a result, the first ciphertext block is identical, but any subsequent ciphertext blocks differ.
+
+By default, EVP_sm4_xts follows GB/T 17964-2021; this can be changed with EVP_CIPHER_CTX_ctrl. The following ctrl is supported in XTS mode for SM4.
+
+=over 4
+
+=item EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, std, NULL)
+
+Sets the standard followed by EVP_sm4_xts to B. This must be 0 or 1: 0 selects the XTS mode of GB/T 17964-2021, 1 selects the XTS mode of IEEE Std 1619-2007.
+
+=back
+
+The XTS implementation in OpenSSL does not support streaming. That is, there must
+be only one L call per L call (and
+similarly with the "Decrypt" functions).
+
+=back
+
+=head1 RETURN VALUES
+
+These functions return a B structure that contains the
+implementation of the symmetric cipher. See L for
+details of the B structure.
+
+=head1 SEE ALSO
+
+L,
+L,
+L
+
+=head1 COPYRIGHT
+
+Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+Copyright 2022 Ribose Inc. All Rights Reserved.
+
+Licensed under the OpenSSL license (the "License"). You may not use
+this file except in compliance with the License.
You can obtain a copy +in the file LICENSE in the source distribution or at +L. + +=cut + diff --git a/fuzz/oids.txt b/fuzz/oids.txt index eda55e4e79..a3eaa721a4 100644 --- a/fuzz/oids.txt +++ b/fuzz/oids.txt @@ -1063,3 +1063,4 @@ OBJ_id_tc26_gost_3410_2012_256_paramSetC="\x2A\x85\x03\x07\x01\x02\x01\x01\x03" OBJ_id_tc26_gost_3410_2012_256_paramSetD="\x2A\x85\x03\x07\x01\x02\x01\x01\x04" OBJ_hmacWithSHA512_224="\x2A\x86\x48\x86\xF7\x0D\x02\x0C" OBJ_hmacWithSHA512_256="\x2A\x86\x48\x86\xF7\x0D\x02\x0D" +OBJ_sm4_xts="\x2A\x81\x1C\xCF\x55\x01\x68\x0A" diff --git a/include/openssl/evp.h b/include/openssl/evp.h index a411f3f2f9..d11e5ae12c 100644 --- a/include/openssl/evp.h +++ b/include/openssl/evp.h @@ -353,6 +353,9 @@ int (*EVP_CIPHER_meth_get_ctrl(const EVP_CIPHER *cipher))(EVP_CIPHER_CTX *, # define EVP_CTRL_GET_IVLEN 0x25 +/* Set the XTS mode standard, SM4 only */ +# define EVP_CTRL_XTS_STANDARD 0x26 + /* Padding modes */ #define EVP_PADDING_PKCS7 1 #define EVP_PADDING_ISO7816_4 2 @@ -937,6 +940,7 @@ const EVP_CIPHER *EVP_sm4_cfb128(void); # define EVP_sm4_cfb EVP_sm4_cfb128 const EVP_CIPHER *EVP_sm4_ofb(void); const EVP_CIPHER *EVP_sm4_ctr(void); +const EVP_CIPHER *EVP_sm4_xts(void); # endif # if OPENSSL_API_COMPAT < 0x10100000L diff --git a/include/openssl/modes.h b/include/openssl/modes.h index d544f98d55..dea324f80b 100644 --- a/include/openssl/modes.h +++ b/include/openssl/modes.h @@ -22,6 +22,10 @@ typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], int enc); +typedef void (*ecb128_f) (const unsigned char *in, unsigned char *out, + size_t len, const void *key, + int enc); + typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out, size_t blocks, const void *key, const unsigned char ivec[16]); @@ -153,6 +157,11 @@ int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char *inp, unsigned char *out, size_t len, int enc); +int CRYPTO_xts128gb_encrypt(const XTS128_CONTEXT *ctx, + const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, + size_t len, int enc); + size_t CRYPTO_128_wrap(void *key, const unsigned char *iv, unsigned char *out, const unsigned char *in, size_t inlen, diff --git a/include/openssl/obj_mac.h b/include/openssl/obj_mac.h index 53516a06c6..9c89f77411 100644 --- a/include/openssl/obj_mac.h +++ b/include/openssl/obj_mac.h @@ -4767,6 +4767,11 @@ #define NID_sm4_ctr 1139 #define OBJ_sm4_ctr OBJ_sm_scheme,104L,7L +#define SN_sm4_xts "SM4-XTS" +#define LN_sm4_xts "sm4-xts" +#define NID_sm4_xts 1196 +#define OBJ_sm4_xts OBJ_sm_scheme,104L,10L + #define SN_hmac "HMAC" #define LN_hmac "hmac" #define NID_hmac 855 diff --git a/test/evp_test.c b/test/evp_test.c index 62f20ece37..3c65ce9ad4 100644 --- a/test/evp_test.c +++ b/test/evp_test.c @@ -485,6 +485,8 @@ typedef struct cipher_data_st { unsigned char *tag; size_t tag_len; int tag_late; + /* SM4 XTS only */ + int std; } CIPHER_DATA; static int cipher_test_init(EVP_TEST *t, const char *alg) @@ -568,6 +570,15 @@ static int cipher_test_parse(EVP_TEST *t, const char *keyword, return -1; return 1; } + if (strcmp(keyword, "Standard") == 0) { + if (strcmp(value, "GB") == 0) + cdat->std = 0; + else if (strcmp(value, "IEEE") == 0) + cdat->std = 1; + else + return -1; + return 1; + } return 0; } @@ -707,7 +718,11 @@ static int cipher_test_enc(EVP_TEST *t, int enc, goto err; } } - + if (expected->std) { + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, expected->std, NULL)) { + goto err; + }; + } 
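/*
 * Illustrative sketch, not part of this patch: one way an application could
 * one-shot encrypt with the new EVP_sm4_xts() cipher and pick the tweak
 * standard via EVP_CTRL_XTS_STANDARD (0 = GB/T 17964-2021, the default;
 * 1 = IEEE Std 1619-2007).  The helper name and the minimal error handling
 * are illustrative assumptions.
 */
#include <openssl/evp.h>

static int sm4_xts_encrypt_oneshot(const unsigned char key[32], /* two 128-bit halves */
                                   const unsigned char iv[16],  /* first tweak */
                                   const unsigned char *in, int inlen, /* inlen >= 16 */
                                   unsigned char *out, int use_ieee_std)
{
    EVP_CIPHER_CTX *c = EVP_CIPHER_CTX_new();
    int outl = 0, ok = 0;

    if (c == NULL)
        return 0;
    ok = EVP_EncryptInit_ex(c, EVP_sm4_xts(), NULL, key, iv)
         && EVP_CIPHER_CTX_ctrl(c, EVP_CTRL_XTS_STANDARD,
                                use_ieee_std ? 1 : 0, NULL)
         /* XTS does not stream: exactly one Update call, no padding involved */
         && EVP_EncryptUpdate(c, out, &outl, in, inlen);
    EVP_CIPHER_CTX_free(c);
    return ok;
}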
EVP_CIPHER_CTX_set_padding(ctx, 0); t->err = "CIPHERUPDATE_ERROR"; tmplen = 0; diff --git a/test/recipes/30-test_evp_data/evpciph.txt b/test/recipes/30-test_evp_data/evpciph.txt index 8480ddee0b..ae327838d9 100644 --- a/test/recipes/30-test_evp_data/evpciph.txt +++ b/test/recipes/30-test_evp_data/evpciph.txt @@ -2181,6 +2181,28 @@ IV = 0123456789ABCDEFFEDCBA9876543210 Plaintext = AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDEEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFEEEEEEEEEEEEEEEEAAAAAAAAAAAAAAAA Ciphertext = C2B4759E78AC3CF43D0852F4E8D5F9FD7256E8A5FCB65A350EE00630912E44492A0B17E1B85B060D0FBA612D8A95831638B361FD5FFACD942F081485A83CA35D +Title = SM4 XTS test vectors, the XTS mode is standardized in GB/T 17964-2021 by default +Cipher = SM4-XTS +Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F +IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF +Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 +Ciphertext = E9538251C71D7B80BBE4483FEF497BD12C5C581BD6242FC51E08964FB4F60FDB0BA42F63499279213D318D2C11F6886E903BE7F93A1B3479 + +Title = SM4 test vectors for XTS mode in GB/T 17964-2021 and IEEE Std 1619-2007 +Cipher = SM4-XTS +Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F +IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF +Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 +Ciphertext = E9538251C71D7B80BBE4483FEF497BD12C5C581BD6242FC51E08964FB4F60FDB0BA42F63499279213D318D2C11F6886E903BE7F93A1B3479 +Standard = GB + +Cipher = SM4-XTS +Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F +IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF +Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 +Ciphertext = E9538251C71D7B80BBE4483FEF497BD1B3DB1A3E60408C575D63FF7DB39F83260869F9E2585FEC9F0B863BF8FD784B8627D16C0DB6D2CFC7 +Standard = IEEE + Title = ARIA test vectors from RFC5794 (and others) Cipher = ARIA-128-ECB diff --git a/util/libcrypto.num b/util/libcrypto.num index 436f799bca..797dac999e 100644 --- a/util/libcrypto.num +++ b/util/libcrypto.num @@ -4591,3 +4591,5 @@ X509_ALGOR_copy 4544 1_1_1h EXIST::FUNCTION: X509_REQ_set0_signature 4545 1_1_1h EXIST::FUNCTION: X509_REQ_set1_signature_algo 4546 1_1_1h EXIST::FUNCTION: EC_KEY_decoded_from_explicit_params 4547 1_1_1h EXIST::FUNCTION:EC +EVP_sm4_xts 4548 1_1_1x EXIST::FUNCTION:SM4 +CRYPTO_xts128gb_encrypt 4549 1_1_1x EXIST::FUNCTION: -- Gitee From ca66a69f05df681cd951f3244c02728d43868e49 Mon Sep 17 00:00:00 2001 From: dumbdog Date: Thu, 14 Sep 2023 11:34:30 +0800 Subject: [PATCH 2/5] SM3 acceleration with SM3 hardware instruction on aarch64 This patch contains the following two PRs, 1. SM3 acceleration with SM3 hardware instruction on aarch64 SM3 hardware instruction is optional feature of crypto extension for aarch64. This implementation accelerates SM3 via SM3 instructions. For the platform not supporting SM3 instruction, the original C implementation still works. 
Thanks to AliBaba for testing and reporting the following perf numbers for Yitian710: Benchmark on T-Head Yitian-710 2.75GHz: Before: type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k After (33% - 74% faster): type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k Reviewed-by: Paul Dale Reviewed-by: Tomas Mraz (Merged from https://github.com/openssl/openssl/pull/17454) 2. Fix sm3ss1 translation issue in sm3-armv8.pl Reviewed-by: Tomas Mraz Reviewed-by: Matt Caswell Reviewed-by: Paul Dale (Merged from https://github.com/openssl/openssl/pull/17542) reference: https://gitee.com/src-openeuler/openssl/blob/openEuler-22.03-LTS-Next/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch Signed-off-by: dumbdog --- Configurations/00-base-templates.conf | 1 + Configure | 4 + crypto/arm64cpuid.pl | 7 + crypto/arm_arch.h | 1 + crypto/armcap.c | 10 + crypto/sm3/asm/sm3-armv8.pl | 280 ++++++++++++++++++++++++++ crypto/sm3/build.info | 15 +- crypto/sm3/sm3_local.h | 16 +- 8 files changed, 332 insertions(+), 2 deletions(-) create mode 100644 crypto/sm3/asm/sm3-armv8.pl diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf index 1d3501242b..a67ae65c80 100644 --- a/Configurations/00-base-templates.conf +++ b/Configurations/00-base-templates.conf @@ -322,6 +322,7 @@ my %targets=( poly1305_asm_src=> "poly1305-armv8.S", keccak1600_asm_src => "keccak1600-armv8.S", sm4_asm_src => "vpsm4_ex-armv8.S", + sm3_asm_src => "sm3-armv8.S", }, parisc11_asm => { template => 1, diff --git a/Configure b/Configure index ae82beb67d..2ca710f072 100755 --- a/Configure +++ b/Configure @@ -1420,6 +1420,9 @@ unless ($disabled{asm}) { if ($target{sm4_asm_src} ne "") { push @{$config{lib_defines}}, "VPSM4_EX_ASM"; } + if ($target{sm3_asm_src} ne "") { + push @{$config{lib_defines}}, "SM3_ASM"; + } } my %predefined_C = compiler_predefined($config{CROSS_COMPILE}.$config{CC}); @@ -3376,6 +3379,7 @@ sub print_table_entry "multilib", "build_scheme", "sm4_asm_src", + "sm3_asm_src", ); if ($type eq "TABLE") { diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl index 319927e6c7..1e9b1672e8 100755 --- a/crypto/arm64cpuid.pl +++ b/crypto/arm64cpuid.pl @@ -78,6 +78,13 @@ _armv8_sha512_probe: ret .size _armv8_sha512_probe,.-_armv8_sha512_probe +.globl _armv8_sm3_probe +.type _armv8_sm3_probe,%function +_armv8_sm3_probe: + .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s + ret +.size _armv8_sm3_probe,.-_armv8_sm3_probe + .globl OPENSSL_cleanse .type OPENSSL_cleanse,%function .align 5 diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index 8b7105571d..8839b21204 100644 --- a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -80,5 +80,6 @@ extern unsigned int OPENSSL_armcap_P; # define ARMV8_SHA256 (1<<4) # define ARMV8_PMULL (1<<5) # define ARMV8_SHA512 (1<<6) +# define ARMV8_SM3 (1<<9) #endif diff --git a/crypto/armcap.c b/crypto/armcap.c index 48c5d4d64e..8b2f4a5c3a 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -47,6 +47,7 @@ void _armv8_sha1_probe(void); void _armv8_sha256_probe(void); void _armv8_pmull_probe(void); # ifdef __aarch64__ +void _armv8_sm3_probe(void); void _armv8_sha512_probe(void); # endif uint32_t _armv7_tick(void); @@ -130,6 +131,7 @@ static unsigned long getauxval(unsigned long key) # define HWCAP_CE_PMULL (1 << 4) # define HWCAP_CE_SHA1 (1 << 5) # define HWCAP_CE_SHA256 (1 << 6) +# define HWCAP_CE_SM3 
(1 << 18) # define HWCAP_CE_SHA512 (1 << 21) # endif @@ -190,6 +192,9 @@ void OPENSSL_cpuid_setup(void) # ifdef __aarch64__ if (hwcap & HWCAP_CE_SHA512) OPENSSL_armcap_P |= ARMV8_SHA512; + + if (hwcap & HWCAP_CE_SM3) + OPENSSL_armcap_P |= ARMV8_SM3; # endif } # endif @@ -233,6 +238,11 @@ void OPENSSL_cpuid_setup(void) _armv8_sha512_probe(); OPENSSL_armcap_P |= ARMV8_SHA512; } + + if (sigsetjmp(ill_jmp, 1) == 0) { + _armv8_sm3_probe(); + OPENSSL_armcap_P |= ARMV8_SM3; + } # endif } # endif diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl new file mode 100644 index 0000000000..677ca525d6 --- /dev/null +++ b/crypto/sm3/asm/sm3-armv8.pl @@ -0,0 +1,280 @@ +#! /usr/bin/env perl +# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html +# +# This module implements support for Armv8 SM3 instructions + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +# Message expanding: +# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6] +# Input: s0, s1, s2, s3 +# s0 = w0 | w1 | w2 | w3 +# s1 = w4 | w5 | w6 | w7 +# s2 = w8 | w9 | w10 | w11 +# s3 = w12 | w13 | w14 | w15 +# Output: s4 +sub msg_exp () { +my $s0 = shift; +my $s1 = shift; +my $s2 = shift; +my $s3 = shift; +my $s4 = shift; +my $vtmp1 = shift; +my $vtmp2 = shift; +$code.=<<___; + // s4 = w7 | w8 | w9 | w10 + ext $s4.16b, $s1.16b, $s2.16b, #12 + // vtmp1 = w3 | w4 | w5 | w6 + ext $vtmp1.16b, $s0.16b, $s1.16b, #12 + // vtmp2 = w10 | w11 | w12 | w13 + ext $vtmp2.16b, $s2.16b, $s3.16b, #8 + sm3partw1 $s4.4s, $s0.4s, $s3.4s + sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s +___ +} + +# A round of compresson function +# Input: +# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b +# vstate0 - vstate1, store digest status(A - H) +# vconst0 - vconst1, interleaved used to store Tj <<< j +# vtmp - temporary register +# vw - for sm3tt1ab, vw = s0 eor s1 +# s0 - for sm3tt2ab, just be s0 +# i, choose wj' or wj from vw +sub round () { +my $ab = shift; +my $vstate0 = shift; +my $vstate1 = shift; +my $vconst0 = shift; +my $vconst1 = shift; +my $vtmp = shift; +my $vw = shift; +my $s0 = shift; +my $i = shift; +$code.=<<___; + sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s + shl $vconst1.4s, $vconst0.4s, #1 + sri $vconst1.4s, $vconst0.4s, #31 + sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i] + sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i] +___ +} + +sub qround () { +my $ab = shift; +my $vstate0 = shift; +my $vstate1 = shift; +my $vconst0 = shift; +my $vconst1 = shift; +my $vtmp1 = shift; +my $vtmp2 = shift; +my $s0 = shift; +my $s1 = shift; +my $s2 = shift; +my $s3 = shift; +my $s4 = shift; + if($s4) { + &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2); + } +$code.=<<___; + eor $vtmp1.16b, $s0.16b, $s1.16b +___ + &round($ab, $vstate0, $vstate1, $vconst0, 
$vconst1, $vtmp2, + $vtmp1, $s0, 0); + &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2, + $vtmp1, $s0, 1); + &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2, + $vtmp1, $s0, 2); + &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2, + $vtmp1, $s0, 3); +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8.2-a +.text +___ + +{{{ +my ($pstate,$pdata,$num)=("x0","x1","w2"); +my ($state1,$state2)=("v5","v6"); +my ($sconst1, $sconst2)=("s16","s17"); +my ($vconst1, $vconst2)=("v16","v17"); +my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4)); +my ($bkstate1,$bkstate2)=("v18","v19"); +my ($vconst_tmp1,$vconst_tmp2)=("v20","v21"); +my ($vtmp1,$vtmp2)=("v22","v23"); +my $constaddr="x8"; +# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num) +$code.=<<___; +.globl ossl_hwsm3_block_data_order +.type ossl_hwsm3_block_data_order,%function +.align 5 +ossl_hwsm3_block_data_order: + // load state + ld1 {$state1.4s-$state2.4s}, [$pstate] + rev64 $state1.4s, $state1.4s + rev64 $state2.4s, $state2.4s + ext $state1.16b, $state1.16b, $state1.16b, #8 + ext $state2.16b, $state2.16b, $state2.16b, #8 + + adr $constaddr, .Tj + ldp $sconst1, $sconst2, [$constaddr] + +.Loop: + // load input + ld1 {$s0.16b-$s3.16b}, [$pdata], #64 + sub $num, $num, #1 + + mov $bkstate1.16b, $state1.16b + mov $bkstate2.16b, $state2.16b + +#ifndef __ARMEB__ + rev32 $s0.16b, $s0.16b + rev32 $s1.16b, $s1.16b + rev32 $s2.16b, $s2.16b + rev32 $s3.16b, $s3.16b +#endif + + ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4 +___ + &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s0,$s1,$s2,$s3,$s4); + &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s1,$s2,$s3,$s4,$s0); + &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s2,$s3,$s4,$s0,$s1); + &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s3,$s4,$s0,$s1,$s2); + +$code.=<<___; + ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4 +___ + + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s4,$s0,$s1,$s2,$s3); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s0,$s1,$s2,$s3,$s4); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s1,$s2,$s3,$s4,$s0); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s2,$s3,$s4,$s0,$s1); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s3,$s4,$s0,$s1,$s2); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s4,$s0,$s1,$s2,$s3); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s0,$s1,$s2,$s3,$s4); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s1,$s2,$s3,$s4,$s0); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s2,$s3,$s4,$s0,$s1); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s3,$s4); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s4,$s0); + &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, + $s0,$s1); + +$code.=<<___; + eor $state1.16b, $state1.16b, $bkstate1.16b + eor $state2.16b, $state2.16b, $bkstate2.16b + + // any remained blocks? 
+ cbnz $num, .Loop + + // save state + rev64 $state1.4s, $state1.4s + rev64 $state2.4s, $state2.4s + ext $state1.16b, $state1.16b, $state1.16b, #8 + ext $state2.16b, $state2.16b, $state2.16b, #8 + st1 {$state1.4s-$state2.4s}, [$pstate] + ret +.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order + +.align 3 +.Tj: +.word 0x79cc4519, 0x9d8a7a87 +___ +}}} + +######################################### +my %sm3partopcode = ( + "sm3partw1" => 0xce60C000, + "sm3partw2" => 0xce60C400); + +my %sm3ss1opcode = ( + "sm3ss1" => 0xce400000); + +my %sm3ttopcode = ( + "sm3tt1a" => 0xce408000, + "sm3tt1b" => 0xce408400, + "sm3tt2a" => 0xce408800, + "sm3tt2b" => 0xce408C00); + +sub unsm3part { + my ($mnemonic,$arg)=@_; + + $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; +} + +sub unsm3ss1 { + my ($mnemonic,$arg)=@_; + + $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10), + $mnemonic,$arg; +} + +sub unsm3tt { + my ($mnemonic,$arg)=@_; + + $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12), + $mnemonic,$arg; +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge; + s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge; + s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge; + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info index 6009b1949e..e113729834 100644 --- a/crypto/sm3/build.info +++ b/crypto/sm3/build.info @@ -1,2 +1,15 @@ LIBS=../../libcrypto -SOURCE[../../libcrypto]=sm3.c m_sm3.c +SOURCE[../../libcrypto]=\ + sm3.c m_sm3.c {- $target{sm3_asm_src} -} + +GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl $(PERLASM_SCHEME) +INCLUDE[sm3-armv8.o]=.. + +BEGINRAW[Makefile] +##### SM3 assembler implementations + +# GNU make "catch all" +{- $builddir -}/sm3-%.S: {- $sourcedir -}/asm/sm3-%.pl + CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ + +ENDRAW[Makefile] \ No newline at end of file diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h index 7171de510d..aafff63796 100644 --- a/crypto/sm3/sm3_local.h +++ b/crypto/sm3/sm3_local.h @@ -32,7 +32,21 @@ ll=(c)->G; (void)HOST_l2c(ll, (s)); \ ll=(c)->H; (void)HOST_l2c(ll, (s)); \ } while (0) -#define HASH_BLOCK_DATA_ORDER sm3_block_data_order + +#if defined(SM3_ASM) +# if defined(__aarch64__) +# include "crypto/arm_arch.h" +# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3) +void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num); +# endif +#endif + +#if defined(HWSM3_CAPABLE) +# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \ + : sm3_block_data_order) +#else +# define HASH_BLOCK_DATA_ORDER sm3_block_data_order +#endif void sm3_transform(SM3_CTX *c, const unsigned char *data); -- Gitee From cb6db7f92d9d0e0b50f3bc94d7dda79a9b5af8cb Mon Sep 17 00:00:00 2001 From: dumbdog Date: Thu, 14 Sep 2023 11:38:15 +0800 Subject: [PATCH 3/5] SM4 optimization for ARM by HW instruction This patch is a copy of the following PR, with some extra supporting code. 1. 
SM4 optimization for ARM by HW instruction This patch implements the SM4 optimization for ARM processor, using SM4 HW instruction, which is an optional feature of crypto extension for aarch64 V8. Tested on some modern ARM micro-architectures with SM4 support, the performance uplift can be observed around 8X~40X over existing C implementation in openssl. Algorithms that can be parallelized (like CTR, ECB, CBC decryption) are on higher end, with algorithm like CBC encryption on lower end (due to inter-block dependency) Perf data on Yitian-710 2.75GHz hardware, before and after optimization: Before: type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes SM4-CTR 105787.80k 107837.87k 108380.84k 108462.08k 108549.46k 108554.92k SM4-ECB 111924.58k 118173.76k 119776.00k 120093.70k 120264.02k 120274.94k SM4-CBC 106428.09k 109190.98k 109674.33k 109774.51k 109827.41k 109827.41k After (7.4x - 36.6x faster): type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes SM4-CTR 781979.02k 2432994.28k 3437753.86k 3834177.88k 3963715.58k 3974556.33k SM4-ECB 937590.69k 2941689.02k 3945751.81k 4328655.87k 4459181.40k 4468692.31k SM4-CBC 890639.88k 1027746.58k 1050621.78k 1056696.66k 1058613.93k 1058701.31k Signed-off-by: Daniel Hu Reviewed-by: Paul Dale Reviewed-by: Tomas Mraz (Merged from https://github.com/openssl/openssl/pull/17455\) reference: https://gitee.com/src-openeuler/openssl/blob/openEuler-22.03-LTS-Next/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch Signed-off-by: dumbdog --- Configurations/00-base-templates.conf | 2 +- Configure | 3 +- crypto/arm64cpuid.pl | 7 + crypto/arm_arch.h | 1 + crypto/armcap.c | 10 + crypto/evp/e_sm4.c | 88 ++-- crypto/sm4/asm/sm4-armv8.pl | 629 ++++++++++++++++++++++++++ crypto/sm4/build.info | 13 +- include/crypto/sm4_platform.h | 70 +++ 9 files changed, 788 insertions(+), 35 deletions(-) create mode 100644 crypto/sm4/asm/sm4-armv8.pl create mode 100644 include/crypto/sm4_platform.h diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf index a67ae65c80..a26d0810c9 100644 --- a/Configurations/00-base-templates.conf +++ b/Configurations/00-base-templates.conf @@ -321,7 +321,7 @@ my %targets=( chacha_asm_src => "chacha-armv8.S", poly1305_asm_src=> "poly1305-armv8.S", keccak1600_asm_src => "keccak1600-armv8.S", - sm4_asm_src => "vpsm4_ex-armv8.S", + sm4_asm_src => "sm4-armv8.S vpsm4_ex-armv8.S", sm3_asm_src => "sm3-armv8.S", }, parisc11_asm => { diff --git a/Configure b/Configure index 2ca710f072..f7f1919b22 100755 --- a/Configure +++ b/Configure @@ -1418,7 +1418,8 @@ unless ($disabled{asm}) { push @{$config{lib_defines}}, "POLY1305_ASM"; } if ($target{sm4_asm_src} ne "") { - push @{$config{lib_defines}}, "VPSM4_EX_ASM"; + push @{$config{lib_defines}}, "SM4_ASM" if ($target{sm4_asm_src} =~ m/sm4/); + push @{$config{lib_defines}}, "VPSM4_EX_ASM" if ($target{sm4_asm_src} =~ m/vpsm4_ex/); } if ($target{sm3_asm_src} ne "") { push @{$config{lib_defines}}, "SM3_ASM"; diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl index 1e9b1672e8..341167b570 100755 --- a/crypto/arm64cpuid.pl +++ b/crypto/arm64cpuid.pl @@ -71,6 +71,13 @@ _armv8_pmull_probe: ret .size _armv8_pmull_probe,.-_armv8_pmull_probe +.globl _armv8_sm4_probe +.type _armv8_sm4_probe,%function +_armv8_sm4_probe: + .long 0xcec08400 // sm4e v0.4s, v0.4s + ret +.size _armv8_sm4_probe,.-_armv8_sm4_probe + .globl _armv8_sha512_probe .type _armv8_sha512_probe,%function _armv8_sha512_probe: diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index 
8839b21204..0f6f7ca24c 100644 --- a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -81,5 +81,6 @@ extern unsigned int OPENSSL_armcap_P; # define ARMV8_PMULL (1<<5) # define ARMV8_SHA512 (1<<6) # define ARMV8_SM3 (1<<9) +# define ARMV8_SM4 (1<<10) #endif diff --git a/crypto/armcap.c b/crypto/armcap.c index 8b2f4a5c3a..73bcad1a57 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -48,6 +48,7 @@ void _armv8_sha256_probe(void); void _armv8_pmull_probe(void); # ifdef __aarch64__ void _armv8_sm3_probe(void); +void _armv8_sm4_probe(void); void _armv8_sha512_probe(void); # endif uint32_t _armv7_tick(void); @@ -132,6 +133,7 @@ static unsigned long getauxval(unsigned long key) # define HWCAP_CE_SHA1 (1 << 5) # define HWCAP_CE_SHA256 (1 << 6) # define HWCAP_CE_SM3 (1 << 18) +# define HWCAP_CE_SM4 (1 << 19) # define HWCAP_CE_SHA512 (1 << 21) # endif @@ -190,6 +192,9 @@ void OPENSSL_cpuid_setup(void) OPENSSL_armcap_P |= ARMV8_SHA256; # ifdef __aarch64__ + if (hwcap & HWCAP_CE_SM4) + OPENSSL_armcap_P |= ARMV8_SM4; + if (hwcap & HWCAP_CE_SHA512) OPENSSL_armcap_P |= ARMV8_SHA512; @@ -234,6 +239,11 @@ void OPENSSL_cpuid_setup(void) OPENSSL_armcap_P |= ARMV8_SHA256; } # if defined(__aarch64__) && !defined(__APPLE__) + if (sigsetjmp(ill_jmp, 1) == 0) { + _armv8_sm4_probe(); + OPENSSL_armcap_P |= ARMV8_SM4; + } + if (sigsetjmp(ill_jmp, 1) == 0) { _armv8_sha512_probe(); OPENSSL_armcap_P |= ARMV8_SHA512; diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c index 169d6c71d1..eaa5ba0f9a 100644 --- a/crypto/evp/e_sm4.c +++ b/crypto/evp/e_sm4.c @@ -15,17 +15,11 @@ # include # include "crypto/sm4.h" # include "crypto/evp.h" +# include "crypto/sm4_platform.h" # include "evp_local.h" # include "modes_local.h" -#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__)) -# include "arm_arch.h" -# if __ARM_MAX_ARCH__>=7 -# if defined(VPSM4_EX_ASM) -# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) -# endif -# endif -#endif + typedef struct { union { @@ -35,28 +29,11 @@ typedef struct { block128_f block; union { ecb128_f ecb; + cbc128_f cbc; + ctr128_f ctr; } stream; } EVP_SM4_KEY; -#ifdef VPSM4_EX_CAPABLE -void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); -void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); -#define vpsm4_ex_encrypt SM4_encrypt -#define vpsm4_ex_decrypt SM4_encrypt -void vpsm4_ex_ecb_encrypt( - const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key, const int enc); -/* xts mode in GB/T 17964-2021 */ -void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, - const SM4_KEY *key2, const uint8_t iv[16]); -void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, - const SM4_KEY *key2, const uint8_t iv[16]); -/* xts mode in IEEE Std 1619-2007 */ -void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, - const SM4_KEY *key2, const uint8_t iv[16]); -void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, - const SM4_KEY *key2, const uint8_t iv[16]); -#endif - # define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \ static const EVP_CIPHER sm4_##mode = { \ nid##_##nmode,blocksize,128/8,ivlen, \ @@ -84,6 +61,21 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, mode = EVP_CIPHER_CTX_mode(ctx); if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) 
&& !enc) { +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + HWSM4_set_decrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) HWSM4_decrypt; + dat->stream.cbc = NULL; +# ifdef HWSM4_cbc_encrypt + if (mode == EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; +# endif +# ifdef HWSM4_ecb_encrypt + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; +# endif + } else +#endif #ifdef VPSM4_EX_CAPABLE if (VPSM4_EX_CAPABLE) { vpsm4_ex_set_decrypt_key(key, &dat->ks.ks); @@ -97,6 +89,29 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); } } else { +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + HWSM4_set_encrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) HWSM4_encrypt; + dat->stream.cbc = NULL; +# ifdef HWSM4_cbc_encrypt + if (mode == EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; + else +# endif +# ifdef HWSM4_ecb_encrypt + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; + else +# endif +# ifdef HWSM4_ctr32_encrypt_blocks + if (mode == EVP_CIPH_CTR_MODE) + dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks; + else +# endif + (void)0; /* terminate potentially open 'else' */ + } else +#endif #ifdef VPSM4_EX_CAPABLE if (VPSM4_EX_CAPABLE) { vpsm4_ex_set_encrypt_key(key, &dat->ks.ks); @@ -118,7 +133,10 @@ static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, { EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); - if (EVP_CIPHER_CTX_encrypting(ctx)) + if (dat->stream.cbc) + (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv, + EVP_CIPHER_CTX_encrypting(ctx)); + else if (EVP_CIPHER_CTX_encrypting(ctx)) CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); else @@ -183,10 +201,16 @@ static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, return 0; num = (unsigned int)n; - CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, - ctx->iv, - EVP_CIPHER_CTX_buf_noconst(ctx), &num, - dat->block); + if (dat->stream.ctr) + CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks, + ctx->iv, + EVP_CIPHER_CTX_buf_noconst(ctx), + &num, dat->stream.ctr); + else + CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, + EVP_CIPHER_CTX_buf_noconst(ctx), &num, + dat->block); EVP_CIPHER_CTX_set_num(ctx, num); return 1; } diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl new file mode 100644 index 0000000000..dbacad2f4f --- /dev/null +++ b/crypto/sm4/asm/sm4-armv8.pl @@ -0,0 +1,629 @@ +#! /usr/bin/env perl +# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# This module implements support for SM4 hw support on aarch64 +# Oct 2021 +# + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$prefix="sm4_v8"; +my @rks=map("v$_",(0..7)); + +sub rev32() { +my $dst = shift; +my $src = shift; +$code.=<<___; +#ifndef __ARMEB__ + rev32 $dst.16b,$src.16b +#endif +___ +} + +sub enc_blk () { +my $data = shift; +$code.=<<___; + sm4e $data.4s,@rks[0].4s + sm4e $data.4s,@rks[1].4s + sm4e $data.4s,@rks[2].4s + sm4e $data.4s,@rks[3].4s + sm4e $data.4s,@rks[4].4s + sm4e $data.4s,@rks[5].4s + sm4e $data.4s,@rks[6].4s + sm4e $data.4s,@rks[7].4s + rev64 $data.4S,$data.4S + ext $data.16b,$data.16b,$data.16b,#8 +___ +} + +sub enc_4blks () { +my $data0 = shift; +my $data1 = shift; +my $data2 = shift; +my $data3 = shift; +$code.=<<___; + sm4e $data0.4s,@rks[0].4s + sm4e $data1.4s,@rks[0].4s + sm4e $data2.4s,@rks[0].4s + sm4e $data3.4s,@rks[0].4s + + sm4e $data0.4s,@rks[1].4s + sm4e $data1.4s,@rks[1].4s + sm4e $data2.4s,@rks[1].4s + sm4e $data3.4s,@rks[1].4s + + sm4e $data0.4s,@rks[2].4s + sm4e $data1.4s,@rks[2].4s + sm4e $data2.4s,@rks[2].4s + sm4e $data3.4s,@rks[2].4s + + sm4e $data0.4s,@rks[3].4s + sm4e $data1.4s,@rks[3].4s + sm4e $data2.4s,@rks[3].4s + sm4e $data3.4s,@rks[3].4s + + sm4e $data0.4s,@rks[4].4s + sm4e $data1.4s,@rks[4].4s + sm4e $data2.4s,@rks[4].4s + sm4e $data3.4s,@rks[4].4s + + sm4e $data0.4s,@rks[5].4s + sm4e $data1.4s,@rks[5].4s + sm4e $data2.4s,@rks[5].4s + sm4e $data3.4s,@rks[5].4s + + sm4e $data0.4s,@rks[6].4s + sm4e $data1.4s,@rks[6].4s + sm4e $data2.4s,@rks[6].4s + sm4e $data3.4s,@rks[6].4s + + sm4e $data0.4s,@rks[7].4s + rev64 $data0.4S,$data0.4S + sm4e $data1.4s,@rks[7].4s + ext $data0.16b,$data0.16b,$data0.16b,#8 + rev64 $data1.4S,$data1.4S + sm4e $data2.4s,@rks[7].4s + ext $data1.16b,$data1.16b,$data1.16b,#8 + rev64 $data2.4S,$data2.4S + sm4e $data3.4s,@rks[7].4s + ext $data2.16b,$data2.16b,$data2.16b,#8 + rev64 $data3.4S,$data3.4S + ext $data3.16b,$data3.16b,$data3.16b,#8 +___ +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8-a+crypto +.text +___ + +{{{ +$code.=<<___; +.align 6 +.Lck: + .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 + .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 + .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 + .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 + .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 + .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 + .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: + .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +my ($tmp)=("x2"); +my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7)); +my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); +my ($fkconst) = ("v24"); +$code.=<<___; +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: + ld1 {$key0.4s},[$key] + adr $tmp,.Lfk + ld1 {$fkconst.4s},[$tmp] + adr $tmp,.Lck + ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 +___ + &rev32($key0, $key0); +$code.=<<___; + ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] + eor $key0.16b,$key0.16b,$fkconst.16b; + sm4ekey $key0.4S,$key0.4S,$const0.4S + sm4ekey $key1.4S,$key0.4S,$const1.4S + sm4ekey $key2.4S,$key1.4S,$const2.4S + 
sm4ekey $key3.4S,$key2.4S,$const3.4S + sm4ekey $key4.4S,$key3.4S,$const4.4S + st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64 + sm4ekey $key5.4S,$key4.4S,$const5.4S + sm4ekey $key6.4S,$key5.4S,$const6.4S + sm4ekey $key7.4S,$key6.4S,$const7.4S + st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys] + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +my ($tmp)=("x2"); +my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7)); +my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); +my ($fkconst) = ("v24"); +$code.=<<___; +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: + ld1 {$key0.4s},[$key] + adr $tmp,.Lfk + ld1 {$fkconst.4s},[$tmp] + adr $tmp, .Lck + ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 +___ + &rev32($key0, $key0); +$code.=<<___; + ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] + eor $key0.16b, $key0.16b,$fkconst.16b; + sm4ekey $key0.4S,$key0.4S,$const0.4S + sm4ekey $key1.4S,$key0.4S,$const1.4S + sm4ekey $key2.4S,$key1.4S,$const2.4S + rev64 $key0.4s,$key0.4s + rev64 $key1.4s,$key1.4s + ext $key0.16b,$key0.16b,$key0.16b,#8 + ext $key1.16b,$key1.16b,$key1.16b,#8 + sm4ekey $key3.4S,$key2.4S,$const3.4S + sm4ekey $key4.4S,$key3.4S,$const4.4S + rev64 $key2.4s,$key2.4s + rev64 $key3.4s,$key3.4s + ext $key2.16b,$key2.16b,$key2.16b,#8 + ext $key3.16b,$key3.16b,$key3.16b,#8 + sm4ekey $key5.4S,$key4.4S,$const5.4S + sm4ekey $key6.4S,$key5.4S,$const6.4S + rev64 $key4.4s,$key4.4s + rev64 $key5.4s,$key5.4s + ext $key4.16b,$key4.16b,$key4.16b,#8 + ext $key5.16b,$key5.16b,$key5.16b,#8 + sm4ekey $key7.4S,$key6.4S,$const7.4S + rev64 $key6.4s, $key6.4s + rev64 $key7.4s, $key7.4s + ext $key6.16b,$key6.16b,$key6.16b,#8 + ext $key7.16b,$key7.16b,$key7.16b,#8 + st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64 + st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys] + ret +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} + +{{{ +sub gen_block () { +my $dir = shift; +my ($inp,$out,$rk)=map("x$_",(0..2)); +my ($data)=("v16"); +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + ld1 {$data.4s},[$inp] + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +___ + &rev32($data,$data); + &enc_blk($data); + &rev32($data,$data); +$code.=<<___; + st1 {$data.4s},[$out] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} + +&gen_block("en"); +&gen_block("de"); +}}} + +{{{ +my ($inp,$out,$len,$rk)=map("x$_",(0..3)); +my ($enc) = ("w4"); +my @dat=map("v$_",(16..23)); +$code.=<<___; +.globl ${prefix}_ecb_encrypt +.type ${prefix}_ecb_encrypt,%function +.align 5 +${prefix}_ecb_encrypt: + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +1: + cmp $len,#64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 + cmp $len,#128 + b.lt 2f + ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 + // 8 blocks +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + 
&rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); +$code.=<<___; + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 +___ + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,#128 + b.gt 1b + ret + // 4 blocks +2: +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.gt 1b +1: + subs $len,$len,#16 + b.lt 1f + ld1 {@dat[0].4s},[$inp],#16 +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + st1 {@dat[0].4s},[$out],#16 + b.ne 1b +1: + ret +.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt +___ +}}} + +{{{ +my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); +my ($enc) = ("w5"); +my @dat=map("v$_",(16..23)); +my @in=map("v$_",(24..31)); +my ($ivec) = ("v8"); +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: + stp d8,d9,[sp, #-16]! + + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] + ld1 {$ivec.4s},[$ivp] + cmp $enc,#0 + b.eq .Ldec +1: + cmp $len, #64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 + eor @dat[0].16b,@dat[0].16b,$ivec.16b +___ + &rev32(@dat[1],@dat[1]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &enc_blk(@dat[0]); +$code.=<<___; + eor @dat[1].16b,@dat[1].16b,@dat[0].16b +___ + &enc_blk(@dat[1]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[2].16b,@dat[2].16b,@dat[1].16b +___ + &enc_blk(@dat[2]); + &rev32(@dat[1],@dat[1]); +$code.=<<___; + eor @dat[3].16b,@dat[3].16b,@dat[2].16b +___ + &enc_blk(@dat[3]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + mov $ivec.16b,@dat[3].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.ne 1b +1: + subs $len,$len,#16 + b.lt 3f + ld1 {@dat[0].4s},[$inp],#16 + eor $ivec.16b,$ivec.16b,@dat[0].16b +___ + &rev32($ivec,$ivec); + &enc_blk($ivec); + &rev32($ivec,$ivec); +$code.=<<___; + st1 {$ivec.16b},[$out],#16 + b.ne 1b + b 3f +.Ldec: +1: + cmp $len, #64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp] + ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 + cmp $len,#128 + b.lt 2f + // 8 blocks mode + ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp] + ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],$dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],$dat[7]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + eor @dat[1].16b,@dat[1].16b,@in[0].16b + eor @dat[2].16b,@dat[2].16b,@in[1].16b + mov $ivec.16b,@in[7].16b + eor @dat[3].16b,$dat[3].16b,@in[2].16b + 
eor @dat[4].16b,$dat[4].16b,@in[3].16b + eor @dat[5].16b,$dat[5].16b,@in[4].16b + eor @dat[6].16b,$dat[6].16b,@in[5].16b + eor @dat[7].16b,$dat[7].16b,@in[6].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,128 + b.gt 1b + b 3f + // 4 blocks mode +2: +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],$dat[3]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + eor @dat[1].16b,@dat[1].16b,@in[0].16b + mov $ivec.16b,@in[3].16b + eor @dat[2].16b,@dat[2].16b,@in[1].16b + eor @dat[3].16b,$dat[3].16b,@in[2].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.gt 1b +1: + subs $len,$len,#16 + b.lt 3f + ld1 {@dat[0].4s},[$inp],#16 + mov @in[0].16b,@dat[0].16b +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + mov $ivec.16b,@in[0].16b + st1 {@dat[0].16b},[$out],#16 + b.ne 1b +3: + // save back IV + st1 {$ivec.16b},[$ivp] + ldp d8,d9,[sp],#16 + ret +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} + +{{{ +my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); +my ($ctr)=("w5"); +my @dat=map("v$_",(16..23)); +my @in=map("v$_",(24..31)); +my ($ivec)=("v8"); +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: + stp d8,d9,[sp, #-16]! + + ld1 {$ivec.4s},[$ivp] + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +___ + &rev32($ivec,$ivec); +$code.=<<___; + mov $ctr,$ivec.s[3] +1: + cmp $len,#4 + b.lt 1f + ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 + mov @dat[0].16b,$ivec.16b + mov @dat[1].16b,$ivec.16b + mov @dat[2].16b,$ivec.16b + mov @dat[3].16b,$ivec.16b + add $ctr,$ctr,#1 + mov $dat[1].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[2].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[3].s[3],$ctr + cmp $len,#8 + b.lt 2f + ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 + mov @dat[4].16b,$ivec.16b + mov @dat[5].16b,$ivec.16b + mov @dat[6].16b,$ivec.16b + mov @dat[7].16b,$ivec.16b + add $ctr,$ctr,#1 + mov $dat[4].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[5].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[6].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[7].s[3],$ctr +___ + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,@in[0].16b + eor @dat[1].16b,@dat[1].16b,@in[1].16b + eor @dat[2].16b,@dat[2].16b,@in[2].16b + eor @dat[3].16b,@dat[3].16b,@in[3].16b + eor @dat[4].16b,@dat[4].16b,@in[4].16b + eor @dat[5].16b,@dat[5].16b,@in[5].16b + eor @dat[6].16b,@dat[6].16b,@in[6].16b + eor @dat[7].16b,@dat[7].16b,@in[7].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,#8 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +2: +___ + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + 
&rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,@in[0].16b + eor @dat[1].16b,@dat[1].16b,@in[1].16b + eor @dat[2].16b,@dat[2].16b,@in[2].16b + eor @dat[3].16b,@dat[3].16b,@in[3].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#4 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +1: + subs $len,$len,#1 + b.lt 3f + mov $dat[0].16b,$ivec.16b + ld1 {@in[0].4s},[$inp],#16 +___ + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor $dat[0].16b,$dat[0].16b,@in[0].16b + st1 {$dat[0].4s},[$out],#16 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +3: + ldp d8,d9,[sp],#16 + ret +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} +######################################## +{ my %opcode = ( + "sm4e" => 0xcec08400, + "sm4ekey" => 0xce60c800); + + sub unsm4 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge; + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info index bb042c5792..4d26ede7da 100644 --- a/crypto/sm4/build.info +++ b/crypto/sm4/build.info @@ -2,6 +2,17 @@ LIBS=../../libcrypto SOURCE[../../libcrypto]=\ sm4.c {- $target{sm4_asm_src} -} +GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl $(PERLASM_SCHEME) +INCLUDE[sm4-armv8.o]=.. GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl $(PERLASM_SCHEME) -INCLUDE[vpsm4_ex-armv8.o]=.. \ No newline at end of file +INCLUDE[vpsm4_ex-armv8.o]=.. + +BEGINRAW[Makefile] +##### SM4 assembler implementations + +# GNU make "catch all" +{- $builddir -}/sm4-%.S: {- $sourcedir -}/asm/sm4-%.pl + CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ + +ENDRAW[Makefile] diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h new file mode 100644 index 0000000000..838cb4e72c --- /dev/null +++ b/include/crypto/sm4_platform.h @@ -0,0 +1,70 @@ +/* + * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. 
You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef OSSL_SM4_PLATFORM_H +# define OSSL_SM4_PLATFORM_H +# pragma once + +# if defined(OPENSSL_CPUID_OBJ) +# if (defined(__arm__) || defined(__arm) || defined(__aarch64__)) +# include "arm_arch.h" +# if __ARM_MAX_ARCH__>=7 +# if defined(VPSM4_EX_ASM) +# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) +# endif +# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4) +# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key +# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key +# define HWSM4_encrypt sm4_v8_encrypt +# define HWSM4_decrypt sm4_v8_decrypt +# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt +# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt +# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks +# endif +# endif +# endif /* OPENSSL_CPUID_OBJ */ + +# if defined(HWSM4_CAPABLE) +int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +void HWSM4_encrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void HWSM4_decrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + unsigned char *ivec, const int enc); +void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + const int enc); +void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + const unsigned char ivec[16]); +# endif /* HWSM4_CAPABLE */ + +#ifdef VPSM4_EX_CAPABLE +void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +#define vpsm4_ex_encrypt SM4_encrypt +#define vpsm4_ex_decrypt SM4_encrypt +void vpsm4_ex_ecb_encrypt( + const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key, const int enc); +/* xts mode in GB/T 17964-2021 */ +void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +/* xts mode in IEEE Std 1619-2007 */ +void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +#endif /* VPSM4_EX_CAPABLE */ + +#endif /* OSSL_SM4_PLATFORM_H */ -- Gitee From 8038358c7e84e070469d8c54176e605fc0b1455d Mon Sep 17 00:00:00 2001 From: dumbdog Date: Thu, 14 Sep 2023 11:40:07 +0800 Subject: [PATCH 4/5] SM4 XTS optimization for ARM by HW instruction This patch implements the SM4 XTS optimization for ARM processor, using SM4 HW instruction, which is an optional feature of crypto extension for aarch64 V8. 
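The two XTS flavours wired up by this series differ only in how the sequence of
per-block tweaks is derived from the encrypted IV. As a rough illustration (this
is not code from the patch, and the helper name xts_ieee_mult_x is chosen here
only for the sketch), the IEEE Std 1619-2007 update multiplies the 128-bit tweak
by x (alpha) in GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1,
treating byte 0 as the least significant byte:

    #include <stddef.h>
    #include <stdint.h>

    /* Multiply the 128-bit XTS tweak by x in GF(2^128) as in IEEE Std
     * 1619-2007: shift the 16 bytes left by one bit (byte 0 is least
     * significant) and, if a bit falls off the top, reduce by XORing
     * 0x87 into the low byte. */
    static void xts_ieee_mult_x(uint8_t tweak[16])
    {
        uint8_t carry = 0;
        size_t i;

        for (i = 0; i < 16; i++) {
            uint8_t next = tweak[i] >> 7;          /* bit shifted out of this byte */

            tweak[i] = (uint8_t)((tweak[i] << 1) | carry);
            carry = next;
        }
        if (carry)
            tweak[0] ^= 0x87;                      /* reduction polynomial term */
    }

The GB/T 17964-2021 variant performs the corresponding multiplication with the
opposite bit ordering, which appears to be why the _gb code paths above bracket
the tweak doubling with rbit. The first ciphertext block, computed from the
unmodified initial tweak, therefore matches between the two standards while all
later blocks differ, as the SM4-XTS vectors added to evpciph.txt show.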
reference: https://gitee.com/src-openeuler/openssl/blob/openEuler-22.03-LTS-Next/Feature-SM4-XTS-optimization-for-ARM-by-HW-instruction.patch Signed-off-by: dumbdog --- crypto/evp/e_sm4.c | 28 ++ crypto/sm4/asm/sm4-armv8.pl | 498 +++++++++++++++++++++++++++++++++- include/crypto/sm4_platform.h | 14 + 3 files changed, 537 insertions(+), 3 deletions(-) diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c index eaa5ba0f9a..da4dbd34a6 100644 --- a/crypto/evp/e_sm4.c +++ b/crypto/evp/e_sm4.c @@ -281,6 +281,34 @@ static int sm4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2; xctx->stream_gb = NULL; xctx->stream = NULL; +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + if (enc) { + HWSM4_set_encrypt_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) HWSM4_encrypt; +# ifdef HWSM4_xts_encrypt_gb + xctx->stream_gb = HWSM4_xts_encrypt_gb; +# endif +# ifdef HWSM4_xts_encrypt + xctx->stream = HWSM4_xts_encrypt; +# endif + } else { + HWSM4_set_decrypt_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) HWSM4_decrypt; +# ifdef HWSM4_xts_decrypt_gb + xctx->stream_gb = HWSM4_xts_decrypt_gb; +# endif +# ifdef HWSM4_xts_decrypt + xctx->stream = HWSM4_xts_decrypt; +# endif + } + HWSM4_set_encrypt_key(key + bytes, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f) HWSM4_encrypt; + + xctx->xts.key1 = &xctx->ks1; + break; + } else +#endif #ifdef VPSM4_EX_CAPABLE if (VPSM4_EX_CAPABLE) { if (enc) { diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl index dbacad2f4f..923c1c0197 100644 --- a/crypto/sm4/asm/sm4-armv8.pl +++ b/crypto/sm4/asm/sm4-armv8.pl @@ -11,9 +11,9 @@ # Oct 2021 # -# $output is the last argument if it looks like a file (it has an extension) +# $outut is the last argument if it looks like a file (it has an extension) # $flavour is the first argument if it doesn't look like a file -$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$outut = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; @@ -21,7 +21,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; -open OUT,"| \"$^X\" $xlate $flavour \"$output\"" +open OUT,"| \"$^X\" $xlate $flavour \"$outut\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; @@ -110,6 +110,120 @@ $code.=<<___; ___ } +sub mov_reg_to_vec() { + my $src0 = shift; + my $src1 = shift; + my $desv = shift; +$code.=<<___; + mov $desv.d[0],$src0 + mov $desv.d[1],$src1 +#ifdef __ARMEB__ + rev32 $desv.16b,$desv.16b +#endif +___ +} + +sub mov_vec_to_reg() { + my $srcv = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $des0,$srcv.d[0] + mov $des1,$srcv.d[1] +___ +} + +sub compute_tweak() { + my $src0 = shift; + my $src1 = shift; + my $des0 = shift; + my $des1 = shift; + my $tmp0 = shift; + my $tmp1 = shift; + my $magic = shift; +$code.=<<___; + extr x$tmp1,$src1,$src1,#32 + extr $des1,$src1,$src0,#63 + and w$tmp0,w$magic,w$tmp1,asr#31 + eor $des0,x$tmp0,$src0,lsl#1 +___ +} + +sub compute_tweak_vec() { + my $src = shift; + my $des = shift; + my $tmp0 = shift; + my $tmp1 = shift; + my $magic = shift; + &rbit($tmp1,$src); +$code.=<<___; + shl $des.16b, $tmp1.16b, #1 + ext $tmp0.16b, $tmp1.16b, $tmp1.16b,#15 + ushr $tmp0.16b, $tmp0.16b, #7 + mul $tmp0.16b, $tmp0.16b, $magic.16b + eor $des.16b, $des.16b, $tmp0.16b +___ + &rbit($des,$des); +} + +sub mov_en_to_enc(){ + my $en = shift; + my $enc = shift; + if ($en eq "en") { +$code.=<<___; + mov $enc,1 +___ + } else { +$code.=<<___; + mov $enc,0 +___ + } +} + +sub rbit() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { + if ($standard eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } else { +$code.=<<___; + mov $dst.16b,$src.16b +___ + } + } else { + if ($standard eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } + } +} + +sub rev32_armeb() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifdef __ARMEB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifdef __ARMEB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + $code=<<___; #include "arm_arch.h" .arch armv8-a+crypto @@ -595,6 +709,384 @@ $code.=<<___; .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ___ }}} + + +{{{ +my ($inp,$out,$len,$rk1,$rk2,$ivp)=map("x$_",(0..5)); +my ($blocks)=("x2"); +my ($enc)=("x6"); +my ($remain)=("x7"); +my @twx=map("x$_",(9..24)); +my $lastBlk=("x25"); + +my @tweak=map("v$_",(8..15)); +my @dat=map("v$_",(16..23)); +my $lastTweak=("v24"); + +# x/w/v/q registers for compute tweak +my ($magic)=("8"); +my ($tmp0,$tmp1)=("26","27"); +my ($qMagic,$vMagic)=("q25","v25"); +my ($vTmp0,$vTmp1)=("v26","v27"); + +sub gen_xts_do_cipher() { +$code.=<<___; +.globl ${prefix}_xts_do_cipher${standard} +.type ${prefix}_xts_do_cipher${standard},%function +.align 5 +${prefix}_xts_do_cipher${standard}: + mov w$magic,0x87 + ldr $qMagic, =0x01010101010101010101010101010187 + // used to encrypt the XORed plaintext blocks + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk2],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk2] + ld1 {@tweak[0].4s}, [$ivp] +___ + &rev32(@tweak[0],@tweak[0]); + &enc_blk(@tweak[0]); + &rev32(@tweak[0],@tweak[0]); +$code.=<<___; + // used to encrypt the initial vector to yield the initial tweak + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk1],#64 + ld1 
{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk1] + + and $remain,$len,#0x0F + // convert length into blocks + lsr $blocks,$len,4 + cmp $blocks,#1 // $len must be at least 16 + b.lt 99f + + cmp $remain,0 // if $len is a multiple of 16 + b.eq .xts_encrypt_blocks${standard} + // if $len is not a multiple of 16 + subs $blocks,$blocks,#1 + b.eq .only_2blks_tweak${standard} // if $len is less than 32 + +.xts_encrypt_blocks${standard}: +___ + &rbit(@tweak[0],@tweak[0]); + &rev32_armeb(@tweak[0],@tweak[0]); + &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic); +$code.=<<___; +1: + cmp $blocks,#8 +___ + &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); + &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic); +$code.=<<___; + b.lt 2f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); + &rbit(@tweak[3],@tweak[3]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + eor @dat[2].16b, @dat[2].16b, @tweak[2].16b + eor @dat[3].16b, @dat[3].16b, @tweak[3].16b + ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 +___ + &rbit(@tweak[4],@tweak[4]); + &rbit(@tweak[5],@tweak[5]); + &rbit(@tweak[6],@tweak[6]); + &rbit(@tweak[7],@tweak[7]); +$code.=<<___; + eor @dat[4].16b, @dat[4].16b, @tweak[4].16b + eor @dat[5].16b, @dat[5].16b, @tweak[5].16b + eor @dat[6].16b, @dat[6].16b, @tweak[6].16b + eor @dat[7].16b, @dat[7].16b, @tweak[7].16b +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + eor @dat[2].16b, @dat[2].16b, @tweak[2].16b + eor @dat[3].16b, 
@dat[3].16b, @tweak[3].16b + eor @dat[4].16b, @dat[4].16b, @tweak[4].16b + eor @dat[5].16b, @dat[5].16b, @tweak[5].16b + eor @dat[6].16b, @dat[6].16b, @tweak[6].16b + eor @dat[7].16b, @dat[7].16b, @tweak[7].16b + + // save the last tweak + mov $lastTweak.16b,@tweak[7].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $blocks,$blocks,#8 + b.eq 100f + b 1b +2: + // process 4 blocks + cmp $blocks,#4 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); + &rbit(@tweak[3],@tweak[3]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + eor @dat[2].16b, @dat[2].16b, @tweak[2].16b + eor @dat[3].16b, @dat[3].16b, @tweak[3].16b +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + eor @dat[2].16b, @dat[2].16b, @tweak[2].16b + eor @dat[3].16b, @dat[3].16b, @tweak[3].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + sub $blocks,$blocks,#4 + mov @tweak[0].16b,@tweak[4].16b + mov @tweak[1].16b,@tweak[5].16b + mov @tweak[2].16b,@tweak[6].16b + // save the last tweak + mov $lastTweak.16b,@tweak[3].16b +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@dat[0].4s},[$inp],#16 +___ + &rbit(@tweak[0],@tweak[0]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + st1 {@dat[0].4s},[$out],#16 + // save the last tweak + mov $lastTweak.16b,@tweak[0].16b + b 100f +1: // process last 2 blocks + cmp $blocks,#2 + b.gt 1f + ld1 {@dat[0].4s,@dat[1].4s},[$inp],#32 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + st1 {@dat[0].4s,@dat[1].4s},[$out],#32 + // save the last tweak + mov $lastTweak.16b,@tweak[1].16b + b 100f +1: // process last 3 blocks + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$inp],#48 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + eor @dat[2].16b, @dat[2].16b, @tweak[2].16b +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + eor @dat[2].16b, @dat[2].16b, @tweak[2].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$out],#48 + // save the last tweak + mov $lastTweak.16b,@tweak[2].16b +100: + cmp $remain,0 + b.eq 99f + +// This brance calculates the 
last two tweaks, +// while the encryption/decryption length is larger than 32 +.last_2blks_tweak${standard}: +___ + &rev32_armeb($lastTweak,$lastTweak); + &compute_tweak_vec($lastTweak,@tweak[1],$vTmp0,$vTmp1,$vMagic); + &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic); +$code.=<<___; + b .check_dec${standard} + + +// This brance calculates the last two tweaks, +// while the encryption/decryption length is less than 32, who only need two tweaks +.only_2blks_tweak${standard}: + mov @tweak[1].16b,@tweak[0].16b +___ + &rev32_armeb(@tweak[1],@tweak[1]); + &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic); +$code.=<<___; + b .check_dec${standard} + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. +.check_dec${standard}: + // encryption:1 decryption:0 + cmp $enc,1 + b.eq .prcess_last_2blks${standard} + mov $vTmp0.16B,@tweak[1].16b + mov @tweak[1].16B,@tweak[2].16b + mov @tweak[2].16B,$vTmp0.16b + +.prcess_last_2blks${standard}: +___ + &rev32_armeb(@tweak[1],@tweak[1]); + &rev32_armeb(@tweak[2],@tweak[2]); +$code.=<<___; + ld1 {@dat[0].4s},[$inp],#16 + eor @dat[0].16b, @dat[0].16b, @tweak[1].16b +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[1].16b + st1 {@dat[0].4s},[$out],#16 + + sub $lastBlk,$out,16 + .loop${standard}: + subs $remain,$remain,1 + ldrb w$tmp0,[$lastBlk,$remain] + ldrb w$tmp1,[$inp,$remain] + strb w$tmp1,[$lastBlk,$remain] + strb w$tmp0,[$out,$remain] + b.gt .loop${standard} + ld1 {@dat[0].4s}, [$lastBlk] + eor @dat[0].16b, @dat[0].16b, @tweak[2].16b +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[2].16b + st1 {@dat[0].4s}, [$lastBlk] +99: + ret +.size ${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard} +___ +} #end of gen_xts_do_cipher + +}}} + +{{{ +my ($enc)=("w6"); + +sub gen_xts_cipher() { + my $en = shift; +$code.=<<___; +.globl ${prefix}_xts_${en}crypt${standard} +.type ${prefix}_xts_${en}crypt${standard},%function +.align 5 +${prefix}_xts_${en}crypt${standard}: + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! 
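	// All of x15-x30 and d8-d15 are spilled above (x19-x28 and d8-d15 are
	// callee-saved under AAPCS64), so the shared do_cipher body below may
	// use them freely; they are restored in reverse order before ret.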
+___
	&mov_en_to_enc($en,$enc);
$code.=<<___;
	bl ${prefix}_xts_do_cipher${standard}
	ldp d14, d15, [sp], #0x10
	ldp d12, d13, [sp], #0x10
	ldp d10, d11, [sp], #0x10
	ldp d8, d9, [sp], #0x10
	ldp x29, x30, [sp], #0x10
	ldp x27, x28, [sp], #0x10
	ldp x25, x26, [sp], #0x10
	ldp x23, x24, [sp], #0x10
	ldp x21, x22, [sp], #0x10
	ldp x19, x20, [sp], #0x10
	ldp x17, x18, [sp], #0x10
	ldp x15, x16, [sp], #0x10
	ret
.size ${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard}
___

} # end of gen_xts_cipher
$standard="_gb";
&gen_xts_do_cipher();
&gen_xts_cipher("en");
&gen_xts_cipher("de");
$standard="";
&gen_xts_do_cipher();
&gen_xts_cipher("en");
&gen_xts_cipher("de");
}}}
########################################
{ my %opcode = (
	"sm4e" => 0xcec08400,
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
index 838cb4e72c..8c45cb7062 100644
--- a/include/crypto/sm4_platform.h
+++ b/include/crypto/sm4_platform.h
@@ -26,6 +26,10 @@
 # define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
 # define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
 # define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks
+# define HWSM4_xts_encrypt_gb sm4_v8_xts_encrypt_gb
+# define HWSM4_xts_decrypt_gb sm4_v8_xts_decrypt_gb
+# define HWSM4_xts_encrypt sm4_v8_xts_encrypt
+# define HWSM4_xts_decrypt sm4_v8_xts_decrypt
 # endif
 # endif
 # endif /* OPENSSL_CPUID_OBJ */
@@ -46,6 +50,16 @@ void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out,
 void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
 size_t len, const void *key,
 const unsigned char ivec[16]);
+/* xts mode in GB/T 17964-2021 */
+void HWSM4_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+void HWSM4_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+/* xts mode in IEEE Std 1619-2007 */
+void HWSM4_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
+void HWSM4_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const SM4_KEY *key2, const uint8_t iv[16]);
 # endif /* HWSM4_CAPABLE */

 #ifdef VPSM4_EX_CAPABLE
-- 
Gitee

From b1e7c37f70e14e13f8393e0b9394a826b6fec700 Mon Sep 17 00:00:00 2001
From: dumbdog
Date: Thu, 14 Sep 2023 11:43:40 +0800
Subject: [PATCH 5/5] Fix SM4-XTS build failure using clang

The OpenSSL community has hit the same issue; the corresponding fix can be
found in PR https://github.com/openssl/openssl/pull/20202. The community has
also added a check to arm-xlate.pl that recognizes the 'LDR REG, =VALUE'
pseudo-instruction used with Neon registers, as shown in PR
https://github.com/openssl/openssl/pull/20222.
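
As a minimal sketch of the rewrite pattern applied in the hunks below (shown
here with the $qMagic register variable from the sm4-armv8.pl hunk;
vpsm4_ex-armv8.pl gets the same treatment for its S-box constants), the
128-bit '=VALUE' pseudo load is replaced by a load from a named literal pool:

    // rejected by clang's integrated assembler: the literal is wider than 32 bits
    ldr $qMagic, =0x01010101010101010101010101010187

    // accepted: keep the constant in an explicit literal pool and load it by label
.Lxts_magic:
    .dword 0x0101010101010187,0x0101010101010101
    ...
    ldr $qMagic, .Lxts_magic

The new arm-xlate.pl check rejects any remaining '=VALUE' load into a q/d
register whose literal is too long to fit in 32 bits, so such cases now fail
when the perlasm source is translated rather than when clang assembles it.
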
reference: https://gitee.com/src-openeuler/openssl/blob/openEuler-22.03-LTS-Next/Fix-SM4-XTS-build-failure-using-clang.patch Signed-off-by: dumbdog --- crypto/perlasm/arm-xlate.pl | 10 ++++++++++ crypto/sm4/asm/sm4-armv8.pl | 12 ++++++----- crypto/sm4/asm/vpsm4_ex-armv8.pl | 34 ++++++++++++++++++++------------ 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl index 48819be540..a2f3838346 100755 --- a/crypto/perlasm/arm-xlate.pl +++ b/crypto/perlasm/arm-xlate.pl @@ -170,6 +170,16 @@ while(my $line=<>) { } } + # ldr REG, #VALUE psuedo-instruction - avoid clang issue with Neon registers + # + if ($line =~ /^\s*ldr\s+([qd]\d\d?)\s*,\s*=(\w+)/i) { + # Immediate load via literal pool into qN or DN - clang max is 2^32-1 + my ($reg, $value) = ($1, $2); + # If $value is hex, 0x + 8 hex chars = 10 chars total will be okay + # If $value is decimal, 2^32 - 1 = 4294967295 will be okay (also 10 chars) + die("$line: immediate load via literal pool into $reg: value too large for clang - redo manually") if length($value) > 10; + } + print $line if ($line); print "\n"; } diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl index 923c1c0197..07ba53afdf 100644 --- a/crypto/sm4/asm/sm4-armv8.pl +++ b/crypto/sm4/asm/sm4-armv8.pl @@ -244,6 +244,8 @@ $code.=<<___; .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 .Lfk: .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +.Lxts_magic: + .dword 0x0101010101010187,0x0101010101010101 ___ }}} @@ -604,7 +606,7 @@ $code.=<<___; .globl ${prefix}_ctr32_encrypt_blocks .type ${prefix}_ctr32_encrypt_blocks,%function .align 5 -${prefix}_ctr32_encrypt_blocks: +${prefix}_ctr32_encrypt_blocks: stp d8,d9,[sp, #-16]! ld1 {$ivec.4s},[$ivp] @@ -736,7 +738,7 @@ $code.=<<___; .align 5 ${prefix}_xts_do_cipher${standard}: mov w$magic,0x87 - ldr $qMagic, =0x01010101010101010101010101010187 + ldr $qMagic, .Lxts_magic // used to encrypt the XORed plaintext blocks ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk2],#64 ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk2] @@ -963,7 +965,7 @@ $code.=<<___; cmp $remain,0 b.eq 99f -// This brance calculates the last two tweaks, +// This brance calculates the last two tweaks, // while the encryption/decryption length is larger than 32 .last_2blks_tweak${standard}: ___ @@ -974,7 +976,7 @@ $code.=<<___; b .check_dec${standard} -// This brance calculates the last two tweaks, +// This brance calculates the last two tweaks, // while the encryption/decryption length is less than 32, who only need two tweaks .only_2blks_tweak${standard}: mov @tweak[1].16b,@tweak[0].16b @@ -1018,7 +1020,7 @@ $code.=<<___; strb w$tmp1,[$lastBlk,$remain] strb w$tmp0,[$out,$remain] b.gt .loop${standard} - ld1 {@dat[0].4s}, [$lastBlk] + ld1 {@dat[0].4s}, [$lastBlk] eor @dat[0].16b, @dat[0].16b, @tweak[2].16b ___ &rev32(@dat[0],@dat[0]); diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl index 86a6f89f52..4fd2975b52 100644 --- a/crypto/sm4/asm/vpsm4_ex-armv8.pl +++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl @@ -108,12 +108,12 @@ ___ sub load_sbox_matrix () { $code.=<<___; - ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00 - ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00 - ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923 - ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300 - ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b - ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f + ldr $MaskQ, .Lsbox_magic + ldr $TAHMatQ, .Lsbox_magic+16 + ldr $TALMatQ, 
.Lsbox_magic+32 + ldr $ATAHMatQ, .Lsbox_magic+48 + ldr $ATALMatQ, .Lsbox_magic+64 + ldr $ANDMaskQ, .Lsbox_magic+80 ___ } # matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) @@ -505,7 +505,7 @@ sub compute_tweak_vec() { my $des = shift; &rbit(@vtmp[2],$src); $code.=<<___; - ldr @qtmp[0], =0x01010101010101010101010101010187 + ldr @qtmp[0], .Lxts_magic shl $des.16b, @vtmp[2].16b, #1 ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 ushr @vtmp[1].16b, @vtmp[1].16b, #7 @@ -569,10 +569,18 @@ ${prefix}_consts: .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 .Lfk: - .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc + .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc .Lshuffles: - .long 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x03020100 - + .long 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x03020100 +.Lxts_magic: + .dword 0x0101010101010187,0x0101010101010101 +.Lsbox_magic: + .dword 0x0b0e0104070a0d00,0x0306090c0f020508 + .dword 0x62185a2042387a00,0x22581a6002783a40 + .dword 0x15df62a89e54e923,0xc10bb67c4a803df7 + .dword 0xb9aa6b78c1d21300,0x1407c6d56c7fbead + .dword 0x6404462679195b3b,0xe383c1a1fe9edcbc + .dword 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f .size ${prefix}_consts,.-${prefix}_consts ___ @@ -1033,7 +1041,7 @@ $code.=<<___; cmp $remain,0 b.eq .return${standard} -// This brance calculates the last two tweaks, +// This brance calculates the last two tweaks, // while the encryption/decryption length is larger than 32 .last_2blks_tweak${standard}: ___ @@ -1044,7 +1052,7 @@ $code.=<<___; b .check_dec${standard} -// This brance calculates the last two tweaks, +// This brance calculates the last two tweaks, // while the encryption/decryption length is equal to 32, who only need two tweaks .only_2blks_tweak${standard}: mov @tweak[1].16b,@tweak[0].16b @@ -1087,7 +1095,7 @@ $code.=<<___; strb $wtmp1,[$lastBlk,$remain] strb $wtmp0,[$outp,$remain] b.gt .loop${standard} - ld1 {@data[0].4s}, [$lastBlk] + ld1 {@data[0].4s}, [$lastBlk] eor @data[0].16b, @data[0].16b, @tweak[2].16b ___ &rev32(@data[0],@data[0]); -- Gitee