diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf index e01dc63a8bf40855ea9137e13c8f835be1dc9547..a26d0810c91db8b6beff056c03033a342ce812cd 100644 --- a/Configurations/00-base-templates.conf +++ b/Configurations/00-base-templates.conf @@ -321,6 +321,8 @@ my %targets=( chacha_asm_src => "chacha-armv8.S", poly1305_asm_src=> "poly1305-armv8.S", keccak1600_asm_src => "keccak1600-armv8.S", + sm4_asm_src => "sm4-armv8.S vpsm4_ex-armv8.S", + sm3_asm_src => "sm3-armv8.S", }, parisc11_asm => { template => 1, diff --git a/Configure b/Configure index 78cc15d184748322e0a97954ece8de4a69655452..f7f1919b22426d0976fc3f61218aa94b7256a6b8 100755 --- a/Configure +++ b/Configure @@ -1417,6 +1417,13 @@ unless ($disabled{asm}) { if ($target{poly1305_asm_src} ne "") { push @{$config{lib_defines}}, "POLY1305_ASM"; } + if ($target{sm4_asm_src} ne "") { + push @{$config{lib_defines}}, "SM4_ASM" if ($target{sm4_asm_src} =~ m/sm4/); + push @{$config{lib_defines}}, "VPSM4_EX_ASM" if ($target{sm4_asm_src} =~ m/vpsm4_ex/); + } + if ($target{sm3_asm_src} ne "") { + push @{$config{lib_defines}}, "SM3_ASM"; + } } my %predefined_C = compiler_predefined($config{CROSS_COMPILE}.$config{CC}); @@ -3372,6 +3379,8 @@ sub print_table_entry "mtoutflag", "multilib", "build_scheme", + "sm4_asm_src", + "sm3_asm_src", ); if ($type eq "TABLE") { diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl index 319927e6c729e7ff4d8fe2b2d92e22a48cda6997..341167b57056c61df83ee11dd4bf565f0b2080fb 100755 --- a/crypto/arm64cpuid.pl +++ b/crypto/arm64cpuid.pl @@ -71,6 +71,13 @@ _armv8_pmull_probe: ret .size _armv8_pmull_probe,.-_armv8_pmull_probe +.globl _armv8_sm4_probe +.type _armv8_sm4_probe,%function +_armv8_sm4_probe: + .long 0xcec08400 // sm4e v0.4s, v0.4s + ret +.size _armv8_sm4_probe,.-_armv8_sm4_probe + .globl _armv8_sha512_probe .type _armv8_sha512_probe,%function _armv8_sha512_probe: @@ -78,6 +85,13 @@ _armv8_sha512_probe: ret .size _armv8_sha512_probe,.-_armv8_sha512_probe +.globl _armv8_sm3_probe +.type _armv8_sm3_probe,%function +_armv8_sm3_probe: + .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s + ret +.size _armv8_sm3_probe,.-_armv8_sm3_probe + .globl OPENSSL_cleanse .type OPENSSL_cleanse,%function .align 5 diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index 8b7105571d78a4b6e62bf4ca84d8f706074b5ce5..0f6f7ca24ce054c848621835faf238609006db9b 100644 --- a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -80,5 +80,7 @@ extern unsigned int OPENSSL_armcap_P; # define ARMV8_SHA256 (1<<4) # define ARMV8_PMULL (1<<5) # define ARMV8_SHA512 (1<<6) +# define ARMV8_SM3 (1<<9) +# define ARMV8_SM4 (1<<10) #endif diff --git a/crypto/armcap.c b/crypto/armcap.c index 48c5d4d64e32a8c355ce75d1ec1ed6f5aef65331..73bcad1a574c2c4aebbf3eb042feabac3cfbed11 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -47,6 +47,8 @@ void _armv8_sha1_probe(void); void _armv8_sha256_probe(void); void _armv8_pmull_probe(void); # ifdef __aarch64__ +void _armv8_sm3_probe(void); +void _armv8_sm4_probe(void); void _armv8_sha512_probe(void); # endif uint32_t _armv7_tick(void); @@ -130,6 +132,8 @@ static unsigned long getauxval(unsigned long key) # define HWCAP_CE_PMULL (1 << 4) # define HWCAP_CE_SHA1 (1 << 5) # define HWCAP_CE_SHA256 (1 << 6) +# define HWCAP_CE_SM3 (1 << 18) +# define HWCAP_CE_SM4 (1 << 19) # define HWCAP_CE_SHA512 (1 << 21) # endif @@ -188,8 +192,14 @@ void OPENSSL_cpuid_setup(void) OPENSSL_armcap_P |= ARMV8_SHA256; # ifdef __aarch64__ + if (hwcap & HWCAP_CE_SM4) + OPENSSL_armcap_P |= ARMV8_SM4; + if (hwcap & 
HWCAP_CE_SHA512)
         OPENSSL_armcap_P |= ARMV8_SHA512;
+
+        if (hwcap & HWCAP_CE_SM3)
+            OPENSSL_armcap_P |= ARMV8_SM3;
 # endif
     }
 # endif
@@ -229,10 +239,20 @@ void OPENSSL_cpuid_setup(void)
         OPENSSL_armcap_P |= ARMV8_SHA256;
     }
 # if defined(__aarch64__) && !defined(__APPLE__)
+    if (sigsetjmp(ill_jmp, 1) == 0) {
+        _armv8_sm4_probe();
+        OPENSSL_armcap_P |= ARMV8_SM4;
+    }
+
     if (sigsetjmp(ill_jmp, 1) == 0) {
         _armv8_sha512_probe();
         OPENSSL_armcap_P |= ARMV8_SHA512;
     }
+
+    if (sigsetjmp(ill_jmp, 1) == 0) {
+        _armv8_sm3_probe();
+        OPENSSL_armcap_P |= ARMV8_SM3;
+    }
 # endif
     }
 # endif
diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c
index 22fdcc409c165c2d695f4be6663b7435be34e366..01b0d1f8ca14d713533dc56988305817c564180b 100644
--- a/crypto/evp/c_allc.c
+++ b/crypto/evp/c_allc.c
@@ -85,6 +85,7 @@ void openssl_add_all_ciphers_int(void)
     EVP_add_cipher(EVP_sm4_cfb());
     EVP_add_cipher(EVP_sm4_ofb());
     EVP_add_cipher(EVP_sm4_ctr());
+    EVP_add_cipher(EVP_sm4_xts());
     EVP_add_cipher_alias(SN_sm4_cbc, "SM4");
     EVP_add_cipher_alias(SN_sm4_cbc, "sm4");
 #endif
diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
index fce32794fc51405c1b3e69d272aefff5ec11e5cc..da4dbd34a670bb0b610406102a2a39a99f09affb 100644
--- a/crypto/evp/e_sm4.c
+++ b/crypto/evp/e_sm4.c
@@ -15,86 +15,398 @@
 # include <openssl/modes.h>
 # include "crypto/sm4.h"
 # include "crypto/evp.h"
+# include "crypto/sm4_platform.h"
+# include "evp_local.h"
+# include "modes_local.h"
+
+
 typedef struct {
-    SM4_KEY ks;
+    union {
+        double align;
+        SM4_KEY ks;
+    } ks;
+    block128_f block;
+    union {
+        ecb128_f ecb;
+        cbc128_f cbc;
+        ctr128_f ctr;
+    } stream;
 } EVP_SM4_KEY;
 
+# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER sm4_##mode = { \
+        nid##_##nmode,blocksize,128/8,ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        sm4_init_key,                   \
+        sm4_##mode##_cipher,            \
+        NULL,                           \
+        sizeof(EVP_SM4_KEY),            \
+        NULL,NULL,NULL,NULL };          \
+const EVP_CIPHER *EVP_sm4_##mode(void) \
+{ return &sm4_##mode; }
+
+#define BLOCK_CIPHER_generic_pack(nid,flags) \
+    BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+    BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+    BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+    BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
+    BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags)
+
 static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                         const unsigned char *iv, int enc)
 {
-    SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+    int mode;
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx);
+
+    mode = EVP_CIPHER_CTX_mode(ctx);
+    if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) {
+#ifdef HWSM4_CAPABLE
+        if (HWSM4_CAPABLE) {
+            HWSM4_set_decrypt_key(key, &dat->ks.ks);
+            dat->block = (block128_f) HWSM4_decrypt;
+            dat->stream.cbc = NULL;
+# ifdef HWSM4_cbc_encrypt
+            if (mode == EVP_CIPH_CBC_MODE)
+                dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
+# endif
+# ifdef HWSM4_ecb_encrypt
+            if (mode == EVP_CIPH_ECB_MODE)
+                dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
+# endif
+        } else
+#endif
+#ifdef VPSM4_EX_CAPABLE
+        if (VPSM4_EX_CAPABLE) {
+            vpsm4_ex_set_decrypt_key(key, &dat->ks.ks);
+            dat->block = (block128_f) vpsm4_ex_decrypt;
+            if (mode == EVP_CIPH_ECB_MODE)
+                dat->stream.ecb = (ecb128_f) vpsm4_ex_ecb_encrypt;
+        } else
+#endif
+        {
+            dat->block = (block128_f)SM4_decrypt;
+            SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+        }
+    } else {
+#ifdef HWSM4_CAPABLE
+        if (HWSM4_CAPABLE)
{ + HWSM4_set_encrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) HWSM4_encrypt; + dat->stream.cbc = NULL; +# ifdef HWSM4_cbc_encrypt + if (mode == EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; + else +# endif +# ifdef HWSM4_ecb_encrypt + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; + else +# endif +# ifdef HWSM4_ctr32_encrypt_blocks + if (mode == EVP_CIPH_CTR_MODE) + dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks; + else +# endif + (void)0; /* terminate potentially open 'else' */ + } else +#endif +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_encrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) vpsm4_ex_encrypt; + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) vpsm4_ex_ecb_encrypt; + } else +#endif + { + dat->block = (block128_f)SM4_encrypt; + SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + } + } return 1; } -static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out, - size_t len, const SM4_KEY *key, - unsigned char *ivec, const int enc) +static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - if (enc) - CRYPTO_cbc128_encrypt(in, out, len, key, ivec, - (block128_f)SM4_encrypt); + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (dat->stream.cbc) + (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv, + EVP_CIPHER_CTX_encrypting(ctx)); + else if (EVP_CIPHER_CTX_encrypting(ctx)) + CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); else - CRYPTO_cbc128_decrypt(in, out, len, key, ivec, - (block128_f)SM4_decrypt); + CRYPTO_cbc128_decrypt(in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); + return 1; } -static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out, - size_t length, const SM4_KEY *key, - unsigned char *ivec, int *num, const int enc) +static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc, - (block128_f)SM4_encrypt); + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + int num = EVP_CIPHER_CTX_num(ctx); + + CRYPTO_cfb128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, &num, + EVP_CIPHER_CTX_encrypting(ctx), dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + + return 1; } -static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, - const SM4_KEY *key, const int enc) +static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - if (enc) - SM4_encrypt(in, out, key); + size_t bl = EVP_CIPHER_CTX_block_size(ctx); + size_t i; + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (len < bl){ + return 1; + } + if (dat->stream.ecb != NULL) + (*dat->stream.ecb) (in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_encrypting(ctx)); else - SM4_decrypt(in, out, key); + for (i = 0, len -= bl; i <= len; i += bl) + (*dat->block) (in + i, out + i, &dat->ks.ks); + return 1; } -static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out, - size_t length, const SM4_KEY *key, - unsigned char *ivec, int *num) +static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num, - (block128_f)SM4_encrypt); -} + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + int num = EVP_CIPHER_CTX_num(ctx); -IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, 
NID_sm4, - 16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1, - sm4_init_key, 0, 0, 0, 0) + CRYPTO_ofb128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, &num, dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + return 1; +} static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t len) { - unsigned int num = EVP_CIPHER_CTX_num(ctx); - EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx); + int n = EVP_CIPHER_CTX_num(ctx); + unsigned int num; + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (n < 0) + return 0; + num = (unsigned int)n; - CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, - EVP_CIPHER_CTX_iv_noconst(ctx), - EVP_CIPHER_CTX_buf_noconst(ctx), &num, - (block128_f)SM4_encrypt); + if (dat->stream.ctr) + CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks, + ctx->iv, + EVP_CIPHER_CTX_buf_noconst(ctx), + &num, dat->stream.ctr); + else + CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, + EVP_CIPHER_CTX_buf_noconst(ctx), &num, + dat->block); EVP_CIPHER_CTX_set_num(ctx, num); return 1; } -static const EVP_CIPHER sm4_ctr_mode = { - NID_sm4_ctr, 1, 16, 16, - EVP_CIPH_CTR_MODE, - sm4_init_key, - sm4_ctr_cipher, - NULL, - sizeof(EVP_SM4_KEY), - NULL, NULL, NULL, NULL -}; +BLOCK_CIPHER_generic_pack(NID_sm4, 0) + +typedef struct { + union { + double align; + SM4_KEY ks; + } ks1, ks2; /* sm4 key schedules to use */ + XTS128_CONTEXT xts; + int std; /* 0 for xts mode in GB/T 17964-2021 */ + /* 1 for xts mode in IEEE Std 1619-2007 */ + void (*stream_gb) (const unsigned char *in, + unsigned char *out, size_t length, + const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char iv[16]); /* stream for xts mode in GB/T 17964-2021 */ + void (*stream) (const unsigned char *in, + unsigned char *out, size_t length, + const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char iv[16]); /* stream for xts mode in IEEE Std 1619-2007 */ +} EVP_SM4_XTS_CTX; + +static int sm4_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) +{ + EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX, c); + + if (type == EVP_CTRL_COPY) { + EVP_CIPHER_CTX *out = ptr; + EVP_SM4_XTS_CTX *xctx_out = EVP_C_DATA(EVP_SM4_XTS_CTX,out); + + if (xctx->xts.key1) { + if (xctx->xts.key1 != &xctx->ks1) + return 0; + xctx_out->xts.key1 = &xctx_out->ks1; + } + if (xctx->xts.key2) { + if (xctx->xts.key2 != &xctx->ks2) + return 0; + xctx_out->xts.key2 = &xctx_out->ks2; + } + return 1; + } else if (type == EVP_CTRL_XTS_STANDARD) { + if ((arg < 0) || (arg > 1)) + return 0; + xctx->std = arg; + return 1; + } else if (type != EVP_CTRL_INIT) + return -1; + /* key1 and key2 are used as an indicator both key and IV are set */ + xctx->xts.key1 = NULL; + xctx->xts.key2 = NULL; + return 1; +} + +static int sm4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) +{ + EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX,ctx); + + if (!iv && !key) + return 1; + + if (key) + do { + /* The key is two half length keys in reality */ + const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2; + xctx->stream_gb = NULL; + xctx->stream = NULL; +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + if (enc) { + HWSM4_set_encrypt_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) HWSM4_encrypt; +# ifdef HWSM4_xts_encrypt_gb + xctx->stream_gb = HWSM4_xts_encrypt_gb; +# endif +# ifdef HWSM4_xts_encrypt + xctx->stream = HWSM4_xts_encrypt; +# endif + } else { + HWSM4_set_decrypt_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) HWSM4_decrypt; +# ifdef HWSM4_xts_decrypt_gb + 
xctx->stream_gb = HWSM4_xts_decrypt_gb; +# endif +# ifdef HWSM4_xts_decrypt + xctx->stream = HWSM4_xts_decrypt; +# endif + } + HWSM4_set_encrypt_key(key + bytes, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f) HWSM4_encrypt; -const EVP_CIPHER *EVP_sm4_ctr(void) + xctx->xts.key1 = &xctx->ks1; + break; + } else +#endif +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + if (enc) { + vpsm4_ex_set_encrypt_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) vpsm4_ex_encrypt; + xctx->stream_gb = vpsm4_ex_xts_encrypt_gb; + xctx->stream = vpsm4_ex_xts_encrypt; + } else { + vpsm4_ex_set_decrypt_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) vpsm4_ex_decrypt; + xctx->stream_gb = vpsm4_ex_xts_decrypt_gb; + xctx->stream = vpsm4_ex_xts_decrypt; + } + vpsm4_ex_set_encrypt_key(key + bytes, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f) vpsm4_ex_encrypt; + + xctx->xts.key1 = &xctx->ks1; + break; + } else +#endif + (void)0; /* terminate potentially open 'else' */ + + if (enc) { + SM4_set_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) SM4_encrypt; + } else { + SM4_set_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) SM4_decrypt; + } + + SM4_set_key(key + bytes, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f) SM4_encrypt; + + xctx->xts.key1 = &xctx->ks1; + } while (0); + + if (iv) { + xctx->xts.key2 = &xctx->ks2; + memcpy(EVP_CIPHER_CTX_iv_noconst(ctx), iv, 16); + } + + return 1; +} + +static int sm4_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) +{ + EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX,ctx); + if (!xctx->xts.key1 || !xctx->xts.key2) + return 0; + if (!out || !in || len < SM4_BLOCK_SIZE) + return 0; + if (xctx->std) { + if (xctx->stream) + (*xctx->stream) (in, out, len, + xctx->xts.key1, xctx->xts.key2, + EVP_CIPHER_CTX_iv_noconst(ctx)); + else if (CRYPTO_xts128_encrypt(&xctx->xts, EVP_CIPHER_CTX_iv_noconst(ctx), + in, out, len, + EVP_CIPHER_CTX_encrypting(ctx))) + return 0; + } else { + if (xctx->stream_gb) + (*xctx->stream_gb) (in, out, len, + xctx->xts.key1, xctx->xts.key2, + EVP_CIPHER_CTX_iv_noconst(ctx)); + else if (CRYPTO_xts128gb_encrypt(&xctx->xts, EVP_CIPHER_CTX_iv_noconst(ctx), + in, out, len, + EVP_CIPHER_CTX_encrypting(ctx))) + return 0; + } + return 1; +} + +#define SM4_XTS_BLOCK_SIZE 1 +#define SM4_XTS_IV_LENGTH 16 +#define SM4_XTS_KEY_LENGTH 32 + +#define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \ + | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \ + | EVP_CIPH_CUSTOM_COPY | EVP_CIPH_XTS_MODE) + +static const EVP_CIPHER sm4_xts_mode = { + NID_sm4_xts, + SM4_XTS_BLOCK_SIZE, + SM4_XTS_KEY_LENGTH, + SM4_XTS_IV_LENGTH, + XTS_FLAGS, + sm4_xts_init_key, + sm4_xts_cipher, + NULL, + sizeof(EVP_SM4_XTS_CTX), + NULL, NULL, sm4_xts_ctrl, NULL +}; + +const EVP_CIPHER *EVP_sm4_xts(void) { - return &sm4_ctr_mode; + return &sm4_xts_mode; } #endif diff --git a/crypto/modes/build.info b/crypto/modes/build.info index 821340eb909a298fae6c006368e2317a215fde13..f974b044576c173680a0913c6359f2a5cb6fdc8f 100644 --- a/crypto/modes/build.info +++ b/crypto/modes/build.info @@ -1,7 +1,7 @@ LIBS=../../libcrypto SOURCE[../../libcrypto]=\ cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \ - ccm128.c xts128.c wrap128.c ocb128.c \ + ccm128.c xts128.c xts128gb.c wrap128.c ocb128.c \ {- $target{modes_asm_src} -} INCLUDE[gcm128.o]=.. 
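(Editorial note, not part of the patch.) The new EVP_sm4_xts() cipher's context data is zero-initialized, so xctx->std defaults to 0, i.e. the GB/T 17964-2021 tweak schedule; callers opt into IEEE Std 1619-2007 behaviour with the EVP_CTRL_XTS_STANDARD control defined by this patch series. A minimal usage sketch in C, assuming the EVP_sm4_xts() prototype and the EVP_CTRL_XTS_STANDARD constant are exported by the series' headers (the helper name sm4_xts_encrypt is hypothetical):

#include <openssl/evp.h>

/* Encrypt inlen bytes (inlen >= 16) with SM4-XTS; std = 0 selects
 * GB/T 17964-2021, std = 1 selects IEEE Std 1619-2007. */
static int sm4_xts_encrypt(const unsigned char key[32],   /* two SM4 keys */
                           const unsigned char iv[16], int std,
                           const unsigned char *in, int inlen,
                           unsigned char *out, int *outlen)
{
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    int len = 0, ok = 0;

    if (ctx == NULL)
        return 0;
    /* XTS takes key and IV together; the ctrl selects the tweak schedule */
    ok = EVP_EncryptInit_ex(ctx, EVP_sm4_xts(), NULL, key, iv)
         && EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, std, NULL)
         && EVP_EncryptUpdate(ctx, out, &len, in, inlen);
    *outlen = len;
    if (ok && EVP_EncryptFinal_ex(ctx, out + len, &len))
        *outlen += len;       /* XTS is length-preserving; len is 0 here */
    else
        ok = 0;
    EVP_CIPHER_CTX_free(ctx);
    return ok;
}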
diff --git a/crypto/modes/xts128gb.c b/crypto/modes/xts128gb.c
new file mode 100644
index 0000000000000000000000000000000000000000..370b97522913435d13519f5639380183369ee915
--- /dev/null
+++ b/crypto/modes/xts128gb.c
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the OpenSSL license (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/* This is the XTS mode specified in GB/T 17964-2021 */
+#include <openssl/crypto.h>
+#include "modes_local.h"
+#include <string.h>
+
+#ifndef STRICT_ALIGNMENT
+# ifdef __GNUC__
+typedef u64 u64_a1 __attribute((__aligned__(1)));
+# else
+typedef u64 u64_a1;
+# endif
+#endif
+
+int CRYPTO_xts128gb_encrypt(const XTS128_CONTEXT *ctx,
+                            const unsigned char iv[16],
+                            const unsigned char *inp, unsigned char *out,
+                            size_t len, int enc)
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = {
+        1
+    };
+    union {
+        u64 u[2];
+        u32 d[4];
+        u8 c[16];
+    } tweak, scratch;
+    unsigned int i;
+
+    if (len < 16)
+        return -1;
+
+    memcpy(tweak.c, iv, 16);
+
+    (*ctx->block2) (tweak.c, tweak.c, ctx->key2);
+
+    if (!enc && (len % 16))
+        len -= 16;
+
+    while (len >= 16) {
+#if defined(STRICT_ALIGNMENT)
+        memcpy(scratch.c, inp, 16);
+        scratch.u[0] ^= tweak.u[0];
+        scratch.u[1] ^= tweak.u[1];
+#else
+        scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0];
+        scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1];
+#endif
+        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+        scratch.u[0] ^= tweak.u[0];
+        scratch.u[1] ^= tweak.u[1];
+        memcpy(out, scratch.c, 16);
+#else
+        ((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0];
+        ((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1];
+#endif
+        inp += 16;
+        out += 16;
+        len -= 16;
+
+        if (len == 0)
+            return 0;
+
+        if (is_endian.little) {
+            u8 res;
+            u64 hi, lo;
+#ifdef BSWAP8
+            hi = BSWAP8(tweak.u[0]);
+            lo = BSWAP8(tweak.u[1]);
+#else
+            u8 *p = tweak.c;
+
+            hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
+            lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
+#endif
+            res = (u8)lo & 1;
+            tweak.u[0] = (lo >> 1) | (hi << 63);
+            tweak.u[1] = hi >> 1;
+            if (res)
+                tweak.c[15] ^= 0xe1;
+#ifdef BSWAP8
+            hi = BSWAP8(tweak.u[0]);
+            lo = BSWAP8(tweak.u[1]);
+#else
+            p = tweak.c;
+
+            hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
+            lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
+#endif
+            tweak.u[0] = lo;
+            tweak.u[1] = hi;
+        } else {
+            u8 Cin, Cout;
+            Cin = Cout = 0;
+            for (i = 0; i < 16; ++i) {
+                Cout = (tweak.c[i] << 7) & 0x80;
+                tweak.c[i] = ((tweak.c[i] >> 1) + Cin) & 0xff;
+                Cin = Cout;
+            }
+            if (Cout)
+                tweak.c[0] ^= 0xe1;
+        }
+    }
+    if (enc) {
+        for (i = 0; i < len; ++i) {
+            u8 c = inp[i];
+            out[i] = scratch.c[i];
+            scratch.c[i] = c;
+        }
+        scratch.u[0] ^= tweak.u[0];
+        scratch.u[1] ^= tweak.u[1];
+        (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
+        scratch.u[0] ^= tweak.u[0];
+        scratch.u[1] ^= tweak.u[1];
+        memcpy(out - 16, scratch.c, 16);
+    } else {
+        union {
+            u64 u[2];
+            u8 c[16];
+        } tweak1;
+
+        if (is_endian.little) {
+            u8 res;
+            u64 hi, lo;
+#ifdef BSWAP8
+            hi = BSWAP8(tweak.u[0]);
+            lo = BSWAP8(tweak.u[1]);
+#else
+            u8 *p = tweak.c;
+
+            hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
+            lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
+#endif
+            res = (u8)lo & 1;
+            tweak1.u[0] = (lo >> 1) | (hi << 63);
+            tweak1.u[1] = hi >> 1;
+            if (res)
+                tweak1.c[15] ^= 0xe1;
+#ifdef BSWAP8
+            hi = BSWAP8(tweak1.u[0]);
+            lo = BSWAP8(tweak1.u[1]);
+#else
+            p = tweak1.c;
+ + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + tweak1.u[0] = lo; + tweak1.u[1] = hi; + } else { + u8 Cin, Cout; + Cin = Cout = 0; + for ( i = 0; i < 16; ++i ) { + Cout = (tweak.c[i] << 7) & 0x80; + tweak1.c[i] = ((tweak.c[i] >> 1) + Cin) & 0xff; + Cin = Cout; + } + if (Cout) + tweak1.c[0] ^= 0xe1; + } +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c, inp, 16); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; +#else + scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0]; + scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1]; +#endif + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; + + for (i = 0; i < len; ++i) { + u8 c = inp[16 + i]; + out[16 + i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out, scratch.c, 16); +#else + ((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0]; + ((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1]; +#endif + } + + return 0; +} diff --git a/crypto/objects/obj_dat.h b/crypto/objects/obj_dat.h index 63bf69e4437d956633eb6ad4bc37bbca422f1204..36c38d0d2233075bc03c8454b51eba25a4136944 100644 --- a/crypto/objects/obj_dat.h +++ b/crypto/objects/obj_dat.h @@ -10,7 +10,7 @@ */ /* Serialized OID's */ -static const unsigned char so[7762] = { +static const unsigned char so[7770] = { 0x2A,0x86,0x48,0x86,0xF7,0x0D, /* [ 0] OBJ_rsadsi */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01, /* [ 6] OBJ_pkcs */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x02, /* [ 13] OBJ_md2 */ @@ -1076,9 +1076,10 @@ static const unsigned char so[7762] = { 0x2A,0x85,0x03,0x07,0x01,0x02,0x01,0x01,0x04, /* [ 7736] OBJ_id_tc26_gost_3410_2012_256_paramSetD */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x0C, /* [ 7745] OBJ_hmacWithSHA512_224 */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x0D, /* [ 7753] OBJ_hmacWithSHA512_256 */ + 0x2A,0x81,0x1C,0xCF,0x55,0x01,0x68,0x0A, /* [ 7761] OBJ_sm4_xts */ }; -#define NUM_NID 1195 +#define NUM_NID 1197 static const ASN1_OBJECT nid_objs[NUM_NID] = { {"UNDEF", "undefined", NID_undef}, {"rsadsi", "RSA Data Security, Inc.", NID_rsadsi, 6, &so[0]}, @@ -2275,9 +2276,11 @@ static const ASN1_OBJECT nid_objs[NUM_NID] = { {"magma-mac", "magma-mac", NID_magma_mac}, {"hmacWithSHA512-224", "hmacWithSHA512-224", NID_hmacWithSHA512_224, 8, &so[7745]}, {"hmacWithSHA512-256", "hmacWithSHA512-256", NID_hmacWithSHA512_256, 8, &so[7753]}, + { NULL, NULL, NID_undef }, + {"SM4-XTS", "sm4-xts", NID_sm4_xts, 8, &so[7761]}, }; -#define NUM_SN 1186 +#define NUM_SN 1187 static const unsigned int sn_objs[NUM_SN] = { 364, /* "AD_DVCS" */ 419, /* "AES-128-CBC" */ @@ -2551,6 +2554,7 @@ static const unsigned int sn_objs[NUM_SN] = { 1139, /* "SM4-CTR" */ 1133, /* "SM4-ECB" */ 1135, /* "SM4-OFB" */ + 1196, /* "SM4-XTS" */ 188, /* "SMIME" */ 167, /* "SMIME-CAPS" */ 100, /* "SN" */ @@ -3467,7 +3471,7 @@ static const unsigned int sn_objs[NUM_SN] = { 1093, /* "x509ExtAdmission" */ }; -#define NUM_LN 1186 +#define NUM_LN 1187 static const unsigned int ln_objs[NUM_LN] = { 363, /* "AD Time Stamping" */ 405, /* "ANSI X9.62" */ @@ -4609,6 +4613,7 @@ static const unsigned int ln_objs[NUM_LN] = { 1139, /* "sm4-ctr" */ 1133, /* "sm4-ecb" */ 1135, /* "sm4-ofb" */ + 1196, /* "sm4-xts" */ 16, /* "stateOrProvinceName" */ 660, /* "streetAddress" */ 498, /* "subtreeMaximumQuality" */ @@ -4657,7 +4662,7 @@ static const unsigned int 
ln_objs[NUM_LN] = {
     125,                        /* "zlib compression" */
 };
 
-#define NUM_OBJ 1071
+#define NUM_OBJ 1072
 static const unsigned int obj_objs[NUM_OBJ] = {
     0,                          /* OBJ_undef                        0 */
     181,                        /* OBJ_iso                          1 */
@@ -5124,6 +5129,7 @@ static const unsigned int obj_objs[NUM_OBJ] = {
     1136,                       /* OBJ_sm4_cfb1             1 2 156 10197 1 104 5 */
     1138,                       /* OBJ_sm4_cfb8             1 2 156 10197 1 104 6 */
     1139,                       /* OBJ_sm4_ctr              1 2 156 10197 1 104 7 */
+    1196,                       /* OBJ_sm4_xts              1 2 156 10197 1 104 10 */
     1172,                       /* OBJ_sm2                  1 2 156 10197 1 301 */
     1143,                       /* OBJ_sm3                  1 2 156 10197 1 401 */
     1144,                       /* OBJ_sm3WithRSAEncryption 1 2 156 10197 1 504 */
diff --git a/crypto/objects/obj_mac.num b/crypto/objects/obj_mac.num
index 1b6a9c61a1c873dbe14100460e1eba0439e27c8d..d1de6e19976d96f909d0c68eda3f5f4360bbdc7b 100644
--- a/crypto/objects/obj_mac.num
+++ b/crypto/objects/obj_mac.num
@@ -1192,3 +1192,4 @@ magma_cfb 1191
 magma_mac 1192
 hmacWithSHA512_224 1193
 hmacWithSHA512_256 1194
+sm4_xts 1196
diff --git a/crypto/objects/objects.txt b/crypto/objects/objects.txt
index c49d4c568b59e3f6566a452ce73e2dcb60ae9a97..14495f2ad899d7f2d69c4df21b02c97a2abc81fe 100644
--- a/crypto/objects/objects.txt
+++ b/crypto/objects/objects.txt
@@ -1518,6 +1518,7 @@ sm-scheme 104 4     : SM4-CFB     : sm4-cfb
 sm-scheme 104 5     : SM4-CFB1    : sm4-cfb1
 sm-scheme 104 6     : SM4-CFB8    : sm4-cfb8
 sm-scheme 104 7     : SM4-CTR     : sm4-ctr
+sm-scheme 104 10    : SM4-XTS     : sm4-xts
 
 # There is no OID that just denotes "HMAC" oddly enough...
diff --git a/crypto/perlasm/arm-xlate.pl b/crypto/perlasm/arm-xlate.pl
index 48819be54052d5ec8b482a23d65667b4c02d8d98..a2f38383469b489724832018b77f8c8f5ecc9789 100755
--- a/crypto/perlasm/arm-xlate.pl
+++ b/crypto/perlasm/arm-xlate.pl
@@ -170,6 +170,16 @@ while(my $line=<>) {
 	}
     }
 
+    # ldr REG, =VALUE pseudo-instruction - avoid clang issue with Neon registers
+    #
+    if ($line =~ /^\s*ldr\s+([qd]\d\d?)\s*,\s*=(\w+)/i) {
+        # Immediate load via literal pool into qN or dN - clang max is 2^32-1
+        my ($reg, $value) = ($1, $2);
+        # If $value is hex, 0x + 8 hex chars = 10 chars total will be okay
+        # If $value is decimal, 2^32 - 1 = 4294967295 will be okay (also 10 chars)
+        die("$line: immediate load via literal pool into $reg: value too large for clang - redo manually") if length($value) > 10;
+    }
+
     print $line if ($line);
     print "\n";
 }
diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
new file mode 100644
index 0000000000000000000000000000000000000000..677ca525d6964259402cf698b31fe11eaa60941f
--- /dev/null
+++ b/crypto/sm3/asm/sm3-armv8.pl
@@ -0,0 +1,280 @@
+#! /usr/bin/env perl
+# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# This module implements support for Armv8 SM3 instructions
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ?
shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Message expansion:
+#	Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
+# Input: s0, s1, s2, s3
+#	s0 = w0  | w1  | w2  | w3
+#	s1 = w4  | w5  | w6  | w7
+#	s2 = w8  | w9  | w10 | w11
+#	s3 = w12 | w13 | w14 | w15
+# Output: s4
+sub msg_exp () {
+my $s0 = shift;
+my $s1 = shift;
+my $s2 = shift;
+my $s3 = shift;
+my $s4 = shift;
+my $vtmp1 = shift;
+my $vtmp2 = shift;
+$code.=<<___;
+	// s4 = w7  | w8  | w9  | w10
+	ext	$s4.16b, $s1.16b, $s2.16b, #12
+	// vtmp1 = w3  | w4  | w5  | w6
+	ext	$vtmp1.16b, $s0.16b, $s1.16b, #12
+	// vtmp2 = w10 | w11 | w12 | w13
+	ext	$vtmp2.16b, $s2.16b, $s3.16b, #8
+	sm3partw1	$s4.4s, $s0.4s, $s3.4s
+	sm3partw2	$s4.4s, $vtmp2.4s, $vtmp1.4s
+___
+}
+
+# A round of the compression function
+# Input:
+#	ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
+#	vstate0 - vstate1, store the digest state (A - H)
+#	vconst0 - vconst1, used alternately to store Tj <<< j
+#	vtmp - temporary register
+#	vw - for sm3tt1ab, vw = s0 eor s1
+#	s0 - for sm3tt2ab, just s0
+#	i - choose wj' or wj from vw
+sub round () {
+my $ab = shift;
+my $vstate0 = shift;
+my $vstate1 = shift;
+my $vconst0 = shift;
+my $vconst1 = shift;
+my $vtmp = shift;
+my $vw = shift;
+my $s0 = shift;
+my $i = shift;
+$code.=<<___;
+	sm3ss1	$vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
+	shl	$vconst1.4s, $vconst0.4s, #1
+	sri	$vconst1.4s, $vconst0.4s, #31
+	sm3tt1$ab	$vstate0.4s, $vtmp.4s, $vw.4s[$i]
+	sm3tt2$ab	$vstate1.4s, $vtmp.4s, $s0.4s[$i]
+___
+}
+
+sub qround () {
+my $ab = shift;
+my $vstate0 = shift;
+my $vstate1 = shift;
+my $vconst0 = shift;
+my $vconst1 = shift;
+my $vtmp1 = shift;
+my $vtmp2 = shift;
+my $s0 = shift;
+my $s1 = shift;
+my $s2 = shift;
+my $s3 = shift;
+my $s4 = shift;
+	if($s4) {
+		&msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
+	}
+$code.=<<___;
+	eor	$vtmp1.16b, $s0.16b, $s1.16b
+___
+	&round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
+	       $vtmp1, $s0, 0);
+	&round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
+	       $vtmp1, $s0, 1);
+	&round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
+	       $vtmp1, $s0, 2);
+	&round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
+	       $vtmp1, $s0, 3);
+}
+
+$code=<<___;
+#include "arm_arch.h"
+.arch	armv8.2-a
+.text
+___
+
+{{{
+my ($pstate,$pdata,$num)=("x0","x1","w2");
+my ($state1,$state2)=("v5","v6");
+my ($sconst1, $sconst2)=("s16","s17");
+my ($vconst1, $vconst2)=("v16","v17");
+my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
+my ($bkstate1,$bkstate2)=("v18","v19");
+my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
+my ($vtmp1,$vtmp2)=("v22","v23");
+my $constaddr="x8";
+# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
+$code.=<<___;
+.globl	ossl_hwsm3_block_data_order
+.type	ossl_hwsm3_block_data_order,%function
+.align	5
+ossl_hwsm3_block_data_order:
+	// load state
+	ld1	{$state1.4s-$state2.4s}, [$pstate]
+	rev64	$state1.4s, $state1.4s
+	rev64	$state2.4s, $state2.4s
+	ext	$state1.16b, $state1.16b, $state1.16b, #8
+	ext	$state2.16b, $state2.16b, $state2.16b, #8
+
+	adr	$constaddr, .Tj
+	ldp	$sconst1, $sconst2, [$constaddr]
+
+.Loop:
+	// load input
+	ld1	{$s0.16b-$s3.16b}, [$pdata], #64
+	sub	$num, $num, #1
+
+	mov	$bkstate1.16b, $state1.16b
+	mov	$bkstate2.16b, $state2.16b
+
+#ifndef
__ARMEB__
+	rev32	$s0.16b, $s0.16b
+	rev32	$s1.16b, $s1.16b
+	rev32	$s2.16b, $s2.16b
+	rev32	$s3.16b, $s3.16b
+#endif
+
+	ext	$vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
+___
+	&qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s0,$s1,$s2,$s3,$s4);
+	&qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s1,$s2,$s3,$s4,$s0);
+	&qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s2,$s3,$s4,$s0,$s1);
+	&qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s3,$s4,$s0,$s1,$s2);
+
+$code.=<<___;
+	ext	$vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
+___
+
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s4,$s0,$s1,$s2,$s3);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s0,$s1,$s2,$s3,$s4);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s1,$s2,$s3,$s4,$s0);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s2,$s3,$s4,$s0,$s1);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s3,$s4,$s0,$s1,$s2);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s4,$s0,$s1,$s2,$s3);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s0,$s1,$s2,$s3,$s4);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s1,$s2,$s3,$s4,$s0);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s2,$s3,$s4,$s0,$s1);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s3,$s4);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s4,$s0);
+	&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+	        $s0,$s1);
+
+$code.=<<___;
+	eor	$state1.16b, $state1.16b, $bkstate1.16b
+	eor	$state2.16b, $state2.16b, $bkstate2.16b
+
+	// any remaining blocks?
+	cbnz	$num, .Loop
+
+	// save state
+	rev64	$state1.4s, $state1.4s
+	rev64	$state2.4s, $state2.4s
+	ext	$state1.16b, $state1.16b, $state1.16b, #8
+	ext	$state2.16b, $state2.16b, $state2.16b, #8
+	st1	{$state1.4s-$state2.4s}, [$pstate]
+	ret
+.size	ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
+
+.align	3
+.Tj:
+.word	0x79cc4519, 0x9d8a7a87
+___
+}}}
+
+#########################################
+my %sm3partopcode = (
+	"sm3partw1"	=>	0xce60C000,
+	"sm3partw2"	=>	0xce60C400);
+
+my %sm3ss1opcode = (
+	"sm3ss1"	=>	0xce400000);
+
+my %sm3ttopcode = (
+	"sm3tt1a"	=>	0xce408000,
+	"sm3tt1b"	=>	0xce408400,
+	"sm3tt2a"	=>	0xce408800,
+	"sm3tt2b"	=>	0xce408C00);
+
+sub unsm3part {
+	my ($mnemonic,$arg)=@_;
+
+	$arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+	$sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
+	$mnemonic,$arg;
+}
+
+sub unsm3ss1 {
+	my ($mnemonic,$arg)=@_;
+
+	$arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+	$sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
+	$mnemonic,$arg;
+}
+
+sub unsm3tt {
+	my ($mnemonic,$arg)=@_;
+
+	$arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
+	&&
+	sprintf ".inst\t0x%08x\t//%s %s",
+	$sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
+	$mnemonic,$arg;
+}
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/\/\// and !/^$/);
+	print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+	s/\`([^\`]*)\`/eval($1)/ge;
+
+	s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
+	s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
+	s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
+	print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info
index 6009b1949eb69b63b1076493e61e6a13da337155..e1137298343ff0ee0276f847c45f117940740db5 100644
--- a/crypto/sm3/build.info
+++ b/crypto/sm3/build.info
@@ -1,2 +1,15 @@
 LIBS=../../libcrypto
-SOURCE[../../libcrypto]=sm3.c m_sm3.c
+SOURCE[../../libcrypto]=\
+        sm3.c m_sm3.c {- $target{sm3_asm_src} -}
+
+GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl $(PERLASM_SCHEME)
+INCLUDE[sm3-armv8.o]=..
+
+BEGINRAW[Makefile]
+##### SM3 assembler implementations
+
+# GNU make "catch all"
+{- $builddir -}/sm3-%.S:	{- $sourcedir -}/asm/sm3-%.pl
+	CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+
+ENDRAW[Makefile]
\ No newline at end of file
diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
index 7171de510de3b7f9f15d83877d9b238c287b085e..aafff637965566dab63f410b85fd1f5732f7942f 100644
--- a/crypto/sm3/sm3_local.h
+++ b/crypto/sm3/sm3_local.h
@@ -32,7 +32,21 @@
         ll=(c)->G; (void)HOST_l2c(ll, (s)); \
         ll=(c)->H; (void)HOST_l2c(ll, (s)); \
       } while (0)
-#define HASH_BLOCK_DATA_ORDER   sm3_block_data_order
+
+#if defined(SM3_ASM)
+# if defined(__aarch64__)
+#  include "crypto/arm_arch.h"
+#  define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3)
+void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+# endif
+#endif
+
+#if defined(HWSM3_CAPABLE)
+# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \
+                                              : sm3_block_data_order)
+#else
+# define HASH_BLOCK_DATA_ORDER   sm3_block_data_order
+#endif
 
 void sm3_transform(SM3_CTX *c, const unsigned char *data);
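(Editorial note, not part of the patch.) With the HASH_BLOCK_DATA_ORDER definition above, the md32-style hash framework dispatches per call on OPENSSL_armcap_P: full 64-byte blocks go to the SM3 instructions when the ARMV8_SM3 bit was set by the HWCAP/SIGILL probes, and to the generic C routine otherwise. Written out as a plain function (the name sm3_block_dispatch is hypothetical; both callees exist in this patch), the macro is equivalent to:

#include "crypto/arm_arch.h"   /* OPENSSL_armcap_P, ARMV8_SM3 */

/* Equivalent of the macro above: pick the block routine at run time. */
static void sm3_block_dispatch(SM3_CTX *c, const void *p, size_t num)
{
    if (OPENSSL_armcap_P & ARMV8_SM3)
        ossl_hwsm3_block_data_order(c, p, num);   /* sm3-armv8.S */
    else
        sm3_block_data_order(c, p, num);          /* generic C */
}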
diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl
new file mode 100644
index 0000000000000000000000000000000000000000..07ba53afdf159c5251b79c7bdcb883f6647de44a
--- /dev/null
+++ b/crypto/sm4/asm/sm4-armv8.pl
@@ -0,0 +1,1123 @@
+#! /usr/bin/env perl
+# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# This module implements SM4 hardware support on aarch64
+# Oct 2021
+#
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$prefix="sm4_v8";
+my @rks=map("v$_",(0..7));
+
+sub rev32() {
+my $dst = shift;
+my $src = shift;
+$code.=<<___;
+#ifndef __ARMEB__
+	rev32	$dst.16b,$src.16b
+#endif
+___
+}
+
+sub enc_blk () {
+my $data = shift;
+$code.=<<___;
+	sm4e	$data.4s,@rks[0].4s
+	sm4e	$data.4s,@rks[1].4s
+	sm4e	$data.4s,@rks[2].4s
+	sm4e	$data.4s,@rks[3].4s
+	sm4e	$data.4s,@rks[4].4s
+	sm4e	$data.4s,@rks[5].4s
+	sm4e	$data.4s,@rks[6].4s
+	sm4e	$data.4s,@rks[7].4s
+	rev64	$data.4S,$data.4S
+	ext	$data.16b,$data.16b,$data.16b,#8
+___
+}
+
+sub enc_4blks () {
+my $data0 = shift;
+my $data1 = shift;
+my $data2 = shift;
+my $data3 = shift;
+$code.=<<___;
+	sm4e	$data0.4s,@rks[0].4s
+	sm4e	$data1.4s,@rks[0].4s
+	sm4e	$data2.4s,@rks[0].4s
+	sm4e	$data3.4s,@rks[0].4s
+
+	sm4e	$data0.4s,@rks[1].4s
+	sm4e	$data1.4s,@rks[1].4s
+	sm4e	$data2.4s,@rks[1].4s
+	sm4e	$data3.4s,@rks[1].4s
+
+	sm4e	$data0.4s,@rks[2].4s
+	sm4e	$data1.4s,@rks[2].4s
+	sm4e	$data2.4s,@rks[2].4s
+	sm4e	$data3.4s,@rks[2].4s
+
+	sm4e	$data0.4s,@rks[3].4s
+	sm4e	$data1.4s,@rks[3].4s
+	sm4e	$data2.4s,@rks[3].4s
+	sm4e	$data3.4s,@rks[3].4s
+
+	sm4e	$data0.4s,@rks[4].4s
+	sm4e	$data1.4s,@rks[4].4s
+	sm4e	$data2.4s,@rks[4].4s
+	sm4e	$data3.4s,@rks[4].4s
+
+	sm4e	$data0.4s,@rks[5].4s
+	sm4e	$data1.4s,@rks[5].4s
+	sm4e	$data2.4s,@rks[5].4s
+	sm4e	$data3.4s,@rks[5].4s
+
+	sm4e	$data0.4s,@rks[6].4s
+	sm4e	$data1.4s,@rks[6].4s
+	sm4e	$data2.4s,@rks[6].4s
+	sm4e	$data3.4s,@rks[6].4s
+
+	sm4e	$data0.4s,@rks[7].4s
+	rev64	$data0.4S,$data0.4S
+	sm4e	$data1.4s,@rks[7].4s
+	ext	$data0.16b,$data0.16b,$data0.16b,#8
+	rev64	$data1.4S,$data1.4S
+	sm4e	$data2.4s,@rks[7].4s
+	ext	$data1.16b,$data1.16b,$data1.16b,#8
+	rev64	$data2.4S,$data2.4S
+	sm4e	$data3.4s,@rks[7].4s
+	ext	$data2.16b,$data2.16b,$data2.16b,#8
+	rev64	$data3.4S,$data3.4S
+	ext	$data3.16b,$data3.16b,$data3.16b,#8
+___
+}
+
+sub mov_reg_to_vec() {
+	my $src0 = shift;
+	my $src1 = shift;
+	my $desv = shift;
+$code.=<<___;
+	mov $desv.d[0],$src0
+	mov $desv.d[1],$src1
+#ifdef __ARMEB__
+	rev32	$desv.16b,$desv.16b
+#endif
+___
+}
+
+sub mov_vec_to_reg() {
+	my $srcv = shift;
+	my $des0 = shift;
+	my $des1 = shift;
+$code.=<<___; + mov $des0,$srcv.d[0] + mov $des1,$srcv.d[1] +___ +} + +sub compute_tweak() { + my $src0 = shift; + my $src1 = shift; + my $des0 = shift; + my $des1 = shift; + my $tmp0 = shift; + my $tmp1 = shift; + my $magic = shift; +$code.=<<___; + extr x$tmp1,$src1,$src1,#32 + extr $des1,$src1,$src0,#63 + and w$tmp0,w$magic,w$tmp1,asr#31 + eor $des0,x$tmp0,$src0,lsl#1 +___ +} + +sub compute_tweak_vec() { + my $src = shift; + my $des = shift; + my $tmp0 = shift; + my $tmp1 = shift; + my $magic = shift; + &rbit($tmp1,$src); +$code.=<<___; + shl $des.16b, $tmp1.16b, #1 + ext $tmp0.16b, $tmp1.16b, $tmp1.16b,#15 + ushr $tmp0.16b, $tmp0.16b, #7 + mul $tmp0.16b, $tmp0.16b, $magic.16b + eor $des.16b, $des.16b, $tmp0.16b +___ + &rbit($des,$des); +} + +sub mov_en_to_enc(){ + my $en = shift; + my $enc = shift; + if ($en eq "en") { +$code.=<<___; + mov $enc,1 +___ + } else { +$code.=<<___; + mov $enc,0 +___ + } +} + +sub rbit() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { + if ($standard eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } else { +$code.=<<___; + mov $dst.16b,$src.16b +___ + } + } else { + if ($standard eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } + } +} + +sub rev32_armeb() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifdef __ARMEB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifdef __ARMEB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8-a+crypto +.text +___ + +{{{ +$code.=<<___; +.align 6 +.Lck: + .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 + .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 + .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 + .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 + .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 + .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 + .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: + .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +.Lxts_magic: + .dword 0x0101010101010187,0x0101010101010101 +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +my ($tmp)=("x2"); +my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7)); +my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); +my ($fkconst) = ("v24"); +$code.=<<___; +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: + ld1 {$key0.4s},[$key] + adr $tmp,.Lfk + ld1 {$fkconst.4s},[$tmp] + adr $tmp,.Lck + ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 +___ + &rev32($key0, $key0); +$code.=<<___; + ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] + eor $key0.16b,$key0.16b,$fkconst.16b; + sm4ekey $key0.4S,$key0.4S,$const0.4S + sm4ekey $key1.4S,$key0.4S,$const1.4S + sm4ekey $key2.4S,$key1.4S,$const2.4S + sm4ekey $key3.4S,$key2.4S,$const3.4S + sm4ekey $key4.4S,$key3.4S,$const4.4S + st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64 + sm4ekey $key5.4S,$key4.4S,$const5.4S + sm4ekey $key6.4S,$key5.4S,$const6.4S + sm4ekey $key7.4S,$key6.4S,$const7.4S + st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys] + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +my ($tmp)=("x2"); +my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7)); +my 
($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); +my ($fkconst) = ("v24"); +$code.=<<___; +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: + ld1 {$key0.4s},[$key] + adr $tmp,.Lfk + ld1 {$fkconst.4s},[$tmp] + adr $tmp, .Lck + ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 +___ + &rev32($key0, $key0); +$code.=<<___; + ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] + eor $key0.16b, $key0.16b,$fkconst.16b; + sm4ekey $key0.4S,$key0.4S,$const0.4S + sm4ekey $key1.4S,$key0.4S,$const1.4S + sm4ekey $key2.4S,$key1.4S,$const2.4S + rev64 $key0.4s,$key0.4s + rev64 $key1.4s,$key1.4s + ext $key0.16b,$key0.16b,$key0.16b,#8 + ext $key1.16b,$key1.16b,$key1.16b,#8 + sm4ekey $key3.4S,$key2.4S,$const3.4S + sm4ekey $key4.4S,$key3.4S,$const4.4S + rev64 $key2.4s,$key2.4s + rev64 $key3.4s,$key3.4s + ext $key2.16b,$key2.16b,$key2.16b,#8 + ext $key3.16b,$key3.16b,$key3.16b,#8 + sm4ekey $key5.4S,$key4.4S,$const5.4S + sm4ekey $key6.4S,$key5.4S,$const6.4S + rev64 $key4.4s,$key4.4s + rev64 $key5.4s,$key5.4s + ext $key4.16b,$key4.16b,$key4.16b,#8 + ext $key5.16b,$key5.16b,$key5.16b,#8 + sm4ekey $key7.4S,$key6.4S,$const7.4S + rev64 $key6.4s, $key6.4s + rev64 $key7.4s, $key7.4s + ext $key6.16b,$key6.16b,$key6.16b,#8 + ext $key7.16b,$key7.16b,$key7.16b,#8 + st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64 + st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys] + ret +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} + +{{{ +sub gen_block () { +my $dir = shift; +my ($inp,$out,$rk)=map("x$_",(0..2)); +my ($data)=("v16"); +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + ld1 {$data.4s},[$inp] + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +___ + &rev32($data,$data); + &enc_blk($data); + &rev32($data,$data); +$code.=<<___; + st1 {$data.4s},[$out] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} + +&gen_block("en"); +&gen_block("de"); +}}} + +{{{ +my ($inp,$out,$len,$rk)=map("x$_",(0..3)); +my ($enc) = ("w4"); +my @dat=map("v$_",(16..23)); +$code.=<<___; +.globl ${prefix}_ecb_encrypt +.type ${prefix}_ecb_encrypt,%function +.align 5 +${prefix}_ecb_encrypt: + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +1: + cmp $len,#64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 + cmp $len,#128 + b.lt 2f + ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 + // 8 blocks +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); +$code.=<<___; + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 +___ + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,#128 + b.gt 1b + ret + // 4 blocks +2: +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + 
&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.gt 1b +1: + subs $len,$len,#16 + b.lt 1f + ld1 {@dat[0].4s},[$inp],#16 +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + st1 {@dat[0].4s},[$out],#16 + b.ne 1b +1: + ret +.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt +___ +}}} + +{{{ +my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); +my ($enc) = ("w5"); +my @dat=map("v$_",(16..23)); +my @in=map("v$_",(24..31)); +my ($ivec) = ("v8"); +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: + stp d8,d9,[sp, #-16]! + + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] + ld1 {$ivec.4s},[$ivp] + cmp $enc,#0 + b.eq .Ldec +1: + cmp $len, #64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 + eor @dat[0].16b,@dat[0].16b,$ivec.16b +___ + &rev32(@dat[1],@dat[1]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &enc_blk(@dat[0]); +$code.=<<___; + eor @dat[1].16b,@dat[1].16b,@dat[0].16b +___ + &enc_blk(@dat[1]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[2].16b,@dat[2].16b,@dat[1].16b +___ + &enc_blk(@dat[2]); + &rev32(@dat[1],@dat[1]); +$code.=<<___; + eor @dat[3].16b,@dat[3].16b,@dat[2].16b +___ + &enc_blk(@dat[3]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + mov $ivec.16b,@dat[3].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.ne 1b +1: + subs $len,$len,#16 + b.lt 3f + ld1 {@dat[0].4s},[$inp],#16 + eor $ivec.16b,$ivec.16b,@dat[0].16b +___ + &rev32($ivec,$ivec); + &enc_blk($ivec); + &rev32($ivec,$ivec); +$code.=<<___; + st1 {$ivec.16b},[$out],#16 + b.ne 1b + b 3f +.Ldec: +1: + cmp $len, #64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp] + ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 + cmp $len,#128 + b.lt 2f + // 8 blocks mode + ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp] + ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],$dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],$dat[7]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + eor @dat[1].16b,@dat[1].16b,@in[0].16b + eor @dat[2].16b,@dat[2].16b,@in[1].16b + mov $ivec.16b,@in[7].16b + eor @dat[3].16b,$dat[3].16b,@in[2].16b + eor @dat[4].16b,$dat[4].16b,@in[3].16b + eor @dat[5].16b,$dat[5].16b,@in[4].16b + eor @dat[6].16b,$dat[6].16b,@in[5].16b + eor @dat[7].16b,$dat[7].16b,@in[6].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,128 + b.gt 1b + b 3f + // 4 blocks mode +2: +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],$dat[3]); + 
&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + eor @dat[1].16b,@dat[1].16b,@in[0].16b + mov $ivec.16b,@in[3].16b + eor @dat[2].16b,@dat[2].16b,@in[1].16b + eor @dat[3].16b,$dat[3].16b,@in[2].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.gt 1b +1: + subs $len,$len,#16 + b.lt 3f + ld1 {@dat[0].4s},[$inp],#16 + mov @in[0].16b,@dat[0].16b +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + mov $ivec.16b,@in[0].16b + st1 {@dat[0].16b},[$out],#16 + b.ne 1b +3: + // save back IV + st1 {$ivec.16b},[$ivp] + ldp d8,d9,[sp],#16 + ret +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} + +{{{ +my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); +my ($ctr)=("w5"); +my @dat=map("v$_",(16..23)); +my @in=map("v$_",(24..31)); +my ($ivec)=("v8"); +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: + stp d8,d9,[sp, #-16]! + + ld1 {$ivec.4s},[$ivp] + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +___ + &rev32($ivec,$ivec); +$code.=<<___; + mov $ctr,$ivec.s[3] +1: + cmp $len,#4 + b.lt 1f + ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 + mov @dat[0].16b,$ivec.16b + mov @dat[1].16b,$ivec.16b + mov @dat[2].16b,$ivec.16b + mov @dat[3].16b,$ivec.16b + add $ctr,$ctr,#1 + mov $dat[1].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[2].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[3].s[3],$ctr + cmp $len,#8 + b.lt 2f + ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 + mov @dat[4].16b,$ivec.16b + mov @dat[5].16b,$ivec.16b + mov @dat[6].16b,$ivec.16b + mov @dat[7].16b,$ivec.16b + add $ctr,$ctr,#1 + mov $dat[4].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[5].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[6].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[7].s[3],$ctr +___ + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,@in[0].16b + eor @dat[1].16b,@dat[1].16b,@in[1].16b + eor @dat[2].16b,@dat[2].16b,@in[2].16b + eor @dat[3].16b,@dat[3].16b,@in[3].16b + eor @dat[4].16b,@dat[4].16b,@in[4].16b + eor @dat[5].16b,@dat[5].16b,@in[5].16b + eor @dat[6].16b,@dat[6].16b,@in[6].16b + eor @dat[7].16b,@dat[7].16b,@in[7].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,#8 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +2: +___ + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,@in[0].16b + eor @dat[1].16b,@dat[1].16b,@in[1].16b + eor @dat[2].16b,@dat[2].16b,@in[2].16b + eor @dat[3].16b,@dat[3].16b,@in[3].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#4 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +1: + subs $len,$len,#1 + b.lt 3f + mov $dat[0].16b,$ivec.16b + ld1 {@in[0].4s},[$inp],#16 
+___ + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor $dat[0].16b,$dat[0].16b,@in[0].16b + st1 {$dat[0].4s},[$out],#16 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +3: + ldp d8,d9,[sp],#16 + ret +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} + + +{{{ +my ($inp,$out,$len,$rk1,$rk2,$ivp)=map("x$_",(0..5)); +my ($blocks)=("x2"); +my ($enc)=("x6"); +my ($remain)=("x7"); +my @twx=map("x$_",(9..24)); +my $lastBlk=("x25"); + +my @tweak=map("v$_",(8..15)); +my @dat=map("v$_",(16..23)); +my $lastTweak=("v24"); + +# x/w/v/q registers for compute tweak +my ($magic)=("8"); +my ($tmp0,$tmp1)=("26","27"); +my ($qMagic,$vMagic)=("q25","v25"); +my ($vTmp0,$vTmp1)=("v26","v27"); + +sub gen_xts_do_cipher() { +$code.=<<___; +.globl ${prefix}_xts_do_cipher${standard} +.type ${prefix}_xts_do_cipher${standard},%function +.align 5 +${prefix}_xts_do_cipher${standard}: + mov w$magic,0x87 + ldr $qMagic, .Lxts_magic + // used to encrypt the XORed plaintext blocks + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk2],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk2] + ld1 {@tweak[0].4s}, [$ivp] +___ + &rev32(@tweak[0],@tweak[0]); + &enc_blk(@tweak[0]); + &rev32(@tweak[0],@tweak[0]); +$code.=<<___; + // used to encrypt the initial vector to yield the initial tweak + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk1],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk1] + + and $remain,$len,#0x0F + // convert length into blocks + lsr $blocks,$len,4 + cmp $blocks,#1 // $len must be at least 16 + b.lt 99f + + cmp $remain,0 // if $len is a multiple of 16 + b.eq .xts_encrypt_blocks${standard} + // if $len is not a multiple of 16 + subs $blocks,$blocks,#1 + b.eq .only_2blks_tweak${standard} // if $len is less than 32 + +.xts_encrypt_blocks${standard}: +___ + &rbit(@tweak[0],@tweak[0]); + &rev32_armeb(@tweak[0],@tweak[0]); + &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic); +$code.=<<___; +1: + cmp $blocks,#8 +___ + &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); + &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13],$tmp0,$tmp1,$magic); + &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15],$tmp0,$tmp1,$magic); +$code.=<<___; + b.lt 2f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 +___ + 
&rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); + &rbit(@tweak[3],@tweak[3]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + eor @dat[2].16b, @dat[2].16b, @tweak[2].16b + eor @dat[3].16b, @dat[3].16b, @tweak[3].16b + ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 +___ + &rbit(@tweak[4],@tweak[4]); + &rbit(@tweak[5],@tweak[5]); + &rbit(@tweak[6],@tweak[6]); + &rbit(@tweak[7],@tweak[7]); +$code.=<<___; + eor @dat[4].16b, @dat[4].16b, @tweak[4].16b + eor @dat[5].16b, @dat[5].16b, @tweak[5].16b + eor @dat[6].16b, @dat[6].16b, @tweak[6].16b + eor @dat[7].16b, @dat[7].16b, @tweak[7].16b +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + eor @dat[2].16b, @dat[2].16b, @tweak[2].16b + eor @dat[3].16b, @dat[3].16b, @tweak[3].16b + eor @dat[4].16b, @dat[4].16b, @tweak[4].16b + eor @dat[5].16b, @dat[5].16b, @tweak[5].16b + eor @dat[6].16b, @dat[6].16b, @tweak[6].16b + eor @dat[7].16b, @dat[7].16b, @tweak[7].16b + + // save the last tweak + mov $lastTweak.16b,@tweak[7].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $blocks,$blocks,#8 + b.eq 100f + b 1b +2: + // process 4 blocks + cmp $blocks,#4 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); + &rbit(@tweak[3],@tweak[3]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + eor @dat[2].16b, @dat[2].16b, @tweak[2].16b + eor @dat[3].16b, @dat[3].16b, @tweak[3].16b +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + eor @dat[1].16b, @dat[1].16b, @tweak[1].16b + eor @dat[2].16b, @dat[2].16b, @tweak[2].16b + eor @dat[3].16b, @dat[3].16b, @tweak[3].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + sub $blocks,$blocks,#4 + mov @tweak[0].16b,@tweak[4].16b + mov @tweak[1].16b,@tweak[5].16b + mov @tweak[2].16b,@tweak[6].16b + // save the last tweak + mov $lastTweak.16b,@tweak[3].16b +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@dat[0].4s},[$inp],#16 +___ + &rbit(@tweak[0],@tweak[0]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[0].16b, @dat[0].16b, @tweak[0].16b + st1 {@dat[0].4s},[$out],#16 + // save the last tweak + mov $lastTweak.16b,@tweak[0].16b + b 100f +1: // process last 2 blocks + cmp $blocks,#2 + b.gt 1f + ld1 {@dat[0].4s,@dat[1].4s},[$inp],#32 +___ + 
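+# last-2-blocks path: only dat0/dat1 carry real data, but &enc_4blks
+# always runs four lanes; the two spare lanes are never stored, so
+# their contents are irrelevant.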
&rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ st1 {@dat[0].4s,@dat[1].4s},[$out],#32
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[1].16b
+ b 100f
+1: // process last 3 blocks
+ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$inp],#48
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+ &rev32(@dat[0],@dat[0]);
+ &rev32(@dat[1],@dat[1]);
+ &rev32(@dat[2],@dat[2]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[0].16b
+ eor @dat[1].16b, @dat[1].16b, @tweak[1].16b
+ eor @dat[2].16b, @dat[2].16b, @tweak[2].16b
+ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s},[$out],#48
+ // save the last tweak
+ mov $lastTweak.16b,@tweak[2].16b
+100:
+ cmp $remain,0
+ b.eq 99f
+
+// This branch calculates the last two tweaks,
+// used when the encryption/decryption length is larger than 32
+.last_2blks_tweak${standard}:
+___
+ &rev32_armeb($lastTweak,$lastTweak);
+ &compute_tweak_vec($lastTweak,@tweak[1],$vTmp0,$vTmp1,$vMagic);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic);
+$code.=<<___;
+ b .check_dec${standard}
+
+
+// This branch calculates the last two tweaks,
+// used when the encryption/decryption length is less than 32,
+// which only needs two tweaks
+.only_2blks_tweak${standard}:
+ mov @tweak[1].16b,@tweak[0].16b
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &compute_tweak_vec(@tweak[1],@tweak[2],$vTmp0,$vTmp1,$vMagic);
+$code.=<<___;
+ b .check_dec${standard}
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
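+// (During encryption the full penultimate block is processed with the
+// second-to-last tweak and the stolen final block with the last one;
+// decryption needs them in the opposite order.)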
+.check_dec${standard}:
+ // encryption:1 decryption:0
+ cmp $enc,1
+ b.eq .process_last_2blks${standard}
+ mov $vTmp0.16b,@tweak[1].16b
+ mov @tweak[1].16b,@tweak[2].16b
+ mov @tweak[2].16b,$vTmp0.16b
+
+.process_last_2blks${standard}:
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &rev32_armeb(@tweak[2],@tweak[2]);
+$code.=<<___;
+ ld1 {@dat[0].4s},[$inp],#16
+ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[1].16b
+ st1 {@dat[0].4s},[$out],#16
+
+ sub $lastBlk,$out,16
+ .loop${standard}:
+ subs $remain,$remain,1
+ ldrb w$tmp0,[$lastBlk,$remain]
+ ldrb w$tmp1,[$inp,$remain]
+ strb w$tmp1,[$lastBlk,$remain]
+ strb w$tmp0,[$out,$remain]
+ b.gt .loop${standard}
+ ld1 {@dat[0].4s}, [$lastBlk]
+ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b
+___
+ &rev32(@dat[0],@dat[0]);
+ &enc_blk(@dat[0]);
+ &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+ eor @dat[0].16b, @dat[0].16b, @tweak[2].16b
+ st1 {@dat[0].4s}, [$lastBlk]
+99:
+ ret
+.size ${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard}
+___
+} # end of gen_xts_do_cipher
+
+}}}
+
+{{{
+my ($enc)=("w6");
+
+sub gen_xts_cipher() {
+ my $en = shift;
+$code.=<<___;
+.globl ${prefix}_xts_${en}crypt${standard}
+.type ${prefix}_xts_${en}crypt${standard},%function
+.align 5
+${prefix}_xts_${en}crypt${standard}:
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
+___
+ &mov_en_to_enc($en,$enc);
+$code.=<<___;
+ bl ${prefix}_xts_do_cipher${standard}
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ ret
+.size ${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard}
+___
+
+} # end of gen_xts_cipher
+$standard="_gb";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+$standard="";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+}}}
+########################################
+{ my %opcode = (
+ "sm4e" => 0xcec08400,
+ "sm4ekey" => 0xce60c800);
+
+ sub unsm4 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+ }
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge;
+ print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl
new file mode 100644
index 0000000000000000000000000000000000000000..4fd2975b52b786de7430b6a8d97eb49602352d56
--- /dev/null
+++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl
@@ -0,0 +1,1181 @@
+#! /usr/bin/env perl
+# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").
You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# This module implements SM4 with ASIMD and AESE on AARCH64 +# +# Feb 2022 +# + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$prefix="vpsm4_ex"; +my ($inp,$outp,$rks1,$rks2,$ivp,$enc)=("x0","x1","x3","x4","x5","x6"); +my ($blocks,$len)=("x2","x2"); +my $remain=("x7"); +my ($ptr,$counter)=("x12","w13"); +my ($wtmp0,$wtmp1,$wtmp2,$wtmp3)=("w8","w9","w10","w11"); +my ($xtmp0,$xtmp1,$xtmp2,$xtmp3)=("x8","x9","x10","x11"); +my ($word0,$word1,$word2,$word3)=("w14","w15","w16","w17"); +my @twx=map("x$_",(14..29)); +my $lastBlk=("x26"); + +my @tweak=map("v$_",(0..7)); +my @qtmp=map("q$_",(8..11)); +my @vtmp=map("v$_",(8..11)); +my ($rk0,$rk1)=("v12","v13"); +my ($rka,$rkb)=("v14","v15"); +my @data=map("v$_",(16..19)); +my @datax=map("v$_",(20..23)); +my ($vtmp4,$vtmp5)=("v24","v25"); +my $lastTweak=("v25"); +my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31"); +my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31"); + +sub rev32() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifndef __ARMEB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifndef __ARMEB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub rev32_armeb() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifdef __ARMEB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifdef __ARMEB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub transpose() { + my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; + +$code.=<<___; + zip1 $vt0.4s,$dat0.4s,$dat1.4s + zip2 $vt1.4s,$dat0.4s,$dat1.4s + zip1 $vt2.4s,$dat2.4s,$dat3.4s + zip2 $vt3.4s,$dat2.4s,$dat3.4s + zip1 $dat0.2d,$vt0.2d,$vt2.2d + zip2 $dat1.2d,$vt0.2d,$vt2.2d + zip1 $dat2.2d,$vt1.2d,$vt3.2d + zip2 $dat3.2d,$vt1.2d,$vt3.2d +___ +} + +sub load_sbox_matrix () { +$code.=<<___; + ldr $MaskQ, .Lsbox_magic + ldr $TAHMatQ, .Lsbox_magic+16 + ldr $TALMatQ, .Lsbox_magic+32 + ldr $ATAHMatQ, .Lsbox_magic+48 + ldr $ATALMatQ, .Lsbox_magic+64 + ldr $ANDMaskQ, .Lsbox_magic+80 +___ +} +# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) +sub mul_matrix() { + my $x = shift; + my $higherMat = shift; + my $lowerMat = shift; + my $tmp = shift; +$code.=<<___; + ushr $tmp.16b, $x.16b, 4 + and $x.16b, $x.16b, $ANDMaskV.16b + tbl $x.16b, {$lowerMat.16b}, $x.16b + tbl $tmp.16b, {$higherMat.16b}, $tmp.16b + eor $x.16b, $x.16b, $tmp.16b +___ +} + +# sbox operation for one single word +sub sbox_1word () { + my $word = shift; + +$code.=<<___; + mov @vtmp[3].s[0],$word + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, 
@vtmp[2]); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); +$code.=<<___; + + mov $wtmp0,@vtmp[0].s[0] + eor $word,$wtmp0,$wtmp0,ror #32-2 + eor $word,$word,$wtmp0,ror #32-10 + eor $word,$word,$wtmp0,ror #32-18 + eor $word,$word,$wtmp0,ror #32-24 +___ +} + +# sbox operation for 4-lane of words +sub sbox() { + my $dat = shift; + +$code.=<<___; + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4); +$code.=<<___; + mov $dat.16b,@vtmp[0].16b + + // linear transformation + ushr @vtmp[0].4s,$dat.4s,32-2 + ushr @vtmp[1].4s,$dat.4s,32-10 + ushr @vtmp[2].4s,$dat.4s,32-18 + ushr @vtmp[3].4s,$dat.4s,32-24 + sli @vtmp[0].4s,$dat.4s,2 + sli @vtmp[1].4s,$dat.4s,10 + sli @vtmp[2].4s,$dat.4s,18 + sli @vtmp[3].4s,$dat.4s,24 + eor $vtmp4.16b,@vtmp[0].16b,$dat.16b + eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b + eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b + eor $dat.16b,$dat.16b,$vtmp4.16b +___ +} + +# sbox operation for 8-lane of words +sub sbox_double() { + my $dat = shift; + my $datx = shift; + +$code.=<<___; + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b + tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); + &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4); +$code.=<<___; + eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b + aese @vtmp[0].16b,$vtmp5.16b + aese @vtmp[1].16b,$vtmp5.16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4); + &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4); +$code.=<<___; + mov $dat.16b,@vtmp[0].16b + mov $datx.16b,@vtmp[1].16b + + // linear transformation + ushr @vtmp[0].4s,$dat.4s,32-2 + ushr $vtmp5.4s,$datx.4s,32-2 + ushr @vtmp[1].4s,$dat.4s,32-10 + ushr @vtmp[2].4s,$dat.4s,32-18 + ushr @vtmp[3].4s,$dat.4s,32-24 + sli @vtmp[0].4s,$dat.4s,2 + sli $vtmp5.4s,$datx.4s,2 + sli @vtmp[1].4s,$dat.4s,10 + sli @vtmp[2].4s,$dat.4s,18 + sli @vtmp[3].4s,$dat.4s,24 + eor $vtmp4.16b,@vtmp[0].16b,$dat.16b + eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b + eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b + eor $dat.16b,$dat.16b,$vtmp4.16b + ushr @vtmp[1].4s,$datx.4s,32-10 + ushr @vtmp[2].4s,$datx.4s,32-18 + ushr @vtmp[3].4s,$datx.4s,32-24 + sli @vtmp[1].4s,$datx.4s,10 + sli @vtmp[2].4s,$datx.4s,18 + sli @vtmp[3].4s,$datx.4s,24 + eor $vtmp4.16b,$vtmp5.16b,$datx.16b + eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b + eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b + eor $datx.16b,$datx.16b,$vtmp4.16b +___ +} + +# sm4 for one block of data, in scalar registers word0/word1/word2/word3 +sub sm4_1blk () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ + eor $wtmp3,$word2,$word3 + eor $wtmp2,$wtmp0,$word1 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + eor $word0,$word0,$wtmp3 + /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ + eor $wtmp3,$word2,$word3 + eor $wtmp2,$word0,$wtmp1 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor $word1,$word1,$wtmp3 + /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ + eor $wtmp3,$word0,$word1 + eor $wtmp2,$wtmp0,$word3 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + eor $word2,$word2,$wtmp3 + /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ + eor 
$wtmp3,$word0,$word1 + eor $wtmp2,$word2,$wtmp1 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + eor $word3,$word3,$wtmp3 +___ +} + +# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 +sub sm4_4blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ + eor $rka.16b,@data[2].16b,@data[3].16b + eor $rk0.16b,@data[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + + /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ + eor $rka.16b,$rka.16b,@data[0].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk1.16b + + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rk0.16b,@data[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + + /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ + eor $rka.16b,$rka.16b,@data[2].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk1.16b +___ +} + +# sm4 for 8 lanes of data, in neon registers +# data0/data1/data2/data3 datax0/datax1/datax2/datax3 +sub sm4_8blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[2].16b,@data[3].16b + eor $rkb.16b,@datax[2].16b,@datax[3].16b + eor @vtmp[0].16b,@data[1].16b,$rk0.16b + eor @vtmp[1].16b,@datax[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + eor @datax[0].16b,@datax[0].16b,$rk1.16b + + /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[0].16b + eor $rkb.16b,$rkb.16b,@datax[0].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk0.16b + eor @datax[1].16b,@datax[1].16b,$rk1.16b + + /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rkb.16b,@datax[0].16b,@datax[1].16b + eor @vtmp[0].16b,@data[3].16b,$rk0.16b + eor @vtmp[1].16b,@datax[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + eor @datax[2].16b,@datax[2].16b,$rk1.16b + + /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[2].16b + eor $rkb.16b,$rkb.16b,@datax[2].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk0.16b + eor @datax[3].16b,@datax[3].16b,$rk1.16b +___ +} + +sub encrypt_1blk_norev() { + my $dat = shift; + my $rks = shift; +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 + mov $word0,$dat.s[0] + mov $word1,$dat.s[1] + mov $word2,$dat.s[2] + mov $word3,$dat.s[3] +10: +___ + &sm4_1blk($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b + mov $dat.s[0],$word3 + mov $dat.s[1],$word2 + mov $dat.s[2],$word1 + mov $dat.s[3],$word0 +___ +} + +sub encrypt_1blk() { + my $dat = shift; + my $rks = shift; + + &encrypt_1blk_norev($dat,$rks); + &rev32($dat,$dat); +} + +sub 
encrypt_4blks() { +$code.=<<___; + mov $ptr,$rks1 + mov $counter,#8 +10: +___ + &sm4_4blks($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); +} + +sub encrypt_8blks() { + my $rks = shift; +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 +10: +___ + &sm4_8blks($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); + &rev32(@data[3],@datax[0]); + &rev32(@data[2],@datax[1]); + &rev32(@data[1],@datax[2]); + &rev32(@data[0],@datax[3]); +} + +sub mov_reg_to_vec() { + my $src0 = shift; + my $src1 = shift; + my $desv = shift; +$code.=<<___; + mov $desv.d[0],$src0 + mov $desv.d[1],$src1 +#ifdef __ARMEB__ + rev32 $desv.16b,$desv.16b +#endif +___ +} + +sub mov_vec_to_reg() { + my $srcv = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $des0,$srcv.d[0] + mov $des1,$srcv.d[1] +___ +} + +sub compute_tweak() { + my $src0 = shift; + my $src1 = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $wtmp0,0x87 + extr $xtmp2,$src1,$src1,#32 + extr $des1,$src1,$src0,#63 + and $wtmp1,$wtmp0,$wtmp2,asr#31 + eor $des0,$xtmp1,$src0,lsl#1 +___ +} + +sub compute_tweak_vec() { + my $src = shift; + my $des = shift; + &rbit(@vtmp[2],$src); +$code.=<<___; + ldr @qtmp[0], .Lxts_magic + shl $des.16b, @vtmp[2].16b, #1 + ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 + ushr @vtmp[1].16b, @vtmp[1].16b, #7 + mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b + eor $des.16b, $des.16b, @vtmp[1].16b +___ + &rbit($des,$des); +} + +sub mov_en_to_enc(){ + my $en = shift; + if ($en eq "en") { +$code.=<<___; + mov $enc,1 +___ + } else { +$code.=<<___; + mov $enc,0 +___ + } +} + +sub rbit() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { + if ($standard eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } else { +$code.=<<___; + mov $dst.16b,$src.16b +___ + } + } else { + if ($standard eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } + } +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8-a+crypto +.text + +.type ${prefix}_consts,%object +.align 7 +${prefix}_consts: +.Lck: + .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 + .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 + .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 + .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 + .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 + .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 + .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: + .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +.Lshuffles: + .long 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x03020100 +.Lxts_magic: + .dword 0x0101010101010187,0x0101010101010101 +.Lsbox_magic: + .dword 0x0b0e0104070a0d00,0x0306090c0f020508 + .dword 0x62185a2042387a00,0x22581a6002783a40 + .dword 0x15df62a89e54e923,0xc10bb67c4a803df7 + .dword 0xb9aa6b78c1d21300,0x1407c6d56c7fbead + .dword 0x6404462679195b3b,0xe383c1a1fe9edcbc + .dword 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f +.size ${prefix}_consts,.-${prefix}_consts +___ + +{{{ +my ($userKey,$roundKey,$enc)=("x0","x1","w2"); +my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); +my ($vkey,$vfk,$vmap)=("v5","v6","v7"); +$code.=<<___; +.type ${prefix}_set_key,%function +.align 4 +${prefix}_set_key: + ld1 {$vkey.4s},[$userKey] 
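+ // the user key is whitened with the FK constants, then the 32 round
+ // keys are derived below (one per iteration) via the AESE-based sbox
+ // and the CK constants; decryption keys are stored in reverse order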
+___ + &load_sbox_matrix(); + &rev32($vkey,$vkey); +$code.=<<___; + adr $pointer,.Lshuffles + ld1 {$vmap.4s},[$pointer] + adr $pointer,.Lfk + ld1 {$vfk.4s},[$pointer] + eor $vkey.16b,$vkey.16b,$vfk.16b + mov $schedules,#32 + adr $pointer,.Lck + movi @vtmp[0].16b,#64 + cbnz $enc,1f + add $roundKey,$roundKey,124 +1: + mov $wtmp,$vkey.s[1] + ldr $roundkey,[$pointer],#4 + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[2] + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[3] + eor $roundkey,$roundkey,$wtmp + + // optimize sbox using AESE instruction + mov @data[0].s[0],$roundkey + tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); +$code.=<<___; + mov $wtmp,@vtmp[0].s[0] + + // linear transformation + eor $roundkey,$wtmp,$wtmp,ror #19 + eor $roundkey,$roundkey,$wtmp,ror #9 + mov $wtmp,$vkey.s[0] + eor $roundkey,$roundkey,$wtmp + mov $vkey.s[0],$roundkey + cbz $enc,2f + str $roundkey,[$roundKey],#4 + b 3f +2: + str $roundkey,[$roundKey],#-4 +3: + tbl $vkey.16b,{$vkey.16b},$vmap.16b + subs $schedules,$schedules,#1 + b.ne 1b + ret +.size ${prefix}_set_key,.-${prefix}_set_key +___ +}}} + + +{{{ +$code.=<<___; +.type ${prefix}_enc_4blks,%function +.align 4 +${prefix}_enc_4blks: +___ + &encrypt_4blks(); +$code.=<<___; + ret +.size ${prefix}_enc_4blks,.-${prefix}_enc_4blks +___ +}}} + +{{{ +$code.=<<___; +.type ${prefix}_enc_8blks,%function +.align 4 +${prefix}_enc_8blks: +___ + &encrypt_8blks($rks1); +$code.=<<___; + ret +.size ${prefix}_enc_8blks,.-${prefix}_enc_8blks +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: + stp x29,x30,[sp,#-16]! + mov w2,1 + bl ${prefix}_set_key + ldp x29,x30,[sp],#16 + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: + stp x29,x30,[sp,#-16]! + mov w2,0 + bl ${prefix}_set_key + ldp x29,x30,[sp],#16 + ret +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} + + +{{{ + +$code.=<<___; +.globl ${prefix}_ecb_encrypt +.type ${prefix}_ecb_encrypt,%function +.align 5 +${prefix}_ecb_encrypt: + stp d8,d9,[sp,#-0x10]! + stp d10,d11,[sp,#-0x10]! + stp d12,d13,[sp,#-0x10]! + stp d14,d15,[sp,#-0x10]! + stp x16,x17,[sp,#-0x10]! + stp x29,x30,[sp,#-0x10]! 
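+ // x29/x30 must survive the bl calls into the 4-/8-block helpers,
+ // and d8-d15 are callee-saved per AAPCS64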
+___ + &load_sbox_matrix(); +$code.=<<___; + // convert length into blocks + lsr x2,x2,4 +.Lecb_8_blocks_process: + cmp $blocks,#8 + b.lt .Lecb_4_blocks_process + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 + ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],@datax[3]); +$code.=<<___; + bl ${prefix}_enc_8blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lecb_8_blocks_process + b 100f +.Lecb_4_blocks_process: + cmp $blocks,#4 + b.lt 1f + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl ${prefix}_enc_4blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp] +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0],$rks1); +$code.=<<___; + st1 {@data[0].4s},[$outp] + b 100f +1: // process last 2 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 + cmp $blocks,#2 + b.gt 1f +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl ${prefix}_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] + b 100f +1: // process last 3 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl ${prefix}_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] +100: + ldp x29,x30,[sp],#0x10 + ldp x16,x17,[sp],#0x10 + ldp d14,d15,[sp],#0x10 + ldp d12,d13,[sp],#0x10 + ldp d10,d11,[sp],#0x10 + ldp d8,d9,[sp],#0x10 + ret +.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt +___ +}}} + +{{{ +sub gen_xts_do_cipher() { +$code.=<<___; +.globl ${prefix}_xts_do_cipher${standard} +.type ${prefix}_xts_do_cipher${standard},%function +.align 5 +${prefix}_xts_do_cipher${standard}: + stp x29,x30,[sp,#-16]! 
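+ // first the IV is encrypted under the tweak key schedule (rks2)
+ // to produce the initial tweak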
+ ld1 {@tweak[0].4s}, [$ivp]
+___
+ &load_sbox_matrix();
+ &rev32(@tweak[0],@tweak[0]);
+ &encrypt_1blk(@tweak[0],$rks2);
+$code.=<<___;
+ and $remain,$len,#0x0F
+ // convert length into blocks
+ lsr $blocks,$len,4
+ cmp $blocks,#1
+ b.lt .return${standard}
+
+ cmp $remain,0
+ // If the encryption/decryption length is a multiple of 16,
+ // all blocks are encrypted/decrypted in .xts_encrypt_blocks${standard}
+ b.eq .xts_encrypt_blocks${standard}
+
+ // If the encryption/decryption length is not a multiple of 16,
+ // the last two blocks are encrypted/decrypted in .last_2blks_tweak${standard} or .only_2blks_tweak${standard}
+ // and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${standard}
+ subs $blocks,$blocks,#1
+ b.eq .only_2blks_tweak${standard}
+.xts_encrypt_blocks${standard}:
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rev32_armeb(@tweak[0],@tweak[0]);
+ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
+$code.=<<___;
+.Lxts_8_blocks_process${standard}:
+ cmp $blocks,#8
+___
+ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
+ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
+ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
+ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
+ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
+ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
+ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
+ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
+ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
+ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
+ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
+ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
+ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
+ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
+ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
+ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
+$code.=<<___;
+ b.lt .Lxts_4_blocks_process${standard}
+ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
+___
+ &rbit(@tweak[0],@tweak[0]);
+ &rbit(@tweak[1],@tweak[1]);
+ &rbit(@tweak[2],@tweak[2]);
+ &rbit(@tweak[3],@tweak[3]);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[0].16b
+ eor @data[1].16b, @data[1].16b, @tweak[1].16b
+ eor @data[2].16b, @data[2].16b, @tweak[2].16b
+ eor @data[3].16b, @data[3].16b, @tweak[3].16b
+ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+___
+ &rbit(@tweak[4],@tweak[4]);
+ &rbit(@tweak[5],@tweak[5]);
+ &rbit(@tweak[6],@tweak[6]);
+ &rbit(@tweak[7],@tweak[7]);
+$code.=<<___;
+ eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
+ eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
+ eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
+ eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
+___
+ &rev32(@data[0],@data[0]);
+ &rev32(@data[1],@data[1]);
+ &rev32(@data[2],@data[2]);
+ &rev32(@data[3],@data[3]);
+ &rev32(@datax[0],@datax[0]);
+ &rev32(@datax[1],@datax[1]);
+ &rev32(@datax[2],@datax[2]);
+ &rev32(@datax[3],@datax[3]);
+ &transpose(@data,@vtmp);
+ &transpose(@datax,@vtmp);
+$code.=<<___;
+ bl ${prefix}_enc_8blks
+___
+ &transpose(@vtmp,@datax);
+ &transpose(@data,@datax);
+$code.=<<___;
+ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
+ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
+ eor @vtmp[2].16b,
@vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + eor @data[0].16b, @data[0].16b, @tweak[4].16b + eor @data[1].16b, @data[1].16b, @tweak[5].16b + eor @data[2].16b, @data[2].16b, @tweak[6].16b + eor @data[3].16b, @data[3].16b, @tweak[7].16b + + // save the last tweak + mov $lastTweak.16b,@tweak[7].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lxts_8_blocks_process${standard} + b 100f +.Lxts_4_blocks_process${standard}: + cmp $blocks,#4 + b.lt 1f + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); + &rbit(@tweak[3],@tweak[3]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &transpose(@data,@vtmp); +$code.=<<___; + bl ${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 + mov @tweak[0].16b,@tweak[4].16b + mov @tweak[1].16b,@tweak[5].16b + mov @tweak[2].16b,@tweak[6].16b + // save the last tweak + mov $lastTweak.16b,@tweak[3].16b +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp],#16 +___ + &rbit(@tweak[0],@tweak[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0],$rks1); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + st1 {@data[0].4s},[$outp],#16 + // save the last tweak + mov $lastTweak.16b,@tweak[0].16b + b 100f +1: // process last 2 blocks + cmp $blocks,#2 + b.gt 1f + ld1 {@data[0].4s,@data[1].4s},[$inp],#32 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &transpose(@data,@vtmp); +$code.=<<___; + bl ${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save the last tweak + mov $lastTweak.16b,@tweak[1].16b + b 100f +1: // process last 3 blocks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &transpose(@data,@vtmp); +$code.=<<___; + bl ${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 + // save the last tweak + mov 
$lastTweak.16b,@tweak[2].16b
+100:
+ cmp $remain,0
+ b.eq .return${standard}
+
+// This branch calculates the last two tweaks,
+// used when the encryption/decryption length is larger than 32
+.last_2blks_tweak${standard}:
+___
+ &rev32_armeb($lastTweak,$lastTweak);
+ &compute_tweak_vec($lastTweak,@tweak[1]);
+ &compute_tweak_vec(@tweak[1],@tweak[2]);
+$code.=<<___;
+ b .check_dec${standard}
+
+
+// This branch calculates the last two tweaks,
+// used when the encryption/decryption length is less than 32,
+// which only needs two tweaks
+.only_2blks_tweak${standard}:
+ mov @tweak[1].16b,@tweak[0].16b
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &compute_tweak_vec(@tweak[1],@tweak[2]);
+$code.=<<___;
+ b .check_dec${standard}
+
+
+// Determine whether encryption or decryption is required.
+// The last two tweaks need to be swapped for decryption.
+.check_dec${standard}:
+ // encryption:1 decryption:0
+ cmp $enc,1
+ b.eq .process_last_2blks${standard}
+ mov @vtmp[0].16b,@tweak[1].16b
+ mov @tweak[1].16b,@tweak[2].16b
+ mov @tweak[2].16b,@vtmp[0].16b
+
+.process_last_2blks${standard}:
+___
+ &rev32_armeb(@tweak[1],@tweak[1]);
+ &rev32_armeb(@tweak[2],@tweak[2]);
+$code.=<<___;
+ ld1 {@data[0].4s},[$inp],#16
+ eor @data[0].16b, @data[0].16b, @tweak[1].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0],$rks1);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[1].16b
+ st1 {@data[0].4s},[$outp],#16
+
+ sub $lastBlk,$outp,16
+ .loop${standard}:
+ subs $remain,$remain,1
+ ldrb $wtmp0,[$lastBlk,$remain]
+ ldrb $wtmp1,[$inp,$remain]
+ strb $wtmp1,[$lastBlk,$remain]
+ strb $wtmp0,[$outp,$remain]
+ b.gt .loop${standard}
+ ld1 {@data[0].4s}, [$lastBlk]
+ eor @data[0].16b, @data[0].16b, @tweak[2].16b
+___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0],$rks1);
+$code.=<<___;
+ eor @data[0].16b, @data[0].16b, @tweak[2].16b
+ st1 {@data[0].4s}, [$lastBlk]
+.return${standard}:
+ ldp x29,x30,[sp],#16
+ ret
+.size ${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard}
+___
+} # end of gen_xts_do_cipher
+
+}}}
+
+{{{
+sub gen_xts_cipher() {
+ my $en = shift;
+
+$code.=<<___;
+.globl ${prefix}_xts_${en}crypt${standard}
+.type ${prefix}_xts_${en}crypt${standard},%function
+.align 5
+${prefix}_xts_${en}crypt${standard}:
+ stp x15, x16, [sp, #-0x10]!
+ stp x17, x18, [sp, #-0x10]!
+ stp x19, x20, [sp, #-0x10]!
+ stp x21, x22, [sp, #-0x10]!
+ stp x23, x24, [sp, #-0x10]!
+ stp x25, x26, [sp, #-0x10]!
+ stp x27, x28, [sp, #-0x10]!
+ stp x29, x30, [sp, #-0x10]!
+ stp d8, d9, [sp, #-0x10]!
+ stp d10, d11, [sp, #-0x10]!
+ stp d12, d13, [sp, #-0x10]!
+ stp d14, d15, [sp, #-0x10]!
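+ // the cipher core clobbers nearly every register, so all callee-saved
+ // GPR and FP/SIMD pairs are spilled before branching into do_cipher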
+___
+ &mov_en_to_enc($en);
+$code.=<<___;
+ bl ${prefix}_xts_do_cipher${standard}
+ ldp d14, d15, [sp], #0x10
+ ldp d12, d13, [sp], #0x10
+ ldp d10, d11, [sp], #0x10
+ ldp d8, d9, [sp], #0x10
+ ldp x29, x30, [sp], #0x10
+ ldp x27, x28, [sp], #0x10
+ ldp x25, x26, [sp], #0x10
+ ldp x23, x24, [sp], #0x10
+ ldp x21, x22, [sp], #0x10
+ ldp x19, x20, [sp], #0x10
+ ldp x17, x18, [sp], #0x10
+ ldp x15, x16, [sp], #0x10
+ ret
+.size ${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard}
+___
+
+} # end of gen_xts_cipher
+$standard="_gb";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+$standard="";
+&gen_xts_do_cipher();
+&gen_xts_cipher("en");
+&gen_xts_cipher("de");
+}}}
+
+########################################
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+ print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
index b65a7d149e5860964c4bb7dc46547b3e0d25edff..4d26ede7dad4fa85390cf505d3ccde81f899c337 100644
--- a/crypto/sm4/build.info
+++ b/crypto/sm4/build.info
@@ -1,4 +1,18 @@
 LIBS=../../libcrypto
 SOURCE[../../libcrypto]=\
- sm4.c
+ sm4.c {- $target{sm4_asm_src} -}
+GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl $(PERLASM_SCHEME)
+INCLUDE[sm4-armv8.o]=..
+
+GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl $(PERLASM_SCHEME)
+INCLUDE[vpsm4_ex-armv8.o]=..
+
+BEGINRAW[Makefile]
+##### SM4 assembler implementations
+
+# GNU make "catch all"
+{- $builddir -}/sm4-%.S: {- $sourcedir -}/asm/sm4-%.pl
+ CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@
+
+ENDRAW[Makefile]
diff --git a/doc/man3/EVP_sm4_xts.pod b/doc/man3/EVP_sm4_xts.pod
new file mode 100644
index 0000000000000000000000000000000000000000..09ca3fb341f0501599d8cfed13ff01b1293d7e51
--- /dev/null
+++ b/doc/man3/EVP_sm4_xts.pod
@@ -0,0 +1,67 @@
+=pod
+
+=head1 NAME
+
+EVP_sm4_xts
+- EVP SM4 cipher
+
+=head1 SYNOPSIS
+
+ #include <openssl/evp.h>
+
+ const EVP_CIPHER *EVP_sm4_xts(void);
+
+=head1 DESCRIPTION
+
+The XTS mode of operation (GB/T 17964-2021) for the SM4 block cipher.
+
+=over 4
+
+=item EVP_sm4_xts()
+
+The SM4 blockcipher with a 256-bit key in XTS mode. This mode uses a key
+length of 256 bits and acts on blocks of 128 bits.
+
+The B<iv> parameter to L<EVP_EncryptInit_ex(3)> or L<EVP_DecryptInit_ex(3)>
+is the XTS first "tweak" value. XTS mode has two implementations for
+calculating the subsequent tweak values: one is standardized in
+IEEE Std 1619-2007 and widely used (e.g., XTS-AES); the other was proposed
+recently (GB/T 17964-2021, implemented in May 2022) and is currently only
+used with SM4.
+
+Given identical inputs (B<key>, B<iv> and plaintext), the two standards
+derive different subsequent tweak values. As a result, the first ciphertext
+block is identical, but the subsequent ciphertext blocks (if any) differ.
+
+By default, EVP_sm4_xts() follows GB/T 17964-2021; this can be changed with
+EVP_CIPHER_CTX_ctrl(). The following I<ctrl> is supported in XTS mode for
+SM4.
+
+=over 4
+
+=item EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, std, NULL)
+
+Sets the standard of EVP_sm4_xts to B<std>. This must be 0 or 1: 0 for XTS
+mode as in GB/T 17964-2021, 1 for XTS mode as in IEEE Std 1619-2007.
+
+=back
+
+The XTS implementation in OpenSSL does not support streaming. That is, there
+must only be one L<EVP_EncryptUpdate(3)> call per L<EVP_EncryptInit_ex(3)>
+call (and similarly with the "Decrypt" functions).
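+The following is a minimal sketch (error checking omitted; C<key> is
+assumed to hold the 32-byte double-length key, C<iv> the 16-byte tweak,
+and C<in>/C<inlen>/C<out> the input and output buffers) that selects the
+IEEE variant with the I<ctrl> described above:
+
+ EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
+ int outl = 0;
+
+ EVP_EncryptInit_ex(ctx, EVP_sm4_xts(), NULL, key, iv);
+ /* 1 selects IEEE Std 1619-2007; the default, 0, is GB/T 17964-2021 */
+ EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, 1, NULL);
+ EVP_EncryptUpdate(ctx, out, &outl, in, inlen);
+ EVP_CIPHER_CTX_free(ctx);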
+
+=back
+
+=head1 RETURN VALUES
+
+These functions return an B<EVP_CIPHER> structure that contains the
+implementation of the symmetric cipher. See L<EVP_CIPHER_meth_new(3)> for
+details of the B<EVP_CIPHER> structure.
+
+=head1 SEE ALSO
+
+L<evp(7)>,
+L<EVP_EncryptInit(3)>,
+L<EVP_CIPHER_meth_new(3)>
+
+=head1 COPYRIGHT
+
+Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+Copyright 2022 Ribose Inc. All Rights Reserved.
+
+Licensed under the OpenSSL license (the "License"). You may not use
+this file except in compliance with the License. You can obtain a copy
+in the file LICENSE in the source distribution or at
+L<https://www.openssl.org/source/license.html>.
+
+=cut
+
diff --git a/fuzz/oids.txt b/fuzz/oids.txt
index eda55e4e792cd67a4e39cfa5ce8b280cba229aba..a3eaa721a4d90c3a7c4f9037dace94d4a86457df 100644
--- a/fuzz/oids.txt
+++ b/fuzz/oids.txt
@@ -1063,3 +1063,4 @@ OBJ_id_tc26_gost_3410_2012_256_paramSetC="\x2A\x85\x03\x07\x01\x02\x01\x01\x03"
 OBJ_id_tc26_gost_3410_2012_256_paramSetD="\x2A\x85\x03\x07\x01\x02\x01\x01\x04"
 OBJ_hmacWithSHA512_224="\x2A\x86\x48\x86\xF7\x0D\x02\x0C"
 OBJ_hmacWithSHA512_256="\x2A\x86\x48\x86\xF7\x0D\x02\x0D"
+OBJ_sm4_xts="\x2A\x81\x1C\xCF\x55\x01\x68\x0A"
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c45cb70626cad646501d29bf55dfc0aadd40b61
--- /dev/null
+++ b/include/crypto/sm4_platform.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License"). You may not use
+ * this file except in compliance with the License. You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#ifndef OSSL_SM4_PLATFORM_H
+# define OSSL_SM4_PLATFORM_H
+# pragma once
+
+# if defined(OPENSSL_CPUID_OBJ)
+# if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+# include "arm_arch.h"
+# if __ARM_MAX_ARCH__>=7
+# if defined(VPSM4_EX_ASM)
+# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES)
+# endif
+# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
+# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
+# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key
+# define HWSM4_encrypt sm4_v8_encrypt
+# define HWSM4_decrypt sm4_v8_decrypt
+# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
+# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
+# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks
+# define HWSM4_xts_encrypt_gb sm4_v8_xts_encrypt_gb
+# define HWSM4_xts_decrypt_gb sm4_v8_xts_decrypt_gb
+# define HWSM4_xts_encrypt sm4_v8_xts_encrypt
+# define HWSM4_xts_decrypt sm4_v8_xts_decrypt
+# endif
+# endif
+# endif /* OPENSSL_CPUID_OBJ */
+
+# if defined(HWSM4_CAPABLE)
+int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
+int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
+void HWSM4_encrypt(const unsigned char *in, unsigned char *out,
+ const SM4_KEY *key);
+void HWSM4_decrypt(const unsigned char *in, unsigned char *out,
+ const SM4_KEY *key);
+void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const SM4_KEY *key,
+ unsigned char *ivec, const int enc);
+void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out,
+ size_t length, const SM4_KEY *key,
+ const int enc);
+void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ const unsigned char ivec[16]);
+/* xts mode in GB/T 17964-2021 */
+void HWSM4_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1,
+ const
SM4_KEY *key2, const uint8_t iv[16]); +void HWSM4_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +/* xts mode in IEEE Std 1619-2007 */ +void HWSM4_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void HWSM4_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +# endif /* HWSM4_CAPABLE */ + +#ifdef VPSM4_EX_CAPABLE +void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +#define vpsm4_ex_encrypt SM4_encrypt +#define vpsm4_ex_decrypt SM4_encrypt +void vpsm4_ex_ecb_encrypt( + const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key, const int enc); +/* xts mode in GB/T 17964-2021 */ +void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +/* xts mode in IEEE Std 1619-2007 */ +void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +#endif /* VPSM4_EX_CAPABLE */ + +#endif /* OSSL_SM4_PLATFORM_H */ diff --git a/include/openssl/evp.h b/include/openssl/evp.h index a411f3f2f94918a214ae24c8f74f76bb323419f1..d11e5ae12c4e4e2689c367d9177c5bc71241f341 100644 --- a/include/openssl/evp.h +++ b/include/openssl/evp.h @@ -353,6 +353,9 @@ int (*EVP_CIPHER_meth_get_ctrl(const EVP_CIPHER *cipher))(EVP_CIPHER_CTX *, # define EVP_CTRL_GET_IVLEN 0x25 +/* Set the XTS mode standard, SM4 only */ +# define EVP_CTRL_XTS_STANDARD 0x26 + /* Padding modes */ #define EVP_PADDING_PKCS7 1 #define EVP_PADDING_ISO7816_4 2 @@ -937,6 +940,7 @@ const EVP_CIPHER *EVP_sm4_cfb128(void); # define EVP_sm4_cfb EVP_sm4_cfb128 const EVP_CIPHER *EVP_sm4_ofb(void); const EVP_CIPHER *EVP_sm4_ctr(void); +const EVP_CIPHER *EVP_sm4_xts(void); # endif # if OPENSSL_API_COMPAT < 0x10100000L diff --git a/include/openssl/modes.h b/include/openssl/modes.h index d544f98d5585e9ab2e6b3ce68682607e7ba0c993..dea324f80b8965e1af168c7848004677fa64faa7 100644 --- a/include/openssl/modes.h +++ b/include/openssl/modes.h @@ -22,6 +22,10 @@ typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], int enc); +typedef void (*ecb128_f) (const unsigned char *in, unsigned char *out, + size_t len, const void *key, + int enc); + typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out, size_t blocks, const void *key, const unsigned char ivec[16]); @@ -153,6 +157,11 @@ int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char *inp, unsigned char *out, size_t len, int enc); +int CRYPTO_xts128gb_encrypt(const XTS128_CONTEXT *ctx, + const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, + size_t len, int enc); + size_t CRYPTO_128_wrap(void *key, const unsigned char *iv, unsigned char *out, const unsigned char *in, size_t inlen, diff --git a/include/openssl/obj_mac.h 
b/include/openssl/obj_mac.h index 53516a06c69b5f5f99d910981a1a09e276f416c9..9c89f7741154a5bef412bb58ea81a82b7c3a7178 100644 --- a/include/openssl/obj_mac.h +++ b/include/openssl/obj_mac.h @@ -4767,6 +4767,11 @@ #define NID_sm4_ctr 1139 #define OBJ_sm4_ctr OBJ_sm_scheme,104L,7L +#define SN_sm4_xts "SM4-XTS" +#define LN_sm4_xts "sm4-xts" +#define NID_sm4_xts 1196 +#define OBJ_sm4_xts OBJ_sm_scheme,104L,10L + #define SN_hmac "HMAC" #define LN_hmac "hmac" #define NID_hmac 855 diff --git a/test/evp_test.c b/test/evp_test.c index 62f20ece37055cca0949d1cac67d32b4029dd6f6..3c65ce9ad4ab4c46fe4aac1e2e216fbf49304d90 100644 --- a/test/evp_test.c +++ b/test/evp_test.c @@ -485,6 +485,8 @@ typedef struct cipher_data_st { unsigned char *tag; size_t tag_len; int tag_late; + /* SM4 XTS only */ + int std; } CIPHER_DATA; static int cipher_test_init(EVP_TEST *t, const char *alg) @@ -568,6 +570,15 @@ static int cipher_test_parse(EVP_TEST *t, const char *keyword, return -1; return 1; } + if (strcmp(keyword, "Standard") == 0) { + if (strcmp(value, "GB") == 0) + cdat->std = 0; + else if (strcmp(value, "IEEE") == 0) + cdat->std = 1; + else + return -1; + return 1; + } return 0; } @@ -707,7 +718,11 @@ static int cipher_test_enc(EVP_TEST *t, int enc, goto err; } } - + if (expected->std) { + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, expected->std, NULL)) { + goto err; + }; + } EVP_CIPHER_CTX_set_padding(ctx, 0); t->err = "CIPHERUPDATE_ERROR"; tmplen = 0; diff --git a/test/recipes/30-test_evp_data/evpciph.txt b/test/recipes/30-test_evp_data/evpciph.txt index 8480ddee0b6355f93ecdef85bd8f254c64d883ec..ae327838d9b6de9b867478ab66ffd83a19535e38 100644 --- a/test/recipes/30-test_evp_data/evpciph.txt +++ b/test/recipes/30-test_evp_data/evpciph.txt @@ -2181,6 +2181,28 @@ IV = 0123456789ABCDEFFEDCBA9876543210 Plaintext = AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDEEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFEEEEEEEEEEEEEEEEAAAAAAAAAAAAAAAA Ciphertext = C2B4759E78AC3CF43D0852F4E8D5F9FD7256E8A5FCB65A350EE00630912E44492A0B17E1B85B060D0FBA612D8A95831638B361FD5FFACD942F081485A83CA35D +Title = SM4 XTS test vectors, the XTS mode is standardized in GB/T 17964-2021 by default +Cipher = SM4-XTS +Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F +IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF +Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 +Ciphertext = E9538251C71D7B80BBE4483FEF497BD12C5C581BD6242FC51E08964FB4F60FDB0BA42F63499279213D318D2C11F6886E903BE7F93A1B3479 + +Title = SM4 test vectors for XTS mode in GB/T 17964-2021 and IEEE Std 1619-2007 +Cipher = SM4-XTS +Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F +IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF +Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 +Ciphertext = E9538251C71D7B80BBE4483FEF497BD12C5C581BD6242FC51E08964FB4F60FDB0BA42F63499279213D318D2C11F6886E903BE7F93A1B3479 +Standard = GB + +Cipher = SM4-XTS +Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F +IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF +Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 +Ciphertext = E9538251C71D7B80BBE4483FEF497BD1B3DB1A3E60408C575D63FF7DB39F83260869F9E2585FEC9F0B863BF8FD784B8627D16C0DB6D2CFC7 +Standard = IEEE + Title = ARIA test vectors from RFC5794 (and others) Cipher = ARIA-128-ECB diff --git 
a/util/libcrypto.num b/util/libcrypto.num index 436f799bcacff177147e759cb36affe5ca965dc9..797dac999e4bb8003242314db803324e38daa536 100644 --- a/util/libcrypto.num +++ b/util/libcrypto.num @@ -4591,3 +4591,5 @@ X509_ALGOR_copy 4544 1_1_1h EXIST::FUNCTION: X509_REQ_set0_signature 4545 1_1_1h EXIST::FUNCTION: X509_REQ_set1_signature_algo 4546 1_1_1h EXIST::FUNCTION: EC_KEY_decoded_from_explicit_params 4547 1_1_1h EXIST::FUNCTION:EC +EVP_sm4_xts 4548 1_1_1x EXIST::FUNCTION:SM4 +CRYPTO_xts128gb_encrypt 4549 1_1_1x EXIST::FUNCTION:
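As a cross-check of the SM4-XTS vectors added to evpciph.txt above, the following standalone sketch encrypts the same key and IV under both standards and confirms that only the first 16-byte block matches. It is illustrative only, not part of the patch: the helper name sm4_xts_enc, the stand-in plaintext, and the ctrl-after-init ordering (mirroring evp_test.c) are assumptions.

    #include <stdio.h>
    #include <string.h>
    #include <openssl/evp.h>

    /* One-shot SM4-XTS encrypt; std: 0 = GB/T 17964-2021, 1 = IEEE Std 1619-2007 */
    static int sm4_xts_enc(int std, const unsigned char key[32],
                           const unsigned char iv[16],
                           const unsigned char *in, int inlen, unsigned char *out)
    {
        EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
        int outl = 0;
        int ok = ctx != NULL
            && EVP_EncryptInit_ex(ctx, EVP_sm4_xts(), NULL, key, iv)
            && EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, std, NULL)
            && EVP_EncryptUpdate(ctx, out, &outl, in, inlen);

        EVP_CIPHER_CTX_free(ctx);
        return ok && outl == inlen;
    }

    int main(void)
    {
        /* key and IV from the SM4-XTS entries in evpciph.txt */
        static const unsigned char key[32] = {
            0x2B, 0x7E, 0x15, 0x16, 0x28, 0xAE, 0xD2, 0xA6,
            0xAB, 0xF7, 0x15, 0x88, 0x09, 0xCF, 0x4F, 0x3C,
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
        };
        static const unsigned char iv[16] = {
            0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
            0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
        };
        unsigned char pt[56], gb[56], ieee[56];

        memset(pt, 0x6B, sizeof(pt));   /* stand-in plaintext, 3.5 blocks */
        if (!sm4_xts_enc(0, key, iv, pt, sizeof(pt), gb)
            || !sm4_xts_enc(1, key, iv, pt, sizeof(pt), ieee))
            return 1;
        /* first block identical, tail diverges, as EVP_sm4_xts.pod explains */
        printf("block 0 %s, tail %s\n",
               memcmp(gb, ieee, 16) == 0 ? "matches" : "differs",
               memcmp(gb + 16, ieee + 16, sizeof(gb) - 16) == 0 ? "matches"
                                                                : "differs");
        return 0;
    }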