File _service:tar_scm:Feature-add-ARMv8-implementations-of-SM4-in-ECB-and-XTS.patch of Package compat-openssl11
From df56c1da16d705fb6f471651feb77a69171af9e3 Mon Sep 17 00:00:00 2001 From: Xu Yizhou <xuyizhou1@huawei.com> Date: Wed, 19 Oct 2022 13:28:58 +0800 Subject: [PATCH] add ARMv8 implementations of SM4 in ECB and XTS --- Configurations/00-base-templates.conf | 1 + Configure | 4 + crypto/evp/c_allc.c | 1 + crypto/evp/e_sm4.c | 352 ++++++- crypto/modes/build.info | 2 +- crypto/modes/xts128gb.c | 204 ++++ crypto/objects/obj_dat.h | 15 +- crypto/objects/obj_mac.num | 1 + crypto/objects/objects.txt | 1 + crypto/sm4/asm/vpsm4_ex-armv8.pl | 1173 +++++++++++++++++++++ crypto/sm4/build.info | 5 +- doc/man3/EVP_sm4_xts.pod | 67 ++ fuzz/oids.txt | 1 + include/openssl/evp.h | 4 + include/openssl/modes.h | 9 + include/openssl/obj_mac.h | 5 + test/evp_test.c | 17 +- test/recipes/30-test_evp_data/evpciph.txt | 22 + util/libcrypto.num | 2 + 19 files changed, 1832 insertions(+), 54 deletions(-) create mode 100644 crypto/modes/xts128gb.c create mode 100644 crypto/sm4/asm/vpsm4_ex-armv8.pl create mode 100644 doc/man3/EVP_sm4_xts.pod diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf index e01dc63..1d35012 100644 --- a/Configurations/00-base-templates.conf +++ b/Configurations/00-base-templates.conf @@ -321,6 +321,7 @@ my %targets=( chacha_asm_src => "chacha-armv8.S", poly1305_asm_src=> "poly1305-armv8.S", keccak1600_asm_src => "keccak1600-armv8.S", + sm4_asm_src => "vpsm4_ex-armv8.S", }, parisc11_asm => { template => 1, diff --git a/Configure b/Configure index a41c897..3bfe360 100755 --- a/Configure +++ b/Configure @@ -1420,6 +1420,9 @@ unless ($disabled{asm}) { if ($target{poly1305_asm_src} ne "") { push @{$config{lib_defines}}, "POLY1305_ASM"; } + if ($target{sm4_asm_src} ne "") { + push @{$config{lib_defines}}, "VPSM4_EX_ASM"; + } } my %predefined_C = compiler_predefined($config{CROSS_COMPILE}.$config{CC}); @@ -3375,6 +3378,7 @@ sub print_table_entry "mtoutflag", "multilib", "build_scheme", + "sm4_asm_src", ); if ($type eq "TABLE") { diff --git a/crypto/evp/c_allc.c b/crypto/evp/c_allc.c index 22fdcc4..01b0d1f 100644 --- a/crypto/evp/c_allc.c +++ b/crypto/evp/c_allc.c @@ -85,6 +85,7 @@ void openssl_add_all_ciphers_int(void) EVP_add_cipher(EVP_sm4_cfb()); EVP_add_cipher(EVP_sm4_ofb()); EVP_add_cipher(EVP_sm4_ctr()); + EVP_add_cipher(EVP_sm4_xts()); EVP_add_cipher_alias(SN_sm4_cbc, "SM4"); EVP_add_cipher_alias(SN_sm4_cbc, "sm4"); #endif diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c index fce3279..169d6c7 100644 --- a/crypto/evp/e_sm4.c +++ b/crypto/evp/e_sm4.c @@ -15,86 +15,346 @@ # include <openssl/modes.h> # include "crypto/sm4.h" # include "crypto/evp.h" +# include "evp_local.h" +# include "modes_local.h" + +#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__)) +# include "arm_arch.h" +# if __ARM_MAX_ARCH__>=7 +# if defined(VPSM4_EX_ASM) +# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) +# endif +# endif +#endif typedef struct { - SM4_KEY ks; + union { + double align; + SM4_KEY ks; + } ks; + block128_f block; + union { + ecb128_f ecb; + } stream; } EVP_SM4_KEY; +#ifdef VPSM4_EX_CAPABLE +void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +#define vpsm4_ex_encrypt SM4_encrypt +#define vpsm4_ex_decrypt SM4_encrypt +void vpsm4_ex_ecb_encrypt( + const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key, const int enc); +/* xts mode in GB/T 17964-2021 */ +void vpsm4_ex_xts_encrypt_gb(const unsigned 
char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +/* xts mode in IEEE Std 1619-2007 */ +void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +#endif + +# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \ +static const EVP_CIPHER sm4_##mode = { \ + nid##_##nmode,blocksize,128/8,ivlen, \ + flags|EVP_CIPH_##MODE##_MODE, \ + sm4_init_key, \ + sm4_##mode##_cipher, \ + NULL, \ + sizeof(EVP_SM4_KEY), \ + NULL,NULL,NULL,NULL }; \ +const EVP_CIPHER *EVP_sm4_##mode(void) \ +{ return &sm4_##mode; } + +#define BLOCK_CIPHER_generic_pack(nid,flags) \ + BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags) + static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, const unsigned char *iv, int enc) { - SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + int mode; + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx); + + mode = EVP_CIPHER_CTX_mode(ctx); + if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) { +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_decrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) vpsm4_ex_decrypt; + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) vpsm4_ex_ecb_encrypt; + } else +#endif + { + dat->block = (block128_f)SM4_decrypt; + SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + } + } else { +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_encrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) vpsm4_ex_encrypt; + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) vpsm4_ex_ecb_encrypt; + } else +#endif + { + dat->block = (block128_f)SM4_encrypt; + SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); + } + } return 1; } -static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out, - size_t len, const SM4_KEY *key, - unsigned char *ivec, const int enc) +static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - if (enc) - CRYPTO_cbc128_encrypt(in, out, len, key, ivec, - (block128_f)SM4_encrypt); + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (EVP_CIPHER_CTX_encrypting(ctx)) + CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); else - CRYPTO_cbc128_decrypt(in, out, len, key, ivec, - (block128_f)SM4_decrypt); + CRYPTO_cbc128_decrypt(in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); + return 1; } -static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out, - size_t length, const SM4_KEY *key, - unsigned char *ivec, int *num, const int enc) +static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc, - 
(block128_f)SM4_encrypt); + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + int num = EVP_CIPHER_CTX_num(ctx); + + CRYPTO_cfb128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, &num, + EVP_CIPHER_CTX_encrypting(ctx), dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + + return 1; } -static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, - const SM4_KEY *key, const int enc) +static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - if (enc) - SM4_encrypt(in, out, key); + size_t bl = EVP_CIPHER_CTX_block_size(ctx); + size_t i; + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (len < bl){ + return 1; + } + if (dat->stream.ecb != NULL) + (*dat->stream.ecb) (in, out, len, &dat->ks.ks, + EVP_CIPHER_CTX_encrypting(ctx)); else - SM4_decrypt(in, out, key); + for (i = 0, len -= bl; i <= len; i += bl) + (*dat->block) (in + i, out + i, &dat->ks.ks); + return 1; } -static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out, - size_t length, const SM4_KEY *key, - unsigned char *ivec, int *num) +static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) { - CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num, - (block128_f)SM4_encrypt); -} + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + int num = EVP_CIPHER_CTX_num(ctx); -IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4, - 16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1, - sm4_init_key, 0, 0, 0, 0) + CRYPTO_ofb128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, &num, dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + return 1; +} static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, const unsigned char *in, size_t len) { - unsigned int num = EVP_CIPHER_CTX_num(ctx); - EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx); + int n = EVP_CIPHER_CTX_num(ctx); + unsigned int num; + EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (n < 0) + return 0; + num = (unsigned int)n; - CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, - EVP_CIPHER_CTX_iv_noconst(ctx), - EVP_CIPHER_CTX_buf_noconst(ctx), &num, - (block128_f)SM4_encrypt); + CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, + EVP_CIPHER_CTX_buf_noconst(ctx), &num, + dat->block); EVP_CIPHER_CTX_set_num(ctx, num); return 1; } -static const EVP_CIPHER sm4_ctr_mode = { - NID_sm4_ctr, 1, 16, 16, - EVP_CIPH_CTR_MODE, - sm4_init_key, - sm4_ctr_cipher, - NULL, - sizeof(EVP_SM4_KEY), - NULL, NULL, NULL, NULL -}; +BLOCK_CIPHER_generic_pack(NID_sm4, 0) -const EVP_CIPHER *EVP_sm4_ctr(void) +typedef struct { + union { + double align; + SM4_KEY ks; + } ks1, ks2; /* sm4 key schedules to use */ + XTS128_CONTEXT xts; + int std; /* 0 for xts mode in GB/T 17964-2021 */ + /* 1 for xts mode in IEEE Std 1619-2007 */ + void (*stream_gb) (const unsigned char *in, + unsigned char *out, size_t length, + const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char iv[16]); /* stream for xts mode in GB/T 17964-2021 */ + void (*stream) (const unsigned char *in, + unsigned char *out, size_t length, + const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char iv[16]); /* stream for xts mode in IEEE Std 1619-2007 */ +} EVP_SM4_XTS_CTX; + +static int sm4_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) +{ + EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX, c); + + if (type == EVP_CTRL_COPY) { + EVP_CIPHER_CTX *out = ptr; + EVP_SM4_XTS_CTX *xctx_out = EVP_C_DATA(EVP_SM4_XTS_CTX,out); + + if (xctx->xts.key1) { + if (xctx->xts.key1 != &xctx->ks1) + return 0; 
+ xctx_out->xts.key1 = &xctx_out->ks1; + } + if (xctx->xts.key2) { + if (xctx->xts.key2 != &xctx->ks2) + return 0; + xctx_out->xts.key2 = &xctx_out->ks2; + } + return 1; + } else if (type == EVP_CTRL_XTS_STANDARD) { + if ((arg < 0) || (arg > 1)) + return 0; + xctx->std = arg; + return 1; + } else if (type != EVP_CTRL_INIT) + return -1; + /* key1 and key2 are used as an indicator both key and IV are set */ + xctx->xts.key1 = NULL; + xctx->xts.key2 = NULL; + return 1; +} + +static int sm4_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) +{ + EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX,ctx); + + if (!iv && !key) + return 1; + + if (key) + do { + /* The key is two half length keys in reality */ + const int bytes = EVP_CIPHER_CTX_key_length(ctx) / 2; + xctx->stream_gb = NULL; + xctx->stream = NULL; +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + if (enc) { + vpsm4_ex_set_encrypt_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) vpsm4_ex_encrypt; + xctx->stream_gb = vpsm4_ex_xts_encrypt_gb; + xctx->stream = vpsm4_ex_xts_encrypt; + } else { + vpsm4_ex_set_decrypt_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) vpsm4_ex_decrypt; + xctx->stream_gb = vpsm4_ex_xts_decrypt_gb; + xctx->stream = vpsm4_ex_xts_decrypt; + } + vpsm4_ex_set_encrypt_key(key + bytes, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f) vpsm4_ex_encrypt; + + xctx->xts.key1 = &xctx->ks1; + break; + } else +#endif + (void)0; /* terminate potentially open 'else' */ + + if (enc) { + SM4_set_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) SM4_encrypt; + } else { + SM4_set_key(key, &xctx->ks1.ks); + xctx->xts.block1 = (block128_f) SM4_decrypt; + } + + SM4_set_key(key + bytes, &xctx->ks2.ks); + xctx->xts.block2 = (block128_f) SM4_encrypt; + + xctx->xts.key1 = &xctx->ks1; + } while (0); + + if (iv) { + xctx->xts.key2 = &xctx->ks2; + memcpy(EVP_CIPHER_CTX_iv_noconst(ctx), iv, 16); + } + + return 1; +} + +static int sm4_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) +{ + EVP_SM4_XTS_CTX *xctx = EVP_C_DATA(EVP_SM4_XTS_CTX,ctx); + if (!xctx->xts.key1 || !xctx->xts.key2) + return 0; + if (!out || !in || len < SM4_BLOCK_SIZE) + return 0; + if (xctx->std) { + if (xctx->stream) + (*xctx->stream) (in, out, len, + xctx->xts.key1, xctx->xts.key2, + EVP_CIPHER_CTX_iv_noconst(ctx)); + else if (CRYPTO_xts128_encrypt(&xctx->xts, EVP_CIPHER_CTX_iv_noconst(ctx), + in, out, len, + EVP_CIPHER_CTX_encrypting(ctx))) + return 0; + } else { + if (xctx->stream_gb) + (*xctx->stream_gb) (in, out, len, + xctx->xts.key1, xctx->xts.key2, + EVP_CIPHER_CTX_iv_noconst(ctx)); + else if (CRYPTO_xts128gb_encrypt(&xctx->xts, EVP_CIPHER_CTX_iv_noconst(ctx), + in, out, len, + EVP_CIPHER_CTX_encrypting(ctx))) + return 0; + } + return 1; +} + +#define SM4_XTS_BLOCK_SIZE 1 +#define SM4_XTS_IV_LENGTH 16 +#define SM4_XTS_KEY_LENGTH 32 + +#define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \ + | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT \ + | EVP_CIPH_CUSTOM_COPY | EVP_CIPH_XTS_MODE) + +static const EVP_CIPHER sm4_xts_mode = { + NID_sm4_xts, + SM4_XTS_BLOCK_SIZE, + SM4_XTS_KEY_LENGTH, + SM4_XTS_IV_LENGTH, + XTS_FLAGS, + sm4_xts_init_key, + sm4_xts_cipher, + NULL, + sizeof(EVP_SM4_XTS_CTX), + NULL, NULL, sm4_xts_ctrl, NULL +}; + +const EVP_CIPHER *EVP_sm4_xts(void) { - return &sm4_ctr_mode; + return &sm4_xts_mode; } #endif diff --git a/crypto/modes/build.info b/crypto/modes/build.info index 821340e..f974b04 100644 --- 
a/crypto/modes/build.info +++ b/crypto/modes/build.info @@ -1,7 +1,7 @@ LIBS=../../libcrypto SOURCE[../../libcrypto]=\ cbc128.c ctr128.c cts128.c cfb128.c ofb128.c gcm128.c \ - ccm128.c xts128.c wrap128.c ocb128.c \ + ccm128.c xts128.c xts128gb.c wrap128.c ocb128.c \ {- $target{modes_asm_src} -} INCLUDE[gcm128.o]=.. diff --git a/crypto/modes/xts128gb.c b/crypto/modes/xts128gb.c new file mode 100644 index 0000000..8f57cc5 --- /dev/null +++ b/crypto/modes/xts128gb.c @@ -0,0 +1,204 @@ +/* + * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the OpenSSL license (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +// This is the xts mode in GB/T 17964-2021 +#include <openssl/crypto.h> +#include "modes_local.h" +#include <string.h> + +#ifndef STRICT_ALIGNMENT +# ifdef __GNUC__ +typedef u64 u64_a1 __attribute((__aligned__(1))); +# else +typedef u64 u64_a1; +# endif +#endif + +int CRYPTO_xts128gb_encrypt(const XTS128_CONTEXT *ctx, + const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, + size_t len, int enc) +{ + const union { + long one; + char little; + } is_endian = { + 1 + }; + union { + u64 u[2]; + u32 d[4]; + u8 c[16]; + } tweak, scratch; + unsigned int i; + + if (len < 16) + return -1; + + memcpy(tweak.c, iv, 16); + + (*ctx->block2) (tweak.c, tweak.c, ctx->key2); + + if (!enc && (len % 16)) + len -= 16; + + while (len >= 16) { +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c, inp, 16); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; +#else + scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0]; + scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1]; +#endif + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out, scratch.c, 16); +#else + ((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0]; + ((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1]; +#endif + inp += 16; + out += 16; + len -= 16; + + if (len == 0) + return 0; + + if (is_endian.little) { + u8 res; + u64 hi, lo; +#ifdef BSWAP8 + hi = BSWAP8(tweak.u[0]); + lo = BSWAP8(tweak.u[1]); +#else + u8 *p = tweak.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + res = (u8)lo & 1; + tweak.u[0] = (lo >> 1) | (hi << 63); + tweak.u[1] = hi >> 1; + if (res) + tweak.c[15] ^= 0xe1; +#ifdef BSWAP8 + hi = BSWAP8(tweak.u[0]); + lo = BSWAP8(tweak.u[1]); +#else + p = tweak.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + tweak.u[0] = lo; + tweak.u[1] = hi; + } else { + u8 Cin, Cout; + Cin = 0; + for (i = 0; i < 16; ++i) { + Cout = (tweak.c[i] << 7) & 0x80; + tweak.c[i] = ((tweak.c[i] >> 1) + Cin) & 0xff; + Cin = Cout; + } + if (Cout) + tweak.c[0] ^= 0xe1; + } + } + if (enc) { + for (i = 0; i < len; ++i) { + u8 c = inp[i]; + out[i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out - 16, scratch.c, 16); + } else { + union { + u64 u[2]; + u8 c[16]; + } tweak1; + + if (is_endian.little) { + u8 res; + u64 hi, lo; +#ifdef BSWAP8 + hi = BSWAP8(tweak.u[0]); + lo = BSWAP8(tweak.u[1]); +#else + u8 *p = tweak.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 
4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + res = (u8)lo & 1; + tweak1.u[0] = (lo >> 1) | (hi << 63); + tweak1.u[1] = hi >> 1; + if (res) + tweak1.c[15] ^= 0xe1; +#ifdef BSWAP8 + hi = BSWAP8(tweak1.u[0]); + lo = BSWAP8(tweak1.u[1]); +#else + p = tweak1.c; + + hi = (u64)GETU32(p) << 32 | GETU32(p + 4); + lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); +#endif + tweak1.u[0] = lo; + tweak1.u[1] = hi; + } else { + u8 Cin, Cout; + Cin = 0; + for ( i = 0; i < 16; ++i ) { + Cout = (tweak.c[i] << 7) & 0x80; + tweak1.c[i] = ((tweak.c[i] >> 1) + Cin) & 0xff; + Cin = Cout; + } + if (Cout) + tweak1.c[0] ^= 0xe1; + } +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c, inp, 16); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; +#else + scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0]; + scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1]; +#endif + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; + + for (i = 0; i < len; ++i) { + u8 c = inp[16 + i]; + out[16 + i] = scratch.c[i]; + scratch.c[i] = c; + } + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + (*ctx->block1) (scratch.c, scratch.c, ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out, scratch.c, 16); +#else + ((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0]; + ((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1]; +#endif + } + + return 0; +} diff --git a/crypto/objects/obj_dat.h b/crypto/objects/obj_dat.h index eb4cce4..6d60f87 100644 --- a/crypto/objects/obj_dat.h +++ b/crypto/objects/obj_dat.h @@ -10,7 +10,7 @@ */ /* Serialized OID's */ -static const unsigned char so[7770] = { +static const unsigned char so[7778] = { 0x2A,0x86,0x48,0x86,0xF7,0x0D, /* [ 0] OBJ_rsadsi */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x01, /* [ 6] OBJ_pkcs */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x02, /* [ 13] OBJ_md2 */ @@ -1077,9 +1077,10 @@ static const unsigned char so[7770] = { 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x0C, /* [ 7745] OBJ_hmacWithSHA512_224 */ 0x2A,0x86,0x48,0x86,0xF7,0x0D,0x02,0x0D, /* [ 7753] OBJ_hmacWithSHA512_256 */ 0x2A,0x81,0x1C,0xCF,0x55,0x01,0x83,0x75, /* [ 7761] OBJ_SM2_with_SM3 */ + 0x2A,0x81,0x1C,0xCF,0x55,0x01,0x68,0x0A, /* [ 7769] OBJ_sm4_xts */ }; -#define NUM_NID 1196 +#define NUM_NID 1197 static const ASN1_OBJECT nid_objs[NUM_NID] = { {"UNDEF", "undefined", NID_undef}, {"rsadsi", "RSA Data Security, Inc.", NID_rsadsi, 6, &so[0]}, @@ -2277,9 +2278,10 @@ static const ASN1_OBJECT nid_objs[NUM_NID] = { {"hmacWithSHA512-224", "hmacWithSHA512-224", NID_hmacWithSHA512_224, 8, &so[7745]}, {"hmacWithSHA512-256", "hmacWithSHA512-256", NID_hmacWithSHA512_256, 8, &so[7753]}, {"SM2-SM3", "SM2-with-SM3", NID_SM2_with_SM3, 8, &so[7761]}, + {"SM4-XTS", "sm4-xts", NID_sm4_xts, 8, &so[7769]}, }; -#define NUM_SN 1187 +#define NUM_SN 1188 static const unsigned int sn_objs[NUM_SN] = { 364, /* "AD_DVCS" */ 419, /* "AES-128-CBC" */ @@ -2554,6 +2556,7 @@ static const unsigned int sn_objs[NUM_SN] = { 1139, /* "SM4-CTR" */ 1133, /* "SM4-ECB" */ 1135, /* "SM4-OFB" */ + 1196, /* "SM4-XTS" */ 188, /* "SMIME" */ 167, /* "SMIME-CAPS" */ 100, /* "SN" */ @@ -3470,7 +3473,7 @@ static const unsigned int sn_objs[NUM_SN] = { 1093, /* "x509ExtAdmission" */ }; -#define NUM_LN 1187 +#define NUM_LN 1188 static const unsigned int ln_objs[NUM_LN] = { 363, /* "AD Time Stamping" */ 405, /* "ANSI X9.62" */ @@ -4613,6 +4616,7 @@ static const unsigned int ln_objs[NUM_LN] = { 1139, /* "sm4-ctr" */ 1133, /* "sm4-ecb" */ 1135, /* "sm4-ofb" */ + 
1196, /* "sm4-xts" */ 16, /* "stateOrProvinceName" */ 660, /* "streetAddress" */ 498, /* "subtreeMaximumQuality" */ @@ -4661,7 +4665,7 @@ static const unsigned int ln_objs[NUM_LN] = { 125, /* "zlib compression" */ }; -#define NUM_OBJ 1072 +#define NUM_OBJ 1073 static const unsigned int obj_objs[NUM_OBJ] = { 0, /* OBJ_undef 0 */ 181, /* OBJ_iso 1 */ @@ -5128,6 +5132,7 @@ static const unsigned int obj_objs[NUM_OBJ] = { 1136, /* OBJ_sm4_cfb1 1 2 156 10197 1 104 5 */ 1138, /* OBJ_sm4_cfb8 1 2 156 10197 1 104 6 */ 1139, /* OBJ_sm4_ctr 1 2 156 10197 1 104 7 */ + 1196, /* OBJ_sm4_xts 1 2 156 10197 1 104 10 */ 1172, /* OBJ_sm2 1 2 156 10197 1 301 */ 1143, /* OBJ_sm3 1 2 156 10197 1 401 */ 1195, /* OBJ_SM2_with_SM3 1 2 156 10197 1 501 */ diff --git a/crypto/objects/obj_mac.num b/crypto/objects/obj_mac.num index 8b797b0..77ad385 100644 --- a/crypto/objects/obj_mac.num +++ b/crypto/objects/obj_mac.num @@ -1193,3 +1193,4 @@ magma_mac 1192 hmacWithSHA512_224 1193 hmacWithSHA512_256 1194 SM2_with_SM3 1195 +sm4_xts 1196 diff --git a/crypto/objects/objects.txt b/crypto/objects/objects.txt index be9da47..5713fae 100644 --- a/crypto/objects/objects.txt +++ b/crypto/objects/objects.txt @@ -1520,6 +1520,7 @@ sm-scheme 104 4 : SM4-CFB : sm4-cfb sm-scheme 104 5 : SM4-CFB1 : sm4-cfb1 sm-scheme 104 6 : SM4-CFB8 : sm4-cfb8 sm-scheme 104 7 : SM4-CTR : sm4-ctr +sm-scheme 104 10 : SM4-XTS : sm4-xts # There is no OID that just denotes "HMAC" oddly enough... diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl new file mode 100644 index 0000000..86a6f89 --- /dev/null +++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl @@ -0,0 +1,1173 @@ +#! /usr/bin/env perl +# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# This module implements SM4 with ASIMD and AESE on AARCH64 +# +# Feb 2022 +# + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$prefix="vpsm4_ex"; +my ($inp,$outp,$rks1,$rks2,$ivp,$enc)=("x0","x1","x3","x4","x5","x6"); +my ($blocks,$len)=("x2","x2"); +my $remain=("x7"); +my ($ptr,$counter)=("x12","w13"); +my ($wtmp0,$wtmp1,$wtmp2,$wtmp3)=("w8","w9","w10","w11"); +my ($xtmp0,$xtmp1,$xtmp2,$xtmp3)=("x8","x9","x10","x11"); +my ($word0,$word1,$word2,$word3)=("w14","w15","w16","w17"); +my @twx=map("x$_",(14..29)); +my $lastBlk=("x26"); + +my @tweak=map("v$_",(0..7)); +my @qtmp=map("q$_",(8..11)); +my @vtmp=map("v$_",(8..11)); +my ($rk0,$rk1)=("v12","v13"); +my ($rka,$rkb)=("v14","v15"); +my @data=map("v$_",(16..19)); +my @datax=map("v$_",(20..23)); +my ($vtmp4,$vtmp5)=("v24","v25"); +my $lastTweak=("v25"); +my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31"); +my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31"); + +sub rev32() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifndef __ARMEB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifndef __ARMEB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub rev32_armeb() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifdef __ARMEB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifdef __ARMEB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub transpose() { + my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; + +$code.=<<___; + zip1 $vt0.4s,$dat0.4s,$dat1.4s + zip2 $vt1.4s,$dat0.4s,$dat1.4s + zip1 $vt2.4s,$dat2.4s,$dat3.4s + zip2 $vt3.4s,$dat2.4s,$dat3.4s + zip1 $dat0.2d,$vt0.2d,$vt2.2d + zip2 $dat1.2d,$vt0.2d,$vt2.2d + zip1 $dat2.2d,$vt1.2d,$vt3.2d + zip2 $dat3.2d,$vt1.2d,$vt3.2d +___ +} + +sub load_sbox_matrix () { +$code.=<<___; + ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00 + ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00 + ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923 + ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300 + ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b + ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +___ +} +# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) +sub mul_matrix() { + my $x = shift; + my $higherMat = shift; + my $lowerMat = shift; + my $tmp = shift; +$code.=<<___; + ushr $tmp.16b, $x.16b, 4 + and $x.16b, $x.16b, $ANDMaskV.16b + tbl $x.16b, {$lowerMat.16b}, $x.16b + tbl $tmp.16b, {$higherMat.16b}, $tmp.16b + eor $x.16b, $x.16b, $tmp.16b +___ +} + +# sbox operation for one single word +sub sbox_1word () { + my $word = shift; + +$code.=<<___; + mov @vtmp[3].s[0],$word + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); +$code.=<<___; + + mov $wtmp0,@vtmp[0].s[0] + eor $word,$wtmp0,$wtmp0,ror #32-2 + eor $word,$word,$wtmp0,ror #32-10 + eor $word,$word,$wtmp0,ror #32-18 + eor $word,$word,$wtmp0,ror #32-24 +___ +} + +# sbox operation for 4-lane of words +sub sbox() { + my 
$dat = shift; + +$code.=<<___; + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4); +$code.=<<___; + mov $dat.16b,@vtmp[0].16b + + // linear transformation + ushr @vtmp[0].4s,$dat.4s,32-2 + ushr @vtmp[1].4s,$dat.4s,32-10 + ushr @vtmp[2].4s,$dat.4s,32-18 + ushr @vtmp[3].4s,$dat.4s,32-24 + sli @vtmp[0].4s,$dat.4s,2 + sli @vtmp[1].4s,$dat.4s,10 + sli @vtmp[2].4s,$dat.4s,18 + sli @vtmp[3].4s,$dat.4s,24 + eor $vtmp4.16b,@vtmp[0].16b,$dat.16b + eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b + eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b + eor $dat.16b,$dat.16b,$vtmp4.16b +___ +} + +# sbox operation for 8-lane of words +sub sbox_double() { + my $dat = shift; + my $datx = shift; + +$code.=<<___; + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b + tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); + &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4); +$code.=<<___; + eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b + aese @vtmp[0].16b,$vtmp5.16b + aese @vtmp[1].16b,$vtmp5.16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4); + &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4); +$code.=<<___; + mov $dat.16b,@vtmp[0].16b + mov $datx.16b,@vtmp[1].16b + + // linear transformation + ushr @vtmp[0].4s,$dat.4s,32-2 + ushr $vtmp5.4s,$datx.4s,32-2 + ushr @vtmp[1].4s,$dat.4s,32-10 + ushr @vtmp[2].4s,$dat.4s,32-18 + ushr @vtmp[3].4s,$dat.4s,32-24 + sli @vtmp[0].4s,$dat.4s,2 + sli $vtmp5.4s,$datx.4s,2 + sli @vtmp[1].4s,$dat.4s,10 + sli @vtmp[2].4s,$dat.4s,18 + sli @vtmp[3].4s,$dat.4s,24 + eor $vtmp4.16b,@vtmp[0].16b,$dat.16b + eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b + eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b + eor $dat.16b,$dat.16b,$vtmp4.16b + ushr @vtmp[1].4s,$datx.4s,32-10 + ushr @vtmp[2].4s,$datx.4s,32-18 + ushr @vtmp[3].4s,$datx.4s,32-24 + sli @vtmp[1].4s,$datx.4s,10 + sli @vtmp[2].4s,$datx.4s,18 + sli @vtmp[3].4s,$datx.4s,24 + eor $vtmp4.16b,$vtmp5.16b,$datx.16b + eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b + eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b + eor $datx.16b,$datx.16b,$vtmp4.16b +___ +} + +# sm4 for one block of data, in scalar registers word0/word1/word2/word3 +sub sm4_1blk () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ + eor $wtmp3,$word2,$word3 + eor $wtmp2,$wtmp0,$word1 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + eor $word0,$word0,$wtmp3 + /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ + eor $wtmp3,$word2,$word3 + eor $wtmp2,$word0,$wtmp1 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor $word1,$word1,$wtmp3 + /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ + eor $wtmp3,$word0,$word1 + eor $wtmp2,$wtmp0,$word3 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + eor $word2,$word2,$wtmp3 + /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ + eor $wtmp3,$word0,$word1 + eor $wtmp2,$word2,$wtmp1 + eor $wtmp3,$wtmp3,$wtmp2 +___ + &sbox_1word($wtmp3); +$code.=<<___; + eor $word3,$word3,$wtmp3 +___ +} + +# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 +sub sm4_4blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ + eor 
$rka.16b,@data[2].16b,@data[3].16b + eor $rk0.16b,@data[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + + /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ + eor $rka.16b,$rka.16b,@data[0].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk1.16b + + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rk0.16b,@data[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + + /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ + eor $rka.16b,$rka.16b,@data[2].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk1.16b +___ +} + +# sm4 for 8 lanes of data, in neon registers +# data0/data1/data2/data3 datax0/datax1/datax2/datax3 +sub sm4_8blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + /* B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) */ + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[2].16b,@data[3].16b + eor $rkb.16b,@datax[2].16b,@datax[3].16b + eor @vtmp[0].16b,@data[1].16b,$rk0.16b + eor @vtmp[1].16b,@datax[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + eor @datax[0].16b,@datax[0].16b,$rk1.16b + + /* B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) */ + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[0].16b + eor $rkb.16b,$rkb.16b,@datax[0].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk0.16b + eor @datax[1].16b,@datax[1].16b,$rk1.16b + + /* B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) */ + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rkb.16b,@datax[0].16b,@datax[1].16b + eor @vtmp[0].16b,@data[3].16b,$rk0.16b + eor @vtmp[1].16b,@datax[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + eor @datax[2].16b,@datax[2].16b,$rk1.16b + + /* B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) */ + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[2].16b + eor $rkb.16b,$rkb.16b,@datax[2].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk0.16b + eor @datax[3].16b,@datax[3].16b,$rk1.16b +___ +} + +sub encrypt_1blk_norev() { + my $dat = shift; + my $rks = shift; +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 + mov $word0,$dat.s[0] + mov $word1,$dat.s[1] + mov $word2,$dat.s[2] + mov $word3,$dat.s[3] +10: +___ + &sm4_1blk($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b + mov $dat.s[0],$word3 + mov $dat.s[1],$word2 + mov $dat.s[2],$word1 + mov $dat.s[3],$word0 +___ +} + +sub encrypt_1blk() { + my $dat = shift; + my $rks = shift; + + &encrypt_1blk_norev($dat,$rks); + &rev32($dat,$dat); +} + +sub encrypt_4blks() { +$code.=<<___; + mov $ptr,$rks1 + mov $counter,#8 +10: +___ + &sm4_4blks($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); +} + +sub encrypt_8blks() { + my $rks = shift; +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 +10: +___ + &sm4_8blks($ptr); 
+$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); + &rev32(@data[3],@datax[0]); + &rev32(@data[2],@datax[1]); + &rev32(@data[1],@datax[2]); + &rev32(@data[0],@datax[3]); +} + +sub mov_reg_to_vec() { + my $src0 = shift; + my $src1 = shift; + my $desv = shift; +$code.=<<___; + mov $desv.d[0],$src0 + mov $desv.d[1],$src1 +#ifdef __ARMEB__ + rev32 $desv.16b,$desv.16b +#endif +___ +} + +sub mov_vec_to_reg() { + my $srcv = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $des0,$srcv.d[0] + mov $des1,$srcv.d[1] +___ +} + +sub compute_tweak() { + my $src0 = shift; + my $src1 = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $wtmp0,0x87 + extr $xtmp2,$src1,$src1,#32 + extr $des1,$src1,$src0,#63 + and $wtmp1,$wtmp0,$wtmp2,asr#31 + eor $des0,$xtmp1,$src0,lsl#1 +___ +} + +sub compute_tweak_vec() { + my $src = shift; + my $des = shift; + &rbit(@vtmp[2],$src); +$code.=<<___; + ldr @qtmp[0], =0x01010101010101010101010101010187 + shl $des.16b, @vtmp[2].16b, #1 + ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 + ushr @vtmp[1].16b, @vtmp[1].16b, #7 + mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b + eor $des.16b, $des.16b, @vtmp[1].16b +___ + &rbit($des,$des); +} + +sub mov_en_to_enc(){ + my $en = shift; + if ($en eq "en") { +$code.=<<___; + mov $enc,1 +___ + } else { +$code.=<<___; + mov $enc,0 +___ + } +} + +sub rbit() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { + if ($standard eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } else { +$code.=<<___; + mov $dst.16b,$src.16b +___ + } + } else { + if ($standard eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } + } +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8-a+crypto +.text + +.type ${prefix}_consts,%object +.align 7 +${prefix}_consts: +.Lck: + .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 + .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 + .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 + .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 + .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 + .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 + .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: + .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +.Lshuffles: + .long 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x03020100 + +.size ${prefix}_consts,.-${prefix}_consts +___ + +{{{ +my ($userKey,$roundKey,$enc)=("x0","x1","w2"); +my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); +my ($vkey,$vfk,$vmap)=("v5","v6","v7"); +$code.=<<___; +.type ${prefix}_set_key,%function +.align 4 +${prefix}_set_key: + ld1 {$vkey.4s},[$userKey] +___ + &load_sbox_matrix(); + &rev32($vkey,$vkey); +$code.=<<___; + adr $pointer,.Lshuffles + ld1 {$vmap.4s},[$pointer] + adr $pointer,.Lfk + ld1 {$vfk.4s},[$pointer] + eor $vkey.16b,$vkey.16b,$vfk.16b + mov $schedules,#32 + adr $pointer,.Lck + movi @vtmp[0].16b,#64 + cbnz $enc,1f + add $roundKey,$roundKey,124 +1: + mov $wtmp,$vkey.s[1] + ldr $roundkey,[$pointer],#4 + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[2] + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[3] + eor $roundkey,$roundkey,$wtmp + + // optimize sbox using AESE instruction + mov @data[0].s[0],$roundkey + tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); +$code.=<<___; + eor @vtmp[1].16b, 
@vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); +$code.=<<___; + mov $wtmp,@vtmp[0].s[0] + + // linear transformation + eor $roundkey,$wtmp,$wtmp,ror #19 + eor $roundkey,$roundkey,$wtmp,ror #9 + mov $wtmp,$vkey.s[0] + eor $roundkey,$roundkey,$wtmp + mov $vkey.s[0],$roundkey + cbz $enc,2f + str $roundkey,[$roundKey],#4 + b 3f +2: + str $roundkey,[$roundKey],#-4 +3: + tbl $vkey.16b,{$vkey.16b},$vmap.16b + subs $schedules,$schedules,#1 + b.ne 1b + ret +.size ${prefix}_set_key,.-${prefix}_set_key +___ +}}} + + +{{{ +$code.=<<___; +.type ${prefix}_enc_4blks,%function +.align 4 +${prefix}_enc_4blks: +___ + &encrypt_4blks(); +$code.=<<___; + ret +.size ${prefix}_enc_4blks,.-${prefix}_enc_4blks +___ +}}} + +{{{ +$code.=<<___; +.type ${prefix}_enc_8blks,%function +.align 4 +${prefix}_enc_8blks: +___ + &encrypt_8blks($rks1); +$code.=<<___; + ret +.size ${prefix}_enc_8blks,.-${prefix}_enc_8blks +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: + stp x29,x30,[sp,#-16]! + mov w2,1 + bl ${prefix}_set_key + ldp x29,x30,[sp],#16 + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: + stp x29,x30,[sp,#-16]! + mov w2,0 + bl ${prefix}_set_key + ldp x29,x30,[sp],#16 + ret +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} + + +{{{ + +$code.=<<___; +.globl ${prefix}_ecb_encrypt +.type ${prefix}_ecb_encrypt,%function +.align 5 +${prefix}_ecb_encrypt: + stp d8,d9,[sp,#-0x10]! + stp d10,d11,[sp,#-0x10]! + stp d12,d13,[sp,#-0x10]! + stp d14,d15,[sp,#-0x10]! + stp x16,x17,[sp,#-0x10]! + stp x29,x30,[sp,#-0x10]! 
+___ + &load_sbox_matrix(); +$code.=<<___; + // convert length into blocks + lsr x2,x2,4 +.Lecb_8_blocks_process: + cmp $blocks,#8 + b.lt .Lecb_4_blocks_process + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 + ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],@datax[3]); +$code.=<<___; + bl ${prefix}_enc_8blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lecb_8_blocks_process + b 100f +.Lecb_4_blocks_process: + cmp $blocks,#4 + b.lt 1f + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl ${prefix}_enc_4blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp] +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0],$rks1); +$code.=<<___; + st1 {@data[0].4s},[$outp] + b 100f +1: // process last 2 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 + cmp $blocks,#2 + b.gt 1f +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl ${prefix}_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] + b 100f +1: // process last 3 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl ${prefix}_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] +100: + ldp x29,x30,[sp],#0x10 + ldp x16,x17,[sp],#0x10 + ldp d14,d15,[sp],#0x10 + ldp d12,d13,[sp],#0x10 + ldp d10,d11,[sp],#0x10 + ldp d8,d9,[sp],#0x10 + ret +.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt +___ +}}} + +{{{ +sub gen_xts_do_cipher() { +$code.=<<___; +.globl ${prefix}_xts_do_cipher${standard} +.type ${prefix}_xts_do_cipher${standard},%function +.align 5 +${prefix}_xts_do_cipher${standard}: + stp x29,x30,[sp,#-16]! 
+ ld1 {@tweak[0].4s}, [$ivp] +___ + &load_sbox_matrix(); + &rev32(@tweak[0],@tweak[0]); + &encrypt_1blk(@tweak[0],$rks2); +$code.=<<___; + and $remain,$len,#0x0F + // convert length into blocks + lsr $blocks,$len,4 + cmp $blocks,#1 + b.lt .return${standard} + + cmp $remain,0 + // If the encryption/decryption length is a multiple of 16, + // all the blocks are encrypted/decrypted in .xts_encrypt_blocks${standard} + b.eq .xts_encrypt_blocks${standard} + + // If the encryption/decryption length is not a multiple of 16, + // the last two blocks are encrypted/decrypted in .last_2blks_tweak${standard} or .only_2blks_tweak${standard}, + // and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${standard} + subs $blocks,$blocks,#1 + b.eq .only_2blks_tweak${standard} +.xts_encrypt_blocks${standard}: +___ + &rbit(@tweak[0],@tweak[0]); + &rev32_armeb(@tweak[0],@tweak[0]); + &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; +.Lxts_8_blocks_process${standard}: + cmp $blocks,#8 +___ + &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); + &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); + &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); + &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); + &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); + &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); + &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); + &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); + &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; + b.lt .Lxts_4_blocks_process${standard} + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); + &rbit(@tweak[3],@tweak[3]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b + ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ + &rbit(@tweak[4],@tweak[4]); + &rbit(@tweak[5],@tweak[5]); + &rbit(@tweak[6],@tweak[6]); + &rbit(@tweak[7],@tweak[7]); +$code.=<<___; + eor @datax[0].16b, @datax[0].16b, @tweak[4].16b + eor @datax[1].16b, @datax[1].16b, @tweak[5].16b + eor @datax[2].16b, @datax[2].16b, @tweak[6].16b + eor @datax[3].16b, @datax[3].16b, @tweak[7].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],@datax[3]); + &transpose(@data,@vtmp); + &transpose(@datax,@vtmp); +$code.=<<___; + bl ${prefix}_enc_8blks +___ + &transpose(@vtmp,@datax); + &transpose(@data,@datax); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b,
@vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + eor @data[0].16b, @data[0].16b, @tweak[4].16b + eor @data[1].16b, @data[1].16b, @tweak[5].16b + eor @data[2].16b, @data[2].16b, @tweak[6].16b + eor @data[3].16b, @data[3].16b, @tweak[7].16b + + // save the last tweak + mov $lastTweak.16b,@tweak[7].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lxts_8_blocks_process${standard} + b 100f +.Lxts_4_blocks_process${standard}: + cmp $blocks,#4 + b.lt 1f + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); + &rbit(@tweak[3],@tweak[3]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &transpose(@data,@vtmp); +$code.=<<___; + bl ${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 + mov @tweak[0].16b,@tweak[4].16b + mov @tweak[1].16b,@tweak[5].16b + mov @tweak[2].16b,@tweak[6].16b + // save the last tweak + mov $lastTweak.16b,@tweak[3].16b +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp],#16 +___ + &rbit(@tweak[0],@tweak[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0],$rks1); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + st1 {@data[0].4s},[$outp],#16 + // save the last tweak + mov $lastTweak.16b,@tweak[0].16b + b 100f +1: // process last 2 blocks + cmp $blocks,#2 + b.gt 1f + ld1 {@data[0].4s,@data[1].4s},[$inp],#32 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &transpose(@data,@vtmp); +$code.=<<___; + bl ${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save the last tweak + mov $lastTweak.16b,@tweak[1].16b + b 100f +1: // process last 3 blocks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 +___ + &rbit(@tweak[0],@tweak[0]); + &rbit(@tweak[1],@tweak[1]); + &rbit(@tweak[2],@tweak[2]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &transpose(@data,@vtmp); +$code.=<<___; + bl ${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 + // save the last tweak + mov 
$lastTweak.16b,@tweak[2].16b +100: + cmp $remain,0 + b.eq .return${standard} + +// This branch calculates the last two tweaks, +// when the encryption/decryption length is larger than 32 +.last_2blks_tweak${standard}: +___ + &rev32_armeb($lastTweak,$lastTweak); + &compute_tweak_vec($lastTweak,@tweak[1]); + &compute_tweak_vec(@tweak[1],@tweak[2]); +$code.=<<___; + b .check_dec${standard} + + +// This branch calculates the last two tweaks, +// when the encryption/decryption length is equal to 32 and only two tweaks are needed +.only_2blks_tweak${standard}: + mov @tweak[1].16b,@tweak[0].16b +___ + &rev32_armeb(@tweak[1],@tweak[1]); + &compute_tweak_vec(@tweak[1],@tweak[2]); +$code.=<<___; + b .check_dec${standard} + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. +.check_dec${standard}: + // encryption:1 decryption:0 + cmp $enc,1 + b.eq .process_last_2blks${standard} + mov @vtmp[0].16B,@tweak[1].16b + mov @tweak[1].16B,@tweak[2].16b + mov @tweak[2].16B,@vtmp[0].16b + +.process_last_2blks${standard}: +___ + &rev32_armeb(@tweak[1],@tweak[1]); + &rev32_armeb(@tweak[2],@tweak[2]); +$code.=<<___; + ld1 {@data[0].4s},[$inp],#16 + eor @data[0].16b, @data[0].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0],$rks1); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[1].16b + st1 {@data[0].4s},[$outp],#16 + + sub $lastBlk,$outp,16 + .loop${standard}: + subs $remain,$remain,1 + ldrb $wtmp0,[$lastBlk,$remain] + ldrb $wtmp1,[$inp,$remain] + strb $wtmp1,[$lastBlk,$remain] + strb $wtmp0,[$outp,$remain] + b.gt .loop${standard} + ld1 {@data[0].4s}, [$lastBlk] + eor @data[0].16b, @data[0].16b, @tweak[2].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0],$rks1); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[2].16b + st1 {@data[0].4s}, [$lastBlk] +.return${standard}: + ldp x29,x30,[sp],#16 + ret +.size ${prefix}_xts_do_cipher${standard},.-${prefix}_xts_do_cipher${standard} +___ +} #end of gen_xts_do_cipher + +}}} + +{{{ +sub gen_xts_cipher() { + my $en = shift; + +$code.=<<___; +.globl ${prefix}_xts_${en}crypt${standard} +.type ${prefix}_xts_${en}crypt${standard},%function +.align 5 +${prefix}_xts_${en}crypt${standard}: + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]!
+___ + &mov_en_to_enc($en); +$code.=<<___; + bl ${prefix}_xts_do_cipher${standard} + ldp d14, d15, [sp], #0x10 + ldp d12, d13, [sp], #0x10 + ldp d10, d11, [sp], #0x10 + ldp d8, d9, [sp], #0x10 + ldp x29, x30, [sp], #0x10 + ldp x27, x28, [sp], #0x10 + ldp x25, x26, [sp], #0x10 + ldp x23, x24, [sp], #0x10 + ldp x21, x22, [sp], #0x10 + ldp x19, x20, [sp], #0x10 + ldp x17, x18, [sp], #0x10 + ldp x15, x16, [sp], #0x10 + ret +.size ${prefix}_xts_${en}crypt${standard},.-${prefix}_xts_${en}crypt${standard} +___ + +} # end of gen_xts_cipher +$standard="_gb"; +&gen_xts_do_cipher(); +&gen_xts_cipher("en"); +&gen_xts_cipher("de"); +$standard=""; +&gen_xts_do_cipher(); +&gen_xts_cipher("en"); +&gen_xts_cipher("de"); +}}} + +######################################## +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info index b65a7d1..bb042c5 100644 --- a/crypto/sm4/build.info +++ b/crypto/sm4/build.info @@ -1,4 +1,7 @@ LIBS=../../libcrypto SOURCE[../../libcrypto]=\ - sm4.c + sm4.c {- $target{sm4_asm_src} -} + +GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl $(PERLASM_SCHEME) +INCLUDE[vpsm4_ex-armv8.o]=.. \ No newline at end of file diff --git a/doc/man3/EVP_sm4_xts.pod b/doc/man3/EVP_sm4_xts.pod new file mode 100644 index 0000000..09ca3fb --- /dev/null +++ b/doc/man3/EVP_sm4_xts.pod @@ -0,0 +1,67 @@ +=pod + +=head1 NAME + +EVP_sm4_xts +- EVP SM4 cipher + +=head1 SYNOPSIS + + #include <openssl/evp.h> + + const EVP_CIPHER *EVP_sm4_xts(void); + +=head1 DESCRIPTION + +The XTS mode of operation (GB/T 17964-2021) for the SM4 block cipher. + +=over 4 + +=item EVP_sm4_xts() + +The SM4 block cipher with a 256-bit key in XTS mode. This mode uses a key length of 256 bits and acts on blocks of 128 bits. + +The B<iv> parameter to L<EVP_EncryptInit_ex(3)> or L<EVP_DecryptInit_ex(3)> is the first XTS "tweak" value. XTS mode has two ways of calculating the subsequent tweak values: one is standardized in IEEE Std 1619-2007 and is widely used (e.g., XTS-AES); the other was proposed more recently (GB/T 17964-2021, implemented in May 2022) and is currently used only with SM4. + +Given identical input data (B<in>, B<iv>, and B<key>), the two standards produce different subsequent tweak values. As a result, the first ciphertext block is identical under both standards, but the subsequent ciphertext blocks (if any) differ. + +By default, EVP_sm4_xts follows GB/T 17964-2021; this can be changed with EVP_CIPHER_CTX_ctrl. The following I<ctrl> is supported in XTS mode for SM4. + +=over 4 + +=item EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, std, NULL) + +Sets the standard of EVP_sm4_xts to B<std>. This must be 0 or 1: 0 selects XTS mode as in GB/T 17964-2021, 1 selects XTS mode as in IEEE Std 1619-2007. + +=back + +The XTS implementation in OpenSSL does not support streaming. That is, there must +only be one L<EVP_EncryptUpdate(3)> call per L<EVP_EncryptInit_ex(3)> call (and +similarly with the "Decrypt" functions). + +=back + +=head1 RETURN VALUES + +These functions return an B<EVP_CIPHER> structure that contains the +implementation of the symmetric cipher. See L<EVP_CIPHER_meth_new(3)> for +details of the B<EVP_CIPHER> structure.
+ +=head1 SEE ALSO + +L<evp(7)>, +L<EVP_EncryptInit(3)>, +L<EVP_CIPHER_meth_new(3)> + +=head1 COPYRIGHT + +Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. +Copyright 2022 Ribose Inc. All Rights Reserved. + +Licensed under the OpenSSL license (the "License"). You may not use +this file except in compliance with the License. You can obtain a copy +in the file LICENSE in the source distribution or at +L<https://www.openssl.org/source/license.html>. + +=cut + diff --git a/fuzz/oids.txt b/fuzz/oids.txt index 8dfdea9..d1f98a8 100644 --- a/fuzz/oids.txt +++ b/fuzz/oids.txt @@ -1064,3 +1064,4 @@ OBJ_id_tc26_gost_3410_2012_256_paramSetD="\x2A\x85\x03\x07\x01\x02\x01\x01\x04" OBJ_hmacWithSHA512_224="\x2A\x86\x48\x86\xF7\x0D\x02\x0C" OBJ_hmacWithSHA512_256="\x2A\x86\x48\x86\xF7\x0D\x02\x0D" OBJ_SM2_with_SM3="\x2A\x81\x1C\xCF\x55\x01\x83\x75" +OBJ_sm4_xts="\x2A\x81\x1C\xCF\x55\x01\x68\x0A" diff --git a/include/openssl/evp.h b/include/openssl/evp.h index 3116c1b..69326bc 100644 --- a/include/openssl/evp.h +++ b/include/openssl/evp.h @@ -353,6 +353,9 @@ int (*EVP_CIPHER_meth_get_ctrl(const EVP_CIPHER *cipher))(EVP_CIPHER_CTX *, # define EVP_CTRL_GET_IVLEN 0x25 +/* Set the XTS mode standard, SM4 only */ +# define EVP_CTRL_XTS_STANDARD 0x26 + /* Padding modes */ #define EVP_PADDING_PKCS7 1 #define EVP_PADDING_ISO7816_4 2 @@ -937,6 +940,7 @@ const EVP_CIPHER *EVP_sm4_cfb128(void); # define EVP_sm4_cfb EVP_sm4_cfb128 const EVP_CIPHER *EVP_sm4_ofb(void); const EVP_CIPHER *EVP_sm4_ctr(void); +const EVP_CIPHER *EVP_sm4_xts(void); # endif # if OPENSSL_API_COMPAT < 0x10100000L diff --git a/include/openssl/modes.h b/include/openssl/modes.h index d544f98..dea324f 100644 --- a/include/openssl/modes.h +++ b/include/openssl/modes.h @@ -22,6 +22,10 @@ typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], int enc); +typedef void (*ecb128_f) (const unsigned char *in, unsigned char *out, + size_t len, const void *key, + int enc); + typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out, size_t blocks, const void *key, const unsigned char ivec[16]); @@ -153,6 +157,11 @@ int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char *inp, unsigned char *out, size_t len, int enc); +int CRYPTO_xts128gb_encrypt(const XTS128_CONTEXT *ctx, + const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, + size_t len, int enc); + size_t CRYPTO_128_wrap(void *key, const unsigned char *iv, unsigned char *out, const unsigned char *in, size_t inlen, diff --git a/include/openssl/obj_mac.h b/include/openssl/obj_mac.h index 9b125c1..edfc87d 100644 --- a/include/openssl/obj_mac.h +++ b/include/openssl/obj_mac.h @@ -4772,6 +4772,11 @@ #define NID_sm4_ctr 1139 #define OBJ_sm4_ctr OBJ_sm_scheme,104L,7L +#define SN_sm4_xts "SM4-XTS" +#define LN_sm4_xts "sm4-xts" +#define NID_sm4_xts 1196 +#define OBJ_sm4_xts OBJ_sm_scheme,104L,10L + #define SN_hmac "HMAC" #define LN_hmac "hmac" #define NID_hmac 855 diff --git a/test/evp_test.c b/test/evp_test.c index 62f20ec..3c65ce9 100644 --- a/test/evp_test.c +++ b/test/evp_test.c @@ -485,6 +485,8 @@ typedef struct cipher_data_st { unsigned char *tag; size_t tag_len; int tag_late; + /* SM4 XTS only */ + int std; } CIPHER_DATA; static int cipher_test_init(EVP_TEST *t, const char *alg) @@ -568,6 +570,15 @@ static int cipher_test_parse(EVP_TEST *t, const char *keyword, return -1; return 1; } + if (strcmp(keyword, "Standard") == 0) { + if (strcmp(value, "GB") == 0) + cdat->std = 0; + 
else if (strcmp(value, "IEEE") == 0) + cdat->std = 1; + else + return -1; + return 1; + } return 0; } @@ -707,7 +718,11 @@ static int cipher_test_enc(EVP_TEST *t, int enc, goto err; } } - + if (expected->std) { + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD, expected->std, NULL)) { + goto err; + }; + } EVP_CIPHER_CTX_set_padding(ctx, 0); t->err = "CIPHERUPDATE_ERROR"; tmplen = 0; diff --git a/test/recipes/30-test_evp_data/evpciph.txt b/test/recipes/30-test_evp_data/evpciph.txt index 76c839b..a3687bc 100644 --- a/test/recipes/30-test_evp_data/evpciph.txt +++ b/test/recipes/30-test_evp_data/evpciph.txt @@ -2132,6 +2132,28 @@ IV = 0123456789ABCDEFFEDCBA9876543210 Plaintext = AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDEEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFEEEEEEEEEEEEEEEEAAAAAAAAAAAAAAAA Ciphertext = C2B4759E78AC3CF43D0852F4E8D5F9FD7256E8A5FCB65A350EE00630912E44492A0B17E1B85B060D0FBA612D8A95831638B361FD5FFACD942F081485A83CA35D +Title = SM4 XTS test vectors, the XTS mode is standardized in GB/T 17964-2021 by default +Cipher = SM4-XTS +Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F +IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF +Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 +Ciphertext = E9538251C71D7B80BBE4483FEF497BD12C5C581BD6242FC51E08964FB4F60FDB0BA42F63499279213D318D2C11F6886E903BE7F93A1B3479 + +Title = SM4 test vectors for XTS mode in GB/T 17964-2021 and IEEE Std 1619-2007 +Cipher = SM4-XTS +Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F +IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF +Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 +Ciphertext = E9538251C71D7B80BBE4483FEF497BD12C5C581BD6242FC51E08964FB4F60FDB0BA42F63499279213D318D2C11F6886E903BE7F93A1B3479 +Standard = GB + +Cipher = SM4-XTS +Key = 2B7E151628AED2A6ABF7158809CF4F3C000102030405060708090A0B0C0D0E0F +IV = F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF +Plaintext = 6BC1BEE22E409F96E93D7E117393172AAE2D8A571E03AC9C9EB76FAC45AF8E5130C81C46A35CE411E5FBC1191A0A52EFF69F2445DF4F9B17 +Ciphertext = E9538251C71D7B80BBE4483FEF497BD1B3DB1A3E60408C575D63FF7DB39F83260869F9E2585FEC9F0B863BF8FD784B8627D16C0DB6D2CFC7 +Standard = IEEE + Title = ARIA test vectors from RFC5794 (and others) Cipher = ARIA-128-ECB diff --git a/util/libcrypto.num b/util/libcrypto.num index 95bccf9..62e2ea2 100644 --- a/util/libcrypto.num +++ b/util/libcrypto.num @@ -4632,3 +4632,5 @@ X509_REQ_get0_sm2_id 6385 1_1_1m EXIST::FUNCTION:SM2 X509_REQ_set0_sm2_id 6386 1_1_1m EXIST::FUNCTION:SM2 EVP_PKEY_is_sm2 6387 1_1_1m EXIST::FUNCTION:SM2 SM2_compute_key 6388 1_1_1m EXIST::FUNCTION: +EVP_sm4_xts 6389 1_1_1m EXIST::FUNCTION:SM4 +CRYPTO_xts128gb_encrypt 6390 1_1_1m EXIST::FUNCTION: \ No newline at end of file -- 2.36.1
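For reference, a minimal usage sketch of the new cipher follows, assuming this patch is applied to OpenSSL 1.1.1; the wrapper sm4_xts_encrypt_once and its error handling are illustrative, not part of the patch. It follows the rules stated in the EVP_sm4_xts.pod hunk above: a 256-bit key (two SM4 keys back to back), a 16-byte IV carrying the first tweak, an optional EVP_CTRL_XTS_STANDARD control (0 = GB/T 17964-2021, the default; 1 = IEEE Std 1619-2007), and exactly one EVP_EncryptUpdate() call, since XTS does not support streaming.

    #include <openssl/evp.h>

    /* Hypothetical helper: one-shot SM4-XTS encryption of a buffer
     * (inlen must be at least 16, the SM4 block size). */
    int sm4_xts_encrypt_once(const unsigned char key[32],
                             const unsigned char iv[16],
                             const unsigned char *in, int inlen,
                             unsigned char *out, int use_ieee_tweak)
    {
        EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
        int outlen = 0, ok = 0;

        if (ctx == NULL)
            return 0;
        if (EVP_EncryptInit_ex(ctx, EVP_sm4_xts(), NULL, key, iv) != 1)
            goto done;
        /* 0 selects GB/T 17964-2021 (the default), 1 selects IEEE Std 1619-2007 */
        if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_XTS_STANDARD,
                                use_ieee_tweak ? 1 : 0, NULL) != 1)
            goto done;
        /* XTS does not stream: exactly one update call per init */
        if (EVP_EncryptUpdate(ctx, out, &outlen, in, inlen) != 1)
            goto done;
        ok = (outlen == inlen);
    done:
        EVP_CIPHER_CTX_free(ctx);
        return ok;
    }

With the test vectors from the evpciph.txt hunk, both settings produce the same first 16 ciphertext bytes and diverge from the second block on, matching the behaviour described in the documentation.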
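The two standards differ only in how the tweak is advanced between blocks. The following helper (the name gb_xts_next_tweak is mine) transcribes the byte-wise fallback from the CRYPTO_xts128gb_encrypt() hunk above: GB/T 17964-2021 shifts the 128-bit tweak right by one bit, with bits flowing from t[0] towards t[15], and folds 0xe1 into t[0] when a bit falls off the end, whereas the IEEE Std 1619-2007 update in the existing CRYPTO_xts128_encrypt() shifts left and folds 0x87 back in.

    /* GB/T 17964-2021 tweak update, transcribed from the byte-wise branch
     * of CRYPTO_xts128gb_encrypt(); helper name is illustrative. */
    static void gb_xts_next_tweak(unsigned char t[16])
    {
        unsigned char cin = 0, cout = 0;
        int i;

        for (i = 0; i < 16; ++i) {
            cout = (unsigned char)((t[i] << 7) & 0x80); /* bit shifted out of t[i] */
            t[i] = (unsigned char)(((t[i] >> 1) + cin) & 0xff);
            cin = cout;
        }
        if (cout)               /* feedback of the reflected polynomial */
            t[0] ^= 0xe1;
    }

Both updates implement multiplication by x in GF(2^128); 0xe1 is 0x87 with its bits reversed, which is why the assembly above can serve both standards with a single multiply routine, bit-reversing the tweak (rbit) on the GB path.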
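Finally, the object-table hunks register the algorithm as NID 1196 with OID 1.2.156.10197.1.104.10 (short name "SM4-XTS", long name "sm4-xts"), and the c_allc.c hunk adds it to the default cipher table, so the cipher is reachable by name. A small lookup sketch, again assuming the patched OpenSSL 1.1.1:

    #include <stdio.h>
    #include <openssl/evp.h>

    int main(void)
    {
        /* EVP_get_cipherbyname() auto-registers the built-in ciphers in 1.1.1 */
        const EVP_CIPHER *c = EVP_get_cipherbyname("SM4-XTS");

        if (c == NULL)
            return 1;
        /* Expected from this patch: nid=1196 key=32 iv=16 block=1 */
        printf("nid=%d key=%d iv=%d block=%d\n",
               EVP_CIPHER_nid(c), EVP_CIPHER_key_length(c),
               EVP_CIPHER_iv_length(c), EVP_CIPHER_block_size(c));
        return 0;
    }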