Projects
openEuler:24.03:SP1:Everything
compat-openssl11
_service:tar_scm:Backport-SM4-optimization-for-...
Sign Up
Log In
Username
Password
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File _service:tar_scm:Backport-SM4-optimization-for-ARM-by-HW-instruction.patch of Package compat-openssl11
From 4f7e522f7fda2c55c4915396d08f8c9cf3b3fba8 Mon Sep 17 00:00:00 2001 From: Xu Yizhou <xuyizhou1@huawei.com> Date: Fri, 28 Oct 2022 11:24:28 +0800 Subject: [PATCH 2/3] SM4 optimization for ARM by HW instruction This patch is a copy of the following PR, with some extra supporting code. 1. SM4 optimization for ARM by HW instruction This patch implements the SM4 optimization for ARM processor, using SM4 HW instruction, which is an optional feature of crypto extension for aarch64 V8. Tested on some modern ARM micro-architectures with SM4 support, the performance uplift can be observed around 8X~40X over existing C implementation in openssl. Algorithms that can be parallelized (like CTR, ECB, CBC decryption) are on higher end, with algorithm like CBC encryption on lower end (due to inter-block dependency) Perf data on Yitian-710 2.75GHz hardware, before and after optimization: Before: type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes SM4-CTR 105787.80k 107837.87k 108380.84k 108462.08k 108549.46k 108554.92k SM4-ECB 111924.58k 118173.76k 119776.00k 120093.70k 120264.02k 120274.94k SM4-CBC 106428.09k 109190.98k 109674.33k 109774.51k 109827.41k 109827.41k After (7.4x - 36.6x faster): type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes SM4-CTR 781979.02k 2432994.28k 3437753.86k 3834177.88k 3963715.58k 3974556.33k SM4-ECB 937590.69k 2941689.02k 3945751.81k 4328655.87k 4459181.40k 4468692.31k SM4-CBC 890639.88k 1027746.58k 1050621.78k 1056696.66k 1058613.93k 1058701.31k Signed-off-by: Daniel Hu <Daniel.Hu@arm.com> Reviewed-by: Paul Dale <pauli@openssl.org> Reviewed-by: Tomas Mraz <tomas@openssl.org> (Merged from https://github.com/openssl/openssl/pull/17455\) Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com> --- Configurations/00-base-templates.conf | 2 +- Configure | 3 +- crypto/arm64cpuid.pl | 7 + crypto/arm_arch.h | 1 + crypto/armcap.c | 10 + crypto/evp/e_sm4.c | 88 ++-- crypto/sm4/asm/sm4-armv8.pl | 629 ++++++++++++++++++++++++++ crypto/sm4/build.info | 13 +- include/crypto/sm4_platform.h | 70 +++ 9 files changed, 788 insertions(+), 35 deletions(-) create mode 100644 crypto/sm4/asm/sm4-armv8.pl create mode 100644 include/crypto/sm4_platform.h diff --git a/Configurations/00-base-templates.conf b/Configurations/00-base-templates.conf index a67ae65..a26d081 100644 --- a/Configurations/00-base-templates.conf +++ b/Configurations/00-base-templates.conf @@ -321,7 +321,7 @@ my %targets=( chacha_asm_src => "chacha-armv8.S", poly1305_asm_src=> "poly1305-armv8.S", keccak1600_asm_src => "keccak1600-armv8.S", - sm4_asm_src => "vpsm4_ex-armv8.S", + sm4_asm_src => "sm4-armv8.S vpsm4_ex-armv8.S", sm3_asm_src => "sm3-armv8.S", }, parisc11_asm => { diff --git a/Configure b/Configure index fce460d..d013204 100755 --- a/Configure +++ b/Configure @@ -1421,7 +1421,8 @@ unless ($disabled{asm}) { push @{$config{lib_defines}}, "POLY1305_ASM"; } if ($target{sm4_asm_src} ne "") { - push @{$config{lib_defines}}, "VPSM4_EX_ASM"; + push @{$config{lib_defines}}, "SM4_ASM" if ($target{sm4_asm_src} =~ m/sm4/); + push @{$config{lib_defines}}, "VPSM4_EX_ASM" if ($target{sm4_asm_src} =~ m/vpsm4_ex/); } if ($target{sm3_asm_src} ne "") { push @{$config{lib_defines}}, "SM3_ASM"; diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl index 1e9b167..341167b 100755 --- a/crypto/arm64cpuid.pl +++ b/crypto/arm64cpuid.pl @@ -71,6 +71,13 @@ _armv8_pmull_probe: ret .size _armv8_pmull_probe,.-_armv8_pmull_probe +.globl _armv8_sm4_probe +.type _armv8_sm4_probe,%function +_armv8_sm4_probe: + .long 0xcec08400 // sm4e v0.4s, v0.4s + ret +.size _armv8_sm4_probe,.-_armv8_sm4_probe + .globl _armv8_sha512_probe .type _armv8_sha512_probe,%function _armv8_sha512_probe: diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h index 8839b21..0f6f7ca 100644 --- a/crypto/arm_arch.h +++ b/crypto/arm_arch.h @@ -81,5 +81,6 @@ extern unsigned int OPENSSL_armcap_P; # define ARMV8_PMULL (1<<5) # define ARMV8_SHA512 (1<<6) # define ARMV8_SM3 (1<<9) +# define ARMV8_SM4 (1<<10) #endif diff --git a/crypto/armcap.c b/crypto/armcap.c index 8b2f4a5..73bcad1 100644 --- a/crypto/armcap.c +++ b/crypto/armcap.c @@ -48,6 +48,7 @@ void _armv8_sha256_probe(void); void _armv8_pmull_probe(void); # ifdef __aarch64__ void _armv8_sm3_probe(void); +void _armv8_sm4_probe(void); void _armv8_sha512_probe(void); # endif uint32_t _armv7_tick(void); @@ -132,6 +133,7 @@ static unsigned long getauxval(unsigned long key) # define HWCAP_CE_SHA1 (1 << 5) # define HWCAP_CE_SHA256 (1 << 6) # define HWCAP_CE_SM3 (1 << 18) +# define HWCAP_CE_SM4 (1 << 19) # define HWCAP_CE_SHA512 (1 << 21) # endif @@ -190,6 +192,9 @@ void OPENSSL_cpuid_setup(void) OPENSSL_armcap_P |= ARMV8_SHA256; # ifdef __aarch64__ + if (hwcap & HWCAP_CE_SM4) + OPENSSL_armcap_P |= ARMV8_SM4; + if (hwcap & HWCAP_CE_SHA512) OPENSSL_armcap_P |= ARMV8_SHA512; @@ -234,6 +239,11 @@ void OPENSSL_cpuid_setup(void) OPENSSL_armcap_P |= ARMV8_SHA256; } # if defined(__aarch64__) && !defined(__APPLE__) + if (sigsetjmp(ill_jmp, 1) == 0) { + _armv8_sm4_probe(); + OPENSSL_armcap_P |= ARMV8_SM4; + } + if (sigsetjmp(ill_jmp, 1) == 0) { _armv8_sha512_probe(); OPENSSL_armcap_P |= ARMV8_SHA512; diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c index 169d6c7..eaa5ba0 100644 --- a/crypto/evp/e_sm4.c +++ b/crypto/evp/e_sm4.c @@ -15,17 +15,11 @@ # include <openssl/modes.h> # include "crypto/sm4.h" # include "crypto/evp.h" +# include "crypto/sm4_platform.h" # include "evp_local.h" # include "modes_local.h" -#if defined(OPENSSL_CPUID_OBJ) && (defined(__arm__) || defined(__arm) || defined(__aarch64__)) -# include "arm_arch.h" -# if __ARM_MAX_ARCH__>=7 -# if defined(VPSM4_EX_ASM) -# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) -# endif -# endif -#endif + typedef struct { union { @@ -35,28 +29,11 @@ typedef struct { block128_f block; union { ecb128_f ecb; + cbc128_f cbc; + ctr128_f ctr; } stream; } EVP_SM4_KEY; -#ifdef VPSM4_EX_CAPABLE -void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); -void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); -#define vpsm4_ex_encrypt SM4_encrypt -#define vpsm4_ex_decrypt SM4_encrypt -void vpsm4_ex_ecb_encrypt( - const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key, const int enc); -/* xts mode in GB/T 17964-2021 */ -void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, - const SM4_KEY *key2, const uint8_t iv[16]); -void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, - const SM4_KEY *key2, const uint8_t iv[16]); -/* xts mode in IEEE Std 1619-2007 */ -void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, - const SM4_KEY *key2, const uint8_t iv[16]); -void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, - const SM4_KEY *key2, const uint8_t iv[16]); -#endif - # define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \ static const EVP_CIPHER sm4_##mode = { \ nid##_##nmode,blocksize,128/8,ivlen, \ @@ -84,6 +61,21 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, mode = EVP_CIPHER_CTX_mode(ctx); if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) { +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + HWSM4_set_decrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) HWSM4_decrypt; + dat->stream.cbc = NULL; +# ifdef HWSM4_cbc_encrypt + if (mode == EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; +# endif +# ifdef HWSM4_ecb_encrypt + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; +# endif + } else +#endif #ifdef VPSM4_EX_CAPABLE if (VPSM4_EX_CAPABLE) { vpsm4_ex_set_decrypt_key(key, &dat->ks.ks); @@ -97,6 +89,29 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, SM4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); } } else { +#ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + HWSM4_set_encrypt_key(key, &dat->ks.ks); + dat->block = (block128_f) HWSM4_encrypt; + dat->stream.cbc = NULL; +# ifdef HWSM4_cbc_encrypt + if (mode == EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; + else +# endif +# ifdef HWSM4_ecb_encrypt + if (mode == EVP_CIPH_ECB_MODE) + dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; + else +# endif +# ifdef HWSM4_ctr32_encrypt_blocks + if (mode == EVP_CIPH_CTR_MODE) + dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks; + else +# endif + (void)0; /* terminate potentially open 'else' */ + } else +#endif #ifdef VPSM4_EX_CAPABLE if (VPSM4_EX_CAPABLE) { vpsm4_ex_set_encrypt_key(key, &dat->ks.ks); @@ -118,7 +133,10 @@ static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, { EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); - if (EVP_CIPHER_CTX_encrypting(ctx)) + if (dat->stream.cbc) + (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv, + EVP_CIPHER_CTX_encrypting(ctx)); + else if (EVP_CIPHER_CTX_encrypting(ctx)) CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, EVP_CIPHER_CTX_iv_noconst(ctx), dat->block); else @@ -183,10 +201,16 @@ static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, return 0; num = (unsigned int)n; - CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, - ctx->iv, - EVP_CIPHER_CTX_buf_noconst(ctx), &num, - dat->block); + if (dat->stream.ctr) + CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks, + ctx->iv, + EVP_CIPHER_CTX_buf_noconst(ctx), + &num, dat->stream.ctr); + else + CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, + ctx->iv, + EVP_CIPHER_CTX_buf_noconst(ctx), &num, + dat->block); EVP_CIPHER_CTX_set_num(ctx, num); return 1; } diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl new file mode 100644 index 0000000..dbacad2 --- /dev/null +++ b/crypto/sm4/asm/sm4-armv8.pl @@ -0,0 +1,629 @@ +#! /usr/bin/env perl +# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# This module implements support for SM4 hw support on aarch64 +# Oct 2021 +# + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$prefix="sm4_v8"; +my @rks=map("v$_",(0..7)); + +sub rev32() { +my $dst = shift; +my $src = shift; +$code.=<<___; +#ifndef __ARMEB__ + rev32 $dst.16b,$src.16b +#endif +___ +} + +sub enc_blk () { +my $data = shift; +$code.=<<___; + sm4e $data.4s,@rks[0].4s + sm4e $data.4s,@rks[1].4s + sm4e $data.4s,@rks[2].4s + sm4e $data.4s,@rks[3].4s + sm4e $data.4s,@rks[4].4s + sm4e $data.4s,@rks[5].4s + sm4e $data.4s,@rks[6].4s + sm4e $data.4s,@rks[7].4s + rev64 $data.4S,$data.4S + ext $data.16b,$data.16b,$data.16b,#8 +___ +} + +sub enc_4blks () { +my $data0 = shift; +my $data1 = shift; +my $data2 = shift; +my $data3 = shift; +$code.=<<___; + sm4e $data0.4s,@rks[0].4s + sm4e $data1.4s,@rks[0].4s + sm4e $data2.4s,@rks[0].4s + sm4e $data3.4s,@rks[0].4s + + sm4e $data0.4s,@rks[1].4s + sm4e $data1.4s,@rks[1].4s + sm4e $data2.4s,@rks[1].4s + sm4e $data3.4s,@rks[1].4s + + sm4e $data0.4s,@rks[2].4s + sm4e $data1.4s,@rks[2].4s + sm4e $data2.4s,@rks[2].4s + sm4e $data3.4s,@rks[2].4s + + sm4e $data0.4s,@rks[3].4s + sm4e $data1.4s,@rks[3].4s + sm4e $data2.4s,@rks[3].4s + sm4e $data3.4s,@rks[3].4s + + sm4e $data0.4s,@rks[4].4s + sm4e $data1.4s,@rks[4].4s + sm4e $data2.4s,@rks[4].4s + sm4e $data3.4s,@rks[4].4s + + sm4e $data0.4s,@rks[5].4s + sm4e $data1.4s,@rks[5].4s + sm4e $data2.4s,@rks[5].4s + sm4e $data3.4s,@rks[5].4s + + sm4e $data0.4s,@rks[6].4s + sm4e $data1.4s,@rks[6].4s + sm4e $data2.4s,@rks[6].4s + sm4e $data3.4s,@rks[6].4s + + sm4e $data0.4s,@rks[7].4s + rev64 $data0.4S,$data0.4S + sm4e $data1.4s,@rks[7].4s + ext $data0.16b,$data0.16b,$data0.16b,#8 + rev64 $data1.4S,$data1.4S + sm4e $data2.4s,@rks[7].4s + ext $data1.16b,$data1.16b,$data1.16b,#8 + rev64 $data2.4S,$data2.4S + sm4e $data3.4s,@rks[7].4s + ext $data2.16b,$data2.16b,$data2.16b,#8 + rev64 $data3.4S,$data3.4S + ext $data3.16b,$data3.16b,$data3.16b,#8 +___ +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8-a+crypto +.text +___ + +{{{ +$code.=<<___; +.align 6 +.Lck: + .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 + .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 + .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 + .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 + .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 + .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 + .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: + .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +my ($tmp)=("x2"); +my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7)); +my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); +my ($fkconst) = ("v24"); +$code.=<<___; +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: + ld1 {$key0.4s},[$key] + adr $tmp,.Lfk + ld1 {$fkconst.4s},[$tmp] + adr $tmp,.Lck + ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 +___ + &rev32($key0, $key0); +$code.=<<___; + ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] + eor $key0.16b,$key0.16b,$fkconst.16b; + sm4ekey $key0.4S,$key0.4S,$const0.4S + sm4ekey $key1.4S,$key0.4S,$const1.4S + sm4ekey $key2.4S,$key1.4S,$const2.4S + sm4ekey $key3.4S,$key2.4S,$const3.4S + sm4ekey $key4.4S,$key3.4S,$const4.4S + st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64 + sm4ekey $key5.4S,$key4.4S,$const5.4S + sm4ekey $key6.4S,$key5.4S,$const6.4S + sm4ekey $key7.4S,$key6.4S,$const7.4S + st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys] + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +my ($tmp)=("x2"); +my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7)); +my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); +my ($fkconst) = ("v24"); +$code.=<<___; +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: + ld1 {$key0.4s},[$key] + adr $tmp,.Lfk + ld1 {$fkconst.4s},[$tmp] + adr $tmp, .Lck + ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 +___ + &rev32($key0, $key0); +$code.=<<___; + ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] + eor $key0.16b, $key0.16b,$fkconst.16b; + sm4ekey $key0.4S,$key0.4S,$const0.4S + sm4ekey $key1.4S,$key0.4S,$const1.4S + sm4ekey $key2.4S,$key1.4S,$const2.4S + rev64 $key0.4s,$key0.4s + rev64 $key1.4s,$key1.4s + ext $key0.16b,$key0.16b,$key0.16b,#8 + ext $key1.16b,$key1.16b,$key1.16b,#8 + sm4ekey $key3.4S,$key2.4S,$const3.4S + sm4ekey $key4.4S,$key3.4S,$const4.4S + rev64 $key2.4s,$key2.4s + rev64 $key3.4s,$key3.4s + ext $key2.16b,$key2.16b,$key2.16b,#8 + ext $key3.16b,$key3.16b,$key3.16b,#8 + sm4ekey $key5.4S,$key4.4S,$const5.4S + sm4ekey $key6.4S,$key5.4S,$const6.4S + rev64 $key4.4s,$key4.4s + rev64 $key5.4s,$key5.4s + ext $key4.16b,$key4.16b,$key4.16b,#8 + ext $key5.16b,$key5.16b,$key5.16b,#8 + sm4ekey $key7.4S,$key6.4S,$const7.4S + rev64 $key6.4s, $key6.4s + rev64 $key7.4s, $key7.4s + ext $key6.16b,$key6.16b,$key6.16b,#8 + ext $key7.16b,$key7.16b,$key7.16b,#8 + st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64 + st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys] + ret +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} + +{{{ +sub gen_block () { +my $dir = shift; +my ($inp,$out,$rk)=map("x$_",(0..2)); +my ($data)=("v16"); +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + ld1 {$data.4s},[$inp] + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +___ + &rev32($data,$data); + &enc_blk($data); + &rev32($data,$data); +$code.=<<___; + st1 {$data.4s},[$out] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} + +&gen_block("en"); +&gen_block("de"); +}}} + +{{{ +my ($inp,$out,$len,$rk)=map("x$_",(0..3)); +my ($enc) = ("w4"); +my @dat=map("v$_",(16..23)); +$code.=<<___; +.globl ${prefix}_ecb_encrypt +.type ${prefix}_ecb_encrypt,%function +.align 5 +${prefix}_ecb_encrypt: + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +1: + cmp $len,#64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 + cmp $len,#128 + b.lt 2f + ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 + // 8 blocks +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); +$code.=<<___; + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 +___ + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,#128 + b.gt 1b + ret + // 4 blocks +2: +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.gt 1b +1: + subs $len,$len,#16 + b.lt 1f + ld1 {@dat[0].4s},[$inp],#16 +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + st1 {@dat[0].4s},[$out],#16 + b.ne 1b +1: + ret +.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt +___ +}}} + +{{{ +my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); +my ($enc) = ("w5"); +my @dat=map("v$_",(16..23)); +my @in=map("v$_",(24..31)); +my ($ivec) = ("v8"); +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: + stp d8,d9,[sp, #-16]! + + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] + ld1 {$ivec.4s},[$ivp] + cmp $enc,#0 + b.eq .Ldec +1: + cmp $len, #64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 + eor @dat[0].16b,@dat[0].16b,$ivec.16b +___ + &rev32(@dat[1],@dat[1]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &enc_blk(@dat[0]); +$code.=<<___; + eor @dat[1].16b,@dat[1].16b,@dat[0].16b +___ + &enc_blk(@dat[1]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[2].16b,@dat[2].16b,@dat[1].16b +___ + &enc_blk(@dat[2]); + &rev32(@dat[1],@dat[1]); +$code.=<<___; + eor @dat[3].16b,@dat[3].16b,@dat[2].16b +___ + &enc_blk(@dat[3]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + mov $ivec.16b,@dat[3].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.ne 1b +1: + subs $len,$len,#16 + b.lt 3f + ld1 {@dat[0].4s},[$inp],#16 + eor $ivec.16b,$ivec.16b,@dat[0].16b +___ + &rev32($ivec,$ivec); + &enc_blk($ivec); + &rev32($ivec,$ivec); +$code.=<<___; + st1 {$ivec.16b},[$out],#16 + b.ne 1b + b 3f +.Ldec: +1: + cmp $len, #64 + b.lt 1f + ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp] + ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 + cmp $len,#128 + b.lt 2f + // 8 blocks mode + ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp] + ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],$dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],$dat[7]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + eor @dat[1].16b,@dat[1].16b,@in[0].16b + eor @dat[2].16b,@dat[2].16b,@in[1].16b + mov $ivec.16b,@in[7].16b + eor @dat[3].16b,$dat[3].16b,@in[2].16b + eor @dat[4].16b,$dat[4].16b,@in[3].16b + eor @dat[5].16b,$dat[5].16b,@in[4].16b + eor @dat[6].16b,$dat[6].16b,@in[5].16b + eor @dat[7].16b,$dat[7].16b,@in[6].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,128 + b.gt 1b + b 3f + // 4 blocks mode +2: +___ + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],$dat[3]); + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + eor @dat[1].16b,@dat[1].16b,@in[0].16b + mov $ivec.16b,@in[3].16b + eor @dat[2].16b,@dat[2].16b,@in[1].16b + eor @dat[3].16b,$dat[3].16b,@in[2].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#64 + b.gt 1b +1: + subs $len,$len,#16 + b.lt 3f + ld1 {@dat[0].4s},[$inp],#16 + mov @in[0].16b,@dat[0].16b +___ + &rev32(@dat[0],@dat[0]); + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,$ivec.16b + mov $ivec.16b,@in[0].16b + st1 {@dat[0].16b},[$out],#16 + b.ne 1b +3: + // save back IV + st1 {$ivec.16b},[$ivp] + ldp d8,d9,[sp],#16 + ret +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} + +{{{ +my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); +my ($ctr)=("w5"); +my @dat=map("v$_",(16..23)); +my @in=map("v$_",(24..31)); +my ($ivec)=("v8"); +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: + stp d8,d9,[sp, #-16]! + + ld1 {$ivec.4s},[$ivp] + ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 + ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] +___ + &rev32($ivec,$ivec); +$code.=<<___; + mov $ctr,$ivec.s[3] +1: + cmp $len,#4 + b.lt 1f + ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 + mov @dat[0].16b,$ivec.16b + mov @dat[1].16b,$ivec.16b + mov @dat[2].16b,$ivec.16b + mov @dat[3].16b,$ivec.16b + add $ctr,$ctr,#1 + mov $dat[1].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[2].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[3].s[3],$ctr + cmp $len,#8 + b.lt 2f + ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 + mov @dat[4].16b,$ivec.16b + mov @dat[5].16b,$ivec.16b + mov @dat[6].16b,$ivec.16b + mov @dat[7].16b,$ivec.16b + add $ctr,$ctr,#1 + mov $dat[4].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[5].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[6].s[3],$ctr + add $ctr,$ctr,#1 + mov @dat[7].s[3],$ctr +___ + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); + &rev32(@dat[4],@dat[4]); + &rev32(@dat[5],@dat[5]); + &rev32(@dat[6],@dat[6]); + &rev32(@dat[7],@dat[7]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,@in[0].16b + eor @dat[1].16b,@dat[1].16b,@in[1].16b + eor @dat[2].16b,@dat[2].16b,@in[2].16b + eor @dat[3].16b,@dat[3].16b,@in[3].16b + eor @dat[4].16b,@dat[4].16b,@in[4].16b + eor @dat[5].16b,@dat[5].16b,@in[5].16b + eor @dat[6].16b,@dat[6].16b,@in[6].16b + eor @dat[7].16b,@dat[7].16b,@in[7].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 + subs $len,$len,#8 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +2: +___ + &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); + &rev32(@dat[0],@dat[0]); + &rev32(@dat[1],@dat[1]); + &rev32(@dat[2],@dat[2]); + &rev32(@dat[3],@dat[3]); +$code.=<<___; + eor @dat[0].16b,@dat[0].16b,@in[0].16b + eor @dat[1].16b,@dat[1].16b,@in[1].16b + eor @dat[2].16b,@dat[2].16b,@in[2].16b + eor @dat[3].16b,@dat[3].16b,@in[3].16b + st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 + subs $len,$len,#4 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +1: + subs $len,$len,#1 + b.lt 3f + mov $dat[0].16b,$ivec.16b + ld1 {@in[0].4s},[$inp],#16 +___ + &enc_blk(@dat[0]); + &rev32(@dat[0],@dat[0]); +$code.=<<___; + eor $dat[0].16b,$dat[0].16b,@in[0].16b + st1 {$dat[0].4s},[$out],#16 + b.eq 3f + add $ctr,$ctr,#1 + mov $ivec.s[3],$ctr + b 1b +3: + ldp d8,d9,[sp],#16 + ret +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} +######################################## +{ my %opcode = ( + "sm4e" => 0xcec08400, + "sm4ekey" => 0xce60c800); + + sub unsm4 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge; + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info index bb042c5..4d26ede 100644 --- a/crypto/sm4/build.info +++ b/crypto/sm4/build.info @@ -2,6 +2,17 @@ LIBS=../../libcrypto SOURCE[../../libcrypto]=\ sm4.c {- $target{sm4_asm_src} -} +GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl $(PERLASM_SCHEME) +INCLUDE[sm4-armv8.o]=.. GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl $(PERLASM_SCHEME) -INCLUDE[vpsm4_ex-armv8.o]=.. \ No newline at end of file +INCLUDE[vpsm4_ex-armv8.o]=.. + +BEGINRAW[Makefile] +##### SM4 assembler implementations + +# GNU make "catch all" +{- $builddir -}/sm4-%.S: {- $sourcedir -}/asm/sm4-%.pl + CC="$(CC)" $(PERL) $< $(PERLASM_SCHEME) $@ + +ENDRAW[Makefile] diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h new file mode 100644 index 0000000..2f5a6cf --- /dev/null +++ b/include/crypto/sm4_platform.h @@ -0,0 +1,70 @@ +/* + * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. + * + * Licensed under the Apache License 2.0 (the "License"). You may not use + * this file except in compliance with the License. You can obtain a copy + * in the file LICENSE in the source distribution or at + * https://www.openssl.org/source/license.html + */ + +#ifndef OSSL_SM4_PLATFORM_H +# define OSSL_SM4_PLATFORM_H +# pragma once + +# if defined(OPENSSL_CPUID_OBJ) +# if (defined(__arm__) || defined(__arm) || defined(__aarch64__)) +# include "arm_arch.h" +# if __ARM_MAX_ARCH__>=7 +# if defined(VPSM4_EX_ASM) +# define VPSM4_EX_CAPABLE (OPENSSL_armcap_P & ARMV8_AES) +# endif +# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4) +# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key +# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key +# define HWSM4_encrypt sm4_v8_encrypt +# define HWSM4_decrypt sm4_v8_decrypt +# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt +# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt +# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks +# endif +# endif +# endif /* OPENSSL_CPUID_OBJ */ + +# if defined(HWSM4_CAPABLE) +int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +void HWSM4_encrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void HWSM4_decrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + unsigned char *ivec, const int enc); +void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + const int enc); +void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + const unsigned char ivec[16]); +# endif /* HWSM4_CAPABLE */ + +#ifdef VPSM4_EX_CAPABLE +void vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +void vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +#define vpsm4_ex_encrypt SM4_encrypt +#define vpsm4_ex_decrypt SM4_encrypt +void vpsm4_ex_ecb_encrypt( + const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key, const int enc); +/* xts mode in GB/T 17964-2021 */ +void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void vpsm4_ex_xts_decrypt_gb(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +/* xts mode in IEEE Std 1619-2007 */ +void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +void vpsm4_ex_xts_decrypt(const unsigned char *in, unsigned char *out, size_t length, const SM4_KEY *key1, + const SM4_KEY *key2, const uint8_t iv[16]); +#endif /* VPSM4_EX_CAPABLE */ + +#endif /* OSSL_SM4_PLATFORM_H */ \ No newline at end of file -- 2.36.1
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.
浙ICP备2022010568号-2