File _service:tar_scm:Backport-SM4-AESE-optimization-for-ARMv8.patch of Package openssl
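The patch below backports the SM4 AESE optimization and, among other things, adds XTS entry points whose per-block tweak update (the compute_tweak/compute_tweak_vec helpers further down) multiplies the 128-bit tweak by alpha in GF(2^128) using the reduction constant 0x87; the _gb variants additionally bit-reverse the tweak with rbit so the same doubling serves the GB/T 17964 bit order. For reference only, and not part of the patch, a minimal scalar C sketch of that tweak update:

#include <stdint.h>

/*
 * Multiply a 128-bit XTS tweak (byte 0 = least significant) by alpha in
 * GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1,
 * i.e. the constant 0x87.  This mirrors what compute_tweak does on two
 * 64-bit halves and compute_tweak_vec does on NEON lanes.
 */
static void xts_next_tweak(uint8_t t[16])
{
    uint8_t carry = 0;

    for (int i = 0; i < 16; i++) {
        uint8_t hi = (uint8_t)(t[i] >> 7);      /* bit shifted out of this byte */
        t[i] = (uint8_t)((t[i] << 1) | carry);
        carry = hi;
    }
    if (carry)
        t[0] ^= 0x87;                           /* reduce modulo the polynomial */
}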
From 730387aebda57a1bb0af5a74747d4dadc5e033f7 Mon Sep 17 00:00:00 2001 From: Xu Yizhou <xuyizhou1@huawei.com> Date: Wed, 18 Jan 2023 09:55:02 +0800 Subject: [PATCH 12/13] SM4 AESE optimization for ARMv8 Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com> Reviewed-by: Tomas Mraz <tomas@openssl.org> Reviewed-by: Paul Dale <pauli@openssl.org> (Merged from https://github.com/openssl/openssl/pull/19914) --- crypto/sm4/asm/vpsm4-armv8.pl | 458 +++++ crypto/sm4/asm/vpsm4_ex-armv8.pl | 1544 +++++++++++++++++ crypto/sm4/build.info | 4 +- include/crypto/sm4_platform.h | 41 +- .../implementations/ciphers/cipher_sm4_hw.c | 26 +- .../implementations/ciphers/cipher_sm4_xts.c | 4 +- .../implementations/ciphers/cipher_sm4_xts.h | 2 +- .../ciphers/cipher_sm4_xts_hw.c | 33 +- 8 files changed, 2090 insertions(+), 22 deletions(-) create mode 100644 crypto/sm4/asm/vpsm4_ex-armv8.pl diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl index 73797af582..e19de30901 100755 --- a/crypto/sm4/asm/vpsm4-armv8.pl +++ b/crypto/sm4/asm/vpsm4-armv8.pl @@ -28,6 +28,7 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\"" $prefix="vpsm4"; my @vtmp=map("v$_",(0..3)); +my @qtmp=map("q$_",(0..3)); my @data=map("v$_",(4..7)); my @datax=map("v$_",(8..11)); my ($rk0,$rk1)=("v12","v13"); @@ -36,6 +37,7 @@ my @vtmpx=map("v$_",(12..15)); my @sbox=map("v$_",(16..31)); my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3"); my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9"); +my ($xtmp1,$xtmp2)=("x8","x9"); my ($ptr,$counter)=("x10","w11"); my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15"); @@ -60,6 +62,51 @@ ___ } } +sub rev32_armeb() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifdef __AARCH64EB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifdef __AARCH64EB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub rbit() { + my $dst = shift; + my $src = shift; + my $std = shift; + + if ($src and ("$src" ne "$dst")) { + if ($std eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } else { +$code.=<<___; + mov $dst.16b,$src.16b +___ + } + } else { + if ($std eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } + } +} + sub transpose() { my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; @@ -435,6 +482,58 @@ $code.=<<___; ___ } + +sub mov_reg_to_vec() { + my $src0 = shift; + my $src1 = shift; + my $desv = shift; +$code.=<<___; + mov $desv.d[0],$src0 + mov $desv.d[1],$src1 +___ + &rev32_armeb($desv,$desv); +} + +sub mov_vec_to_reg() { + my $srcv = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $des0,$srcv.d[0] + mov $des1,$srcv.d[1] +___ +} + +sub compute_tweak() { + my $src0 = shift; + my $src1 = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $wtmp0,0x87 + extr $xtmp2,$src1,$src1,#32 + extr $des1,$src1,$src0,#63 + and $wtmp1,$wtmp0,$wtmp2,asr#31 + eor $des0,$xtmp1,$src0,lsl#1 +___ +} + +sub compute_tweak_vec() { + my $src = shift; + my $des = shift; + my $std = shift; + &rbit(@vtmp[2],$src,$std); +$code.=<<___; + ldr @qtmp[0], =0x01010101010101010101010101010187 + shl $des.16b, @vtmp[2].16b, #1 + ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 + ushr @vtmp[1].16b, @vtmp[1].16b, #7 + mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b + eor $des.16b, $des.16b, @vtmp[1].16b +___ + &rbit($des,$des,$std); +} + $code=<<___; #include "arm_arch.h" .arch armv8-a @@ -1101,6 +1200,365 @@ $code.=<<___; .size 
${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ___ }}} + +{{{ +my ($blocks,$len)=("x2","x2"); +my $ivp=("x5"); +my @twx=map("x$_",(12..27)); +my ($rks1,$rks2)=("x26","x27"); +my $lastBlk=("x26"); +my $enc=("w28"); +my $remain=("x29"); + +my @tweak=@datax; + +sub gen_xts_cipher() { + my $std = shift; +$code.=<<___; +.globl ${prefix}_xts_encrypt${std} +.type ${prefix}_xts_encrypt${std},%function +.align 5 +${prefix}_xts_encrypt${std}: + AARCH64_SIGN_LINK_REGISTER + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! + mov $rks1,x3 + mov $rks2,x4 + mov $enc,w6 + ld1 {@tweak[0].4s}, [$ivp] + mov $rks,$rks2 +___ + &load_sbox(); + &rev32(@tweak[0],@tweak[0]); + &encrypt_1blk(@tweak[0]); +$code.=<<___; + mov $rks,$rks1 + and $remain,$len,#0x0F + // convert length into blocks + lsr $blocks,$len,4 + cmp $blocks,#1 + b.lt .return${std} + + cmp $remain,0 + // If the encryption/decryption Length is N times of 16, + // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std} + b.eq .xts_encrypt_blocks${std} + + // If the encryption/decryption length is not N times of 16, + // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std} + // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std} + subs $blocks,$blocks,#1 + b.eq .only_2blks_tweak${std} +.xts_encrypt_blocks${std}: +___ + &rbit(@tweak[0],@tweak[0],$std); + &rev32_armeb(@tweak[0],@tweak[0]); + &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; +.Lxts_8_blocks_process${std}: + cmp $blocks,#8 + b.lt .Lxts_4_blocks_process${std} +___ + &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]); + &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]); + &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]); + &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]); + &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]); + &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]); + &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]); + &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]); +$code.=<<___; + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@vtmp[0],@vtmp[0],$std); + &rbit(@vtmp[1],@vtmp[1],$std); + &rbit(@vtmp[2],@vtmp[2],$std); + &rbit(@vtmp[3],@vtmp[3],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @vtmp[0].16b + eor @data[1].16b, @data[1].16b, @vtmp[1].16b + eor @data[2].16b, @data[2].16b, @vtmp[2].16b + eor @data[3].16b, @data[3].16b, @vtmp[3].16b + ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ + &rbit(@vtmpx[0],@vtmpx[0],$std); + &rbit(@vtmpx[1],@vtmpx[1],$std); + &rbit(@vtmpx[2],@vtmpx[2],$std); + &rbit(@vtmpx[3],@vtmpx[3],$std); +$code.=<<___; + eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b + eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b + eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b + eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + 
&rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],@datax[3]); + &transpose(@data,@vtmp); + &transpose(@datax,@vtmp); +$code.=<<___; + bl _${prefix}_enc_8blks +___ + &transpose(@vtmp,@datax); + &transpose(@data,@datax); + + &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]); + &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); + &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); + &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); + &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); + &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); + &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); + &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); + &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b + + // save the last tweak + st1 {@tweak[3].4s},[$ivp] + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lxts_8_blocks_process${std} + b 100f +.Lxts_4_blocks_process${std}: +___ + &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); + &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); + &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); + &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); +$code.=<<___; + cmp $blocks,#4 + b.lt 1f + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); + &rbit(@tweak[2],@tweak[2],$std); + &rbit(@tweak[3],@tweak[3],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 +___ + &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]); + &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]); + &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]); +$code.=<<___; + // save the last tweak + st1 {@tweak[3].4s},[$ivp] +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp],#16 +___ + &rbit(@tweak[0],@tweak[0],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + st1 
{@data[0].4s},[$outp],#16 + // save the last tweak + st1 {@tweak[0].4s},[$ivp] + b 100f +1: // process last 2 blocks + cmp $blocks,#2 + b.gt 1f + ld1 {@data[0].4s,@data[1].4s},[$inp],#32 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save the last tweak + st1 {@tweak[1].4s},[$ivp] + b 100f +1: // process last 3 blocks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); + &rbit(@tweak[2],@tweak[2],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 + // save the last tweak + st1 {@tweak[2].4s},[$ivp] +100: + cmp $remain,0 + b.eq .return${std} + +// This brance calculates the last two tweaks, +// while the encryption/decryption length is larger than 32 +.last_2blks_tweak${std}: + ld1 {@tweak[0].4s},[$ivp] +___ + &rev32_armeb(@tweak[0],@tweak[0]); + &compute_tweak_vec(@tweak[0],@tweak[1],$std); + &compute_tweak_vec(@tweak[1],@tweak[2],$std); +$code.=<<___; + b .check_dec${std} + + +// This brance calculates the last two tweaks, +// while the encryption/decryption length is equal to 32, who only need two tweaks +.only_2blks_tweak${std}: + mov @tweak[1].16b,@tweak[0].16b +___ + &rev32_armeb(@tweak[1],@tweak[1]); + &compute_tweak_vec(@tweak[1],@tweak[2]); +$code.=<<___; + b .check_dec${std} + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. 
+.check_dec${std}: + // encryption:1 decryption:0 + cmp $enc,1 + b.eq .prcess_last_2blks${std} + mov @vtmp[0].16B,@tweak[1].16b + mov @tweak[1].16B,@tweak[2].16b + mov @tweak[2].16B,@vtmp[0].16b + +.prcess_last_2blks${std}: +___ + &rev32_armeb(@tweak[1],@tweak[1]); + &rev32_armeb(@tweak[2],@tweak[2]); +$code.=<<___; + ld1 {@data[0].4s},[$inp],#16 + eor @data[0].16b, @data[0].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[1].16b + st1 {@data[0].4s},[$outp],#16 + + sub $lastBlk,$outp,16 + .loop${std}: + subs $remain,$remain,1 + ldrb $wtmp0,[$lastBlk,$remain] + ldrb $wtmp1,[$inp,$remain] + strb $wtmp1,[$lastBlk,$remain] + strb $wtmp0,[$outp,$remain] + b.gt .loop${std} + ld1 {@data[0].4s}, [$lastBlk] + eor @data[0].16b, @data[0].16b, @tweak[2].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[2].16b + st1 {@data[0].4s}, [$lastBlk] +.return${std}: + ldp d14, d15, [sp], #0x10 + ldp d12, d13, [sp], #0x10 + ldp d10, d11, [sp], #0x10 + ldp d8, d9, [sp], #0x10 + ldp x29, x30, [sp], #0x10 + ldp x27, x28, [sp], #0x10 + ldp x25, x26, [sp], #0x10 + ldp x23, x24, [sp], #0x10 + ldp x21, x22, [sp], #0x10 + ldp x19, x20, [sp], #0x10 + ldp x17, x18, [sp], #0x10 + ldp x15, x16, [sp], #0x10 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std} +___ +} # end of gen_xts_cipher +&gen_xts_cipher("_gb"); +&gen_xts_cipher(""); +}}} ######################################## open SELF,$0; while(<SELF>) { diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl new file mode 100644 index 0000000000..3d094aa535 --- /dev/null +++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl @@ -0,0 +1,1544 @@ +#! /usr/bin/env perl +# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# This module implements SM4 with ASIMD and AESE on AARCH64 +# +# Dec 2022 +# + +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or +die "can't locate arm-xlate.pl"; + +open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + or die "can't call $xlate: $!"; +*STDOUT=*OUT; + +$prefix="vpsm4_ex"; +my @vtmp=map("v$_",(0..3)); +my @qtmp=map("q$_",(0..3)); +my @data=map("v$_",(4..7)); +my @datax=map("v$_",(8..11)); +my ($rk0,$rk1)=("v12","v13"); +my ($rka,$rkb)=("v14","v15"); +my @vtmpx=map("v$_",(12..15)); +my ($vtmp4,$vtmp5)=("v24","v25"); +my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31"); +my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31"); + +my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3"); +my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9"); +my ($xtmp1,$xtmp2)=("x8","x9"); +my ($ptr,$counter)=("x10","w11"); +my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15"); + +sub rev32() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifndef __AARCH64EB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifndef __AARCH64EB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub rev32_armeb() { + my $dst = shift; + my $src = shift; + + if ($src and ("$src" ne "$dst")) { +$code.=<<___; +#ifdef __AARCH64EB__ + rev32 $dst.16b,$src.16b +#else + mov $dst.16b,$src.16b +#endif +___ + } else { +$code.=<<___; +#ifdef __AARCH64EB__ + rev32 $dst.16b,$dst.16b +#endif +___ + } +} + +sub rbit() { + my $dst = shift; + my $src = shift; + my $std = shift; + + if ($src and ("$src" ne "$dst")) { + if ($std eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } else { +$code.=<<___; + mov $dst.16b,$src.16b +___ + } + } else { + if ($std eq "_gb") { +$code.=<<___; + rbit $dst.16b,$src.16b +___ + } + } +} + +sub transpose() { + my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; + +$code.=<<___; + zip1 $vt0.4s,$dat0.4s,$dat1.4s + zip2 $vt1.4s,$dat0.4s,$dat1.4s + zip1 $vt2.4s,$dat2.4s,$dat3.4s + zip2 $vt3.4s,$dat2.4s,$dat3.4s + zip1 $dat0.2d,$vt0.2d,$vt2.2d + zip2 $dat1.2d,$vt0.2d,$vt2.2d + zip1 $dat2.2d,$vt1.2d,$vt3.2d + zip2 $dat3.2d,$vt1.2d,$vt3.2d +___ +} + +# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) +sub mul_matrix() { + my $x = shift; + my $higherMat = shift; + my $lowerMat = shift; + my $tmp = shift; +$code.=<<___; + ushr $tmp.16b, $x.16b, 4 + and $x.16b, $x.16b, $ANDMaskV.16b + tbl $x.16b, {$lowerMat.16b}, $x.16b + tbl $tmp.16b, {$higherMat.16b}, $tmp.16b + eor $x.16b, $x.16b, $tmp.16b +___ +} + +# sbox operations for 4-lane of words +# sbox operation for 4-lane of words +sub sbox() { + my $dat = shift; + +$code.=<<___; + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4); +$code.=<<___; + mov $dat.16b,@vtmp[0].16b + + // linear transformation + ushr @vtmp[0].4s,$dat.4s,32-2 + ushr @vtmp[1].4s,$dat.4s,32-10 + ushr @vtmp[2].4s,$dat.4s,32-18 + ushr @vtmp[3].4s,$dat.4s,32-24 + sli @vtmp[0].4s,$dat.4s,2 + sli @vtmp[1].4s,$dat.4s,10 + sli @vtmp[2].4s,$dat.4s,18 + sli @vtmp[3].4s,$dat.4s,24 + eor $vtmp4.16b,@vtmp[0].16b,$dat.16b + eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b + eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b + eor 
$dat.16b,$dat.16b,$vtmp4.16b +___ +} + +# sbox operation for 8-lane of words +sub sbox_double() { + my $dat = shift; + my $datx = shift; + +$code.=<<___; + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b + tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); + &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4); +$code.=<<___; + eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b + aese @vtmp[0].16b,$vtmp5.16b + aese @vtmp[1].16b,$vtmp5.16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4); + &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4); +$code.=<<___; + mov $dat.16b,@vtmp[0].16b + mov $datx.16b,@vtmp[1].16b + + // linear transformation + ushr @vtmp[0].4s,$dat.4s,32-2 + ushr $vtmp5.4s,$datx.4s,32-2 + ushr @vtmp[1].4s,$dat.4s,32-10 + ushr @vtmp[2].4s,$dat.4s,32-18 + ushr @vtmp[3].4s,$dat.4s,32-24 + sli @vtmp[0].4s,$dat.4s,2 + sli $vtmp5.4s,$datx.4s,2 + sli @vtmp[1].4s,$dat.4s,10 + sli @vtmp[2].4s,$dat.4s,18 + sli @vtmp[3].4s,$dat.4s,24 + eor $vtmp4.16b,@vtmp[0].16b,$dat.16b + eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b + eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b + eor $dat.16b,$dat.16b,$vtmp4.16b + ushr @vtmp[1].4s,$datx.4s,32-10 + ushr @vtmp[2].4s,$datx.4s,32-18 + ushr @vtmp[3].4s,$datx.4s,32-24 + sli @vtmp[1].4s,$datx.4s,10 + sli @vtmp[2].4s,$datx.4s,18 + sli @vtmp[3].4s,$datx.4s,24 + eor $vtmp4.16b,$vtmp5.16b,$datx.16b + eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b + eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b + eor $datx.16b,$datx.16b,$vtmp4.16b +___ +} + +# sbox operation for one single word +sub sbox_1word () { + my $word = shift; + +$code.=<<___; + mov @vtmp[3].s[0],$word + // optimize sbox using AESE instruction + tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); +$code.=<<___; + + mov $wtmp0,@vtmp[0].s[0] + eor $word,$wtmp0,$wtmp0,ror #32-2 + eor $word,$word,$wtmp0,ror #32-10 + eor $word,$word,$wtmp0,ror #32-18 + eor $word,$word,$wtmp0,ror #32-24 +___ +} + +# sm4 for one block of data, in scalar registers word0/word1/word2/word3 +sub sm4_1blk () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor $tmpw,$word2,$word3 + eor $wtmp2,$wtmp0,$word1 + eor $tmpw,$tmpw,$wtmp2 +___ + &sbox_1word($tmpw); +$code.=<<___; + eor $word0,$word0,$tmpw + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + eor $tmpw,$word2,$word3 + eor $wtmp2,$word0,$wtmp1 + eor $tmpw,$tmpw,$wtmp2 +___ + &sbox_1word($tmpw); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor $word1,$word1,$tmpw + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor $tmpw,$word0,$word1 + eor $wtmp2,$wtmp0,$word3 + eor $tmpw,$tmpw,$wtmp2 +___ + &sbox_1word($tmpw); +$code.=<<___; + eor $word2,$word2,$tmpw + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor $tmpw,$word0,$word1 + eor $wtmp2,$word2,$wtmp1 + eor $tmpw,$tmpw,$wtmp2 +___ + &sbox_1word($tmpw); +$code.=<<___; + eor $word3,$word3,$tmpw +___ +} + +# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 +sub sm4_4blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + eor $rka.16b,@data[2].16b,@data[3].16b + eor $rk0.16b,@data[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 
^ RK1) + eor $rka.16b,$rka.16b,@data[0].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk1.16b + + dup $rk0.4s,$wtmp0 + dup $rk1.4s,$wtmp1 + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rk0.16b,@data[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,$rk0.16b +___ + &sbox($rk0); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + eor $rka.16b,$rka.16b,@data[2].16b + eor $rk1.16b,$rka.16b,$rk1.16b +___ + &sbox($rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk1.16b +___ +} + +# sm4 for 8 lanes of data, in neon registers +# data0/data1/data2/data3 datax0/datax1/datax2/datax3 +sub sm4_8blks () { + my $kptr = shift; + +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[2].16b,@data[3].16b + eor $rkb.16b,@datax[2].16b,@datax[3].16b + eor @vtmp[0].16b,@data[1].16b,$rk0.16b + eor @vtmp[1].16b,@datax[1].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[0].16b,@data[0].16b,$rk0.16b + eor @datax[0].16b,@datax[0].16b,$rk1.16b + + // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[0].16b + eor $rkb.16b,$rkb.16b,@datax[0].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + ldp $wtmp0,$wtmp1,[$kptr],8 + eor @data[1].16b,@data[1].16b,$rk0.16b + eor @datax[1].16b,@datax[1].16b,$rk1.16b + + // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) + dup $rk0.4s,$wtmp0 + eor $rka.16b,@data[0].16b,@data[1].16b + eor $rkb.16b,@datax[0].16b,@datax[1].16b + eor @vtmp[0].16b,@data[3].16b,$rk0.16b + eor @vtmp[1].16b,@datax[3].16b,$rk0.16b + eor $rk0.16b,$rka.16b,@vtmp[0].16b + eor $rk1.16b,$rkb.16b,@vtmp[1].16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[2].16b,@data[2].16b,$rk0.16b + eor @datax[2].16b,@datax[2].16b,$rk1.16b + + // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) + dup $rk1.4s,$wtmp1 + eor $rka.16b,$rka.16b,@data[2].16b + eor $rkb.16b,$rkb.16b,@datax[2].16b + eor $rk0.16b,$rka.16b,$rk1.16b + eor $rk1.16b,$rkb.16b,$rk1.16b +___ + &sbox_double($rk0,$rk1); +$code.=<<___; + eor @data[3].16b,@data[3].16b,$rk0.16b + eor @datax[3].16b,@datax[3].16b,$rk1.16b +___ +} + +sub encrypt_1blk_norev() { + my $dat = shift; + +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 + mov $word0,$dat.s[0] + mov $word1,$dat.s[1] + mov $word2,$dat.s[2] + mov $word3,$dat.s[3] +10: +___ + &sm4_1blk($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b + mov $dat.s[0],$word3 + mov $dat.s[1],$word2 + mov $dat.s[2],$word1 + mov $dat.s[3],$word0 +___ +} + +sub encrypt_1blk() { + my $dat = shift; + + &encrypt_1blk_norev($dat); + &rev32($dat,$dat); +} + +sub encrypt_4blks() { +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 +10: +___ + &sm4_4blks($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); +} + +sub encrypt_8blks() { +$code.=<<___; + mov $ptr,$rks + mov $counter,#8 +10: +___ + &sm4_8blks($ptr); +$code.=<<___; + subs $counter,$counter,#1 + b.ne 10b +___ + &rev32(@vtmp[3],@data[0]); + &rev32(@vtmp[2],@data[1]); + &rev32(@vtmp[1],@data[2]); + &rev32(@vtmp[0],@data[3]); + &rev32(@data[3],@datax[0]); + &rev32(@data[2],@datax[1]); + &rev32(@data[1],@datax[2]); + &rev32(@data[0],@datax[3]); +} + 
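The sbox(), sbox_double() and sbox_1word() routines above replace the table-driven SM4 S-box with the AES S-box exposed by the AESE instruction: the bytes are first permuted through MaskV (which appears to be the inverse ShiftRows pattern, so the ShiftRows step inside AESE restores the original byte order), mapped through one affine transform (TAHMat/TALMat via mul_matrix), run through AESE with an all-zero round key (AddRoundKey is then a no-op, leaving only SubBytes and ShiftRows), and mapped back with a second affine transform (ATAHMat/ATALMat); the rotate-and-XOR sequence that follows implements SM4's linear transformation L. A minimal scalar model of the mul_matrix() nibble-table trick, with hi_tbl/lo_tbl standing in for the TAHMatV/TALMatV (or ATAHMatV/ATALMatV) constants that load_sbox() provides, whose values are not repeated here:

#include <stdint.h>

/*
 * A GF(2) affine byte map evaluated as two 16-entry table lookups, one per
 * nibble, combined with XOR.  This mirrors the ushr/and/tbl/tbl/eor
 * sequence in mul_matrix(), applied independently to each of the 16 bytes
 * of a vector.
 */
static uint8_t affine_nibble_lookup(uint8_t x,
                                    const uint8_t hi_tbl[16],
                                    const uint8_t lo_tbl[16])
{
    return (uint8_t)(lo_tbl[x & 0x0f] ^ hi_tbl[x >> 4]);
}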
+sub load_sbox () { + my $data = shift; + +$code.=<<___; + ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00 + ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00 + ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923 + ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300 + ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b + ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f +___ +} + +sub mov_reg_to_vec() { + my $src0 = shift; + my $src1 = shift; + my $desv = shift; +$code.=<<___; + mov $desv.d[0],$src0 + mov $desv.d[1],$src1 +___ + &rev32_armeb($desv,$desv); +} + +sub mov_vec_to_reg() { + my $srcv = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $des0,$srcv.d[0] + mov $des1,$srcv.d[1] +___ +} + +sub compute_tweak() { + my $src0 = shift; + my $src1 = shift; + my $des0 = shift; + my $des1 = shift; +$code.=<<___; + mov $wtmp0,0x87 + extr $xtmp2,$src1,$src1,#32 + extr $des1,$src1,$src0,#63 + and $wtmp1,$wtmp0,$wtmp2,asr#31 + eor $des0,$xtmp1,$src0,lsl#1 +___ +} + +sub compute_tweak_vec() { + my $src = shift; + my $des = shift; + my $std = shift; + &rbit(@vtmp[2],$src,$std); +$code.=<<___; + ldr @qtmp[0], =0x01010101010101010101010101010187 + shl $des.16b, @vtmp[2].16b, #1 + ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 + ushr @vtmp[1].16b, @vtmp[1].16b, #7 + mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b + eor $des.16b, $des.16b, @vtmp[1].16b +___ + &rbit($des,$des,$std); +} + +$code=<<___; +#include "arm_arch.h" +.arch armv8-a+crypto +.text + +.type _${prefix}_consts,%object +.align 7 +_${prefix}_consts: +.Lck: + .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 + .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 + .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 + .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 + .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 + .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 + .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 + .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 +.Lfk: + .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197 +.Lshuffles: + .dword 0x0B0A090807060504,0x030201000F0E0D0C + +.size _${prefix}_consts,.-_${prefix}_consts +___ + +{{{ +my ($key,$keys,$enc)=("x0","x1","w2"); +my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); +my ($vkey,$vfk,$vmap)=("v5","v6","v7"); +$code.=<<___; +.type _${prefix}_set_key,%function +.align 4 +_${prefix}_set_key: + AARCH64_VALID_CALL_TARGET + ld1 {$vkey.4s},[$key] +___ + &load_sbox(); + &rev32($vkey,$vkey); +$code.=<<___; + adr $pointer,.Lshuffles + ld1 {$vmap.2d},[$pointer] + adr $pointer,.Lfk + ld1 {$vfk.2d},[$pointer] + eor $vkey.16b,$vkey.16b,$vfk.16b + mov $schedules,#32 + adr $pointer,.Lck + movi @vtmp[0].16b,#64 + cbnz $enc,1f + add $keys,$keys,124 +1: + mov $wtmp,$vkey.s[1] + ldr $roundkey,[$pointer],#4 + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[2] + eor $roundkey,$roundkey,$wtmp + mov $wtmp,$vkey.s[3] + eor $roundkey,$roundkey,$wtmp + // optimize sbox using AESE instruction + mov @data[0].s[0],$roundkey + tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b +___ + &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); +$code.=<<___; + eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b + aese @vtmp[0].16b,@vtmp[1].16b +___ + &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); +$code.=<<___; + mov $wtmp,@vtmp[0].s[0] + eor $roundkey,$wtmp,$wtmp,ror #19 + eor $roundkey,$roundkey,$wtmp,ror #9 + mov $wtmp,$vkey.s[0] + eor $roundkey,$roundkey,$wtmp + mov $vkey.s[0],$roundkey + cbz $enc,2f + str $roundkey,[$keys],#4 + b 3f +2: + str $roundkey,[$keys],#-4 
+3: + tbl $vkey.16b,{$vkey.16b},$vmap.16b + subs $schedules,$schedules,#1 + b.ne 1b + ret +.size _${prefix}_set_key,.-_${prefix}_set_key +___ +}}} + + +{{{ +$code.=<<___; +.type _${prefix}_enc_4blks,%function +.align 4 +_${prefix}_enc_4blks: + AARCH64_VALID_CALL_TARGET +___ + &encrypt_4blks(); +$code.=<<___; + ret +.size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks +___ +}}} + +{{{ +$code.=<<___; +.type _${prefix}_enc_8blks,%function +.align 4 +_${prefix}_enc_8blks: + AARCH64_VALID_CALL_TARGET +___ + &encrypt_8blks(); +$code.=<<___; + ret +.size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks +___ +}}} + + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_encrypt_key +.type ${prefix}_set_encrypt_key,%function +.align 5 +${prefix}_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + mov w2,1 + bl _${prefix}_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key +___ +}}} + +{{{ +my ($key,$keys)=("x0","x1"); +$code.=<<___; +.globl ${prefix}_set_decrypt_key +.type ${prefix}_set_decrypt_key,%function +.align 5 +${prefix}_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + mov w2,0 + bl _${prefix}_set_key + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key +___ +}}} + +{{{ +sub gen_block () { + my $dir = shift; + my ($inp,$outp,$rk)=map("x$_",(0..2)); + +$code.=<<___; +.globl ${prefix}_${dir}crypt +.type ${prefix}_${dir}crypt,%function +.align 5 +${prefix}_${dir}crypt: + AARCH64_VALID_CALL_TARGET + ld1 {@data[0].4s},[$inp] +___ + &load_sbox(); + &rev32(@data[0],@data[0]); +$code.=<<___; + mov $rks,$rk +___ + &encrypt_1blk(@data[0]); +$code.=<<___; + st1 {@data[0].4s},[$outp] + ret +.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt +___ +} +&gen_block("en"); +&gen_block("de"); +}}} + +{{{ +$code.=<<___; +.globl ${prefix}_ecb_encrypt +.type ${prefix}_ecb_encrypt,%function +.align 5 +${prefix}_ecb_encrypt: + AARCH64_SIGN_LINK_REGISTER + // convert length into blocks + lsr x2,x2,4 + stp d8,d9,[sp,#-80]! 
+ stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] +___ + &load_sbox(); +$code.=<<___; +.Lecb_8_blocks_process: + cmp $blocks,#8 + b.lt .Lecb_4_blocks_process + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 + ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],@datax[3]); +$code.=<<___; + bl _${prefix}_enc_8blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lecb_8_blocks_process + b 100f +.Lecb_4_blocks_process: + cmp $blocks,#4 + b.lt 1f + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _${prefix}_enc_4blks + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp] +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + st1 {@data[0].4s},[$outp] + b 100f +1: // process last 2 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 + cmp $blocks,#2 + b.gt 1f +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _${prefix}_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] + b 100f +1: // process last 3 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _${prefix}_enc_4blks + st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 + st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt +___ +}}} + +{{{ +my ($len,$ivp,$enc)=("x2","x4","w5"); +my $ivec0=("v3"); +my $ivec1=("v15"); + +$code.=<<___; +.globl ${prefix}_cbc_encrypt +.type ${prefix}_cbc_encrypt,%function +.align 5 +${prefix}_cbc_encrypt: + AARCH64_VALID_CALL_TARGET + lsr $len,$len,4 +___ + &load_sbox(); +$code.=<<___; + cbz $enc,.Ldec + ld1 {$ivec0.4s},[$ivp] +.Lcbc_4_blocks_enc: + cmp $blocks,#4 + b.lt 1f + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 + eor @data[0].16b,@data[0].16b,$ivec0.16b +___ + &rev32(@data[1],@data[1]); + &rev32(@data[0],@data[0]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &encrypt_1blk_norev(@data[0]); +$code.=<<___; + eor @data[1].16b,@data[1].16b,@data[0].16b +___ + &encrypt_1blk_norev(@data[1]); + &rev32(@data[0],@data[0]); + +$code.=<<___; + eor @data[2].16b,@data[2].16b,@data[1].16b +___ + &encrypt_1blk_norev(@data[2]); + &rev32(@data[1],@data[1]); +$code.=<<___; + eor @data[3].16b,@data[3].16b,@data[2].16b +___ + &encrypt_1blk_norev(@data[3]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + orr 
$ivec0.16b,@data[3].16b,@data[3].16b + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#4 + b.ne .Lcbc_4_blocks_enc + b 2f +1: + subs $blocks,$blocks,#1 + b.lt 2f + ld1 {@data[0].4s},[$inp],#16 + eor $ivec0.16b,$ivec0.16b,@data[0].16b +___ + &rev32($ivec0,$ivec0); + &encrypt_1blk($ivec0); +$code.=<<___; + st1 {$ivec0.4s},[$outp],#16 + b 1b +2: + // save back IV + st1 {$ivec0.4s},[$ivp] + ret + +.Ldec: + // decryption mode starts + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] +.Lcbc_8_blocks_dec: + cmp $blocks,#8 + b.lt 1f + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] + add $ptr,$inp,#64 + ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr] +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],$data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],$datax[3]); +$code.=<<___; + bl _${prefix}_enc_8blks +___ + &transpose(@vtmp,@datax); + &transpose(@data,@datax); +$code.=<<___; + ld1 {$ivec1.4s},[$ivp] + ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 + // note ivec1 and vtmpx[3] are resuing the same register + // care needs to be taken to avoid conflict + eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b + ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 + eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b + eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b + eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b + // save back IV + st1 {$vtmpx[3].4s}, [$ivp] + eor @data[0].16b,@data[0].16b,$datax[3].16b + eor @data[1].16b,@data[1].16b,@vtmpx[0].16b + eor @data[2].16b,@data[2].16b,@vtmpx[1].16b + eor @data[3].16b,$data[3].16b,@vtmpx[2].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lcbc_8_blocks_dec + b.eq 100f +1: + ld1 {$ivec1.4s},[$ivp] +.Lcbc_4_blocks_dec: + cmp $blocks,#4 + b.lt 1f + ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],$data[3]); +$code.=<<___; + bl _${prefix}_enc_4blks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &transpose(@vtmp,@datax); +$code.=<<___; + eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b + eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b + orr $ivec1.16b,@data[3].16b,@data[3].16b + eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b + eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + subs $blocks,$blocks,#4 + b.gt .Lcbc_4_blocks_dec + // save back IV + st1 {@data[3].4s}, [$ivp] + b 100f +1: // last block + subs $blocks,$blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp],#16 + // save back IV + st1 {$data[0].4s}, [$ivp] +___ + &rev32(@datax[0],@data[0]); + &encrypt_1blk(@datax[0]); +$code.=<<___; + eor @datax[0].16b,@datax[0].16b,$ivec1.16b + st1 {@datax[0].4s},[$outp],#16 + b 100f +1: // last two blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp] + add $ptr,$inp,#16 + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16 + subs $blocks,$blocks,1 + b.gt 1f +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _${prefix}_enc_4blks + 
ld1 {@data[0].4s,@data[1].4s},[$inp],#32 +___ + &transpose(@vtmp,@datax); +$code.=<<___; + eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b + eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save back IV + st1 {@data[1].4s}, [$ivp] + b 100f +1: // last 3 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr] +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); +$code.=<<___; + bl _${prefix}_enc_4blks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 +___ + &transpose(@vtmp,@datax); +$code.=<<___; + eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b + eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b + eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 + // save back IV + st1 {@data[2].4s}, [$ivp] +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt +___ +}}} + +{{{ +my ($ivp)=("x4"); +my ($ctr)=("w5"); +my $ivec=("v3"); + +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: + AARCH64_VALID_CALL_TARGET + ld1 {$ivec.4s},[$ivp] +___ + &rev32($ivec,$ivec); + &load_sbox(); +$code.=<<___; + cmp $blocks,#1 + b.ne 1f + // fast processing for one single block without + // context saving overhead +___ + &encrypt_1blk($ivec); +$code.=<<___; + ld1 {@data[0].4s},[$inp] + eor @data[0].16b,@data[0].16b,$ivec.16b + st1 {@data[0].4s},[$outp] + ret +1: + AARCH64_SIGN_LINK_REGISTER + stp d8,d9,[sp,#-80]! + stp d10,d11,[sp,#16] + stp d12,d13,[sp,#32] + stp d14,d15,[sp,#48] + stp x29,x30,[sp,#64] + mov $word0,$ivec.s[0] + mov $word1,$ivec.s[1] + mov $word2,$ivec.s[2] + mov $ctr,$ivec.s[3] +.Lctr32_4_blocks_process: + cmp $blocks,#4 + b.lt 1f + dup @data[0].4s,$word0 + dup @data[1].4s,$word1 + dup @data[2].4s,$word2 + mov @data[3].s[0],$ctr + add $ctr,$ctr,#1 + mov $data[3].s[1],$ctr + add $ctr,$ctr,#1 + mov @data[3].s[2],$ctr + add $ctr,$ctr,#1 + mov @data[3].s[3],$ctr + add $ctr,$ctr,#1 + cmp $blocks,#8 + b.ge .Lctr32_8_blocks_process + bl _${prefix}_enc_4blks + ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + subs $blocks,$blocks,#4 + b.ne .Lctr32_4_blocks_process + b 100f +.Lctr32_8_blocks_process: + dup @datax[0].4s,$word0 + dup @datax[1].4s,$word1 + dup @datax[2].4s,$word2 + mov @datax[3].s[0],$ctr + add $ctr,$ctr,#1 + mov $datax[3].s[1],$ctr + add $ctr,$ctr,#1 + mov @datax[3].s[2],$ctr + add $ctr,$ctr,#1 + mov @datax[3].s[3],$ctr + add $ctr,$ctr,#1 + bl _${prefix}_enc_8blks + ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 + ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + eor @data[0].16b,@data[0].16b,@datax[0].16b + eor @data[1].16b,@data[1].16b,@datax[1].16b + eor @data[2].16b,@data[2].16b,@datax[2].16b + eor @data[3].16b,@data[3].16b,@datax[3].16b + st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 
+ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.ne .Lctr32_4_blocks_process + b 100f +1: // last block processing + subs $blocks,$blocks,#1 + b.lt 100f + b.gt 1f + mov $ivec.s[0],$word0 + mov $ivec.s[1],$word1 + mov $ivec.s[2],$word2 + mov $ivec.s[3],$ctr +___ + &encrypt_1blk($ivec); +$code.=<<___; + ld1 {@data[0].4s},[$inp] + eor @data[0].16b,@data[0].16b,$ivec.16b + st1 {@data[0].4s},[$outp] + b 100f +1: // last 2 blocks processing + dup @data[0].4s,$word0 + dup @data[1].4s,$word1 + dup @data[2].4s,$word2 + mov @data[3].s[0],$ctr + add $ctr,$ctr,#1 + mov @data[3].s[1],$ctr + subs $blocks,$blocks,#1 + b.ne 1f + bl _${prefix}_enc_4blks + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 + b 100f +1: // last 3 blocks processing + add $ctr,$ctr,#1 + mov @data[3].s[2],$ctr + bl _${prefix}_enc_4blks + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 + ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16 + eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b + eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b + eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b + eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 + st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16 +100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] + ldp d14,d15,[sp,#48] + ldp x29,x30,[sp,#64] + ldp d8,d9,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} + + +{{{ +my ($blocks,$len)=("x2","x2"); +my $ivp=("x5"); +my @twx=map("x$_",(12..27)); +my ($rks1,$rks2)=("x26","x27"); +my $lastBlk=("x26"); +my $enc=("w28"); +my $remain=("x29"); + +my @tweak=map("v$_",(16..23)); +my $lastTweak=("v25"); + +sub gen_xts_cipher() { + my $std = shift; +$code.=<<___; +.globl ${prefix}_xts_encrypt${std} +.type ${prefix}_xts_encrypt${std},%function +.align 5 +${prefix}_xts_encrypt${std}: + AARCH64_SIGN_LINK_REGISTER + stp x15, x16, [sp, #-0x10]! + stp x17, x18, [sp, #-0x10]! + stp x19, x20, [sp, #-0x10]! + stp x21, x22, [sp, #-0x10]! + stp x23, x24, [sp, #-0x10]! + stp x25, x26, [sp, #-0x10]! + stp x27, x28, [sp, #-0x10]! + stp x29, x30, [sp, #-0x10]! + stp d8, d9, [sp, #-0x10]! + stp d10, d11, [sp, #-0x10]! + stp d12, d13, [sp, #-0x10]! + stp d14, d15, [sp, #-0x10]! 
+ mov $rks1,x3 + mov $rks2,x4 + mov $enc,w6 + ld1 {@tweak[0].4s}, [$ivp] + mov $rks,$rks2 +___ + &load_sbox(); + &rev32(@tweak[0],@tweak[0]); + &encrypt_1blk(@tweak[0]); +$code.=<<___; + mov $rks,$rks1 + and $remain,$len,#0x0F + // convert length into blocks + lsr $blocks,$len,4 + cmp $blocks,#1 + b.lt .return${std} + + cmp $remain,0 + // If the encryption/decryption Length is N times of 16, + // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std} + b.eq .xts_encrypt_blocks${std} + + // If the encryption/decryption length is not N times of 16, + // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std} + // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std} + subs $blocks,$blocks,#1 + b.eq .only_2blks_tweak${std} +.xts_encrypt_blocks${std}: +___ + &rbit(@tweak[0],@tweak[0],$std); + &rev32_armeb(@tweak[0],@tweak[0]); + &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; +.Lxts_8_blocks_process${std}: + cmp $blocks,#8 +___ + &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); + &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); + &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); + &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); + &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); + &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); + &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); + &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); + &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); + &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); + &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); + &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); + &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); + &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); + &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); + &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); +$code.=<<___; + b.lt .Lxts_4_blocks_process${std} + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); + &rbit(@tweak[2],@tweak[2],$std); + &rbit(@tweak[3],@tweak[3],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b + ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 +___ + &rbit(@tweak[4],@tweak[4],$std); + &rbit(@tweak[5],@tweak[5],$std); + &rbit(@tweak[6],@tweak[6],$std); + &rbit(@tweak[7],@tweak[7],$std); +$code.=<<___; + eor @datax[0].16b, @datax[0].16b, @tweak[4].16b + eor @datax[1].16b, @datax[1].16b, @tweak[5].16b + eor @datax[2].16b, @datax[2].16b, @tweak[6].16b + eor @datax[3].16b, @datax[3].16b, @tweak[7].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &rev32(@datax[0],@datax[0]); + &rev32(@datax[1],@datax[1]); + &rev32(@datax[2],@datax[2]); + &rev32(@datax[3],@datax[3]); + &transpose(@data,@vtmp); + &transpose(@datax,@vtmp); +$code.=<<___; + bl _${prefix}_enc_8blks +___ + &transpose(@vtmp,@datax); + &transpose(@data,@datax); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor 
@vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + eor @data[0].16b, @data[0].16b, @tweak[4].16b + eor @data[1].16b, @data[1].16b, @tweak[5].16b + eor @data[2].16b, @data[2].16b, @tweak[6].16b + eor @data[3].16b, @data[3].16b, @tweak[7].16b + + // save the last tweak + mov $lastTweak.16b,@tweak[7].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 + subs $blocks,$blocks,#8 + b.gt .Lxts_8_blocks_process${std} + b 100f +.Lxts_4_blocks_process${std}: + cmp $blocks,#4 + b.lt 1f + ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); + &rbit(@tweak[2],@tweak[2],$std); + &rbit(@tweak[3],@tweak[3],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b + eor @data[3].16b, @data[3].16b, @tweak[3].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &rev32(@data[3],@data[3]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b + eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 + sub $blocks,$blocks,#4 + mov @tweak[0].16b,@tweak[4].16b + mov @tweak[1].16b,@tweak[5].16b + mov @tweak[2].16b,@tweak[6].16b + // save the last tweak + mov $lastTweak.16b,@tweak[3].16b +1: + // process last block + cmp $blocks,#1 + b.lt 100f + b.gt 1f + ld1 {@data[0].4s},[$inp],#16 +___ + &rbit(@tweak[0],@tweak[0],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + st1 {@data[0].4s},[$outp],#16 + // save the last tweak + mov $lastTweak.16b,@tweak[0].16b + b 100f +1: // process last 2 blocks + cmp $blocks,#2 + b.gt 1f + ld1 {@data[0].4s,@data[1].4s},[$inp],#32 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save the last tweak + mov $lastTweak.16b,@tweak[1].16b + b 100f +1: // process last 3 blocks + ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 +___ + &rbit(@tweak[0],@tweak[0],$std); + &rbit(@tweak[1],@tweak[1],$std); + &rbit(@tweak[2],@tweak[2],$std); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[0].16b + eor @data[1].16b, @data[1].16b, @tweak[1].16b + eor @data[2].16b, @data[2].16b, @tweak[2].16b +___ + &rev32(@data[0],@data[0]); + &rev32(@data[1],@data[1]); + &rev32(@data[2],@data[2]); + &transpose(@data,@vtmp); +$code.=<<___; + bl _${prefix}_enc_4blks +___ + &transpose(@vtmp,@data); +$code.=<<___; + eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b + eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b + eor @vtmp[2].16b, @vtmp[2].16b, 
@tweak[2].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 + // save the last tweak + mov $lastTweak.16b,@tweak[2].16b +100: + cmp $remain,0 + b.eq .return${std} + +// This brance calculates the last two tweaks, +// while the encryption/decryption length is larger than 32 +.last_2blks_tweak${std}: +___ + &rev32_armeb($lastTweak,$lastTweak); + &compute_tweak_vec($lastTweak,@tweak[1],$std); + &compute_tweak_vec(@tweak[1],@tweak[2],$std); +$code.=<<___; + b .check_dec${std} + + +// This brance calculates the last two tweaks, +// while the encryption/decryption length is equal to 32, who only need two tweaks +.only_2blks_tweak${std}: + mov @tweak[1].16b,@tweak[0].16b +___ + &rev32_armeb(@tweak[1],@tweak[1]); + &compute_tweak_vec(@tweak[1],@tweak[2]); +$code.=<<___; + b .check_dec${std} + + +// Determine whether encryption or decryption is required. +// The last two tweaks need to be swapped for decryption. +.check_dec${std}: + // encryption:1 decryption:0 + cmp $enc,1 + b.eq .prcess_last_2blks${std} + mov @vtmp[0].16B,@tweak[1].16b + mov @tweak[1].16B,@tweak[2].16b + mov @tweak[2].16B,@vtmp[0].16b + +.prcess_last_2blks${std}: +___ + &rev32_armeb(@tweak[1],@tweak[1]); + &rev32_armeb(@tweak[2],@tweak[2]); +$code.=<<___; + ld1 {@data[0].4s},[$inp],#16 + eor @data[0].16b, @data[0].16b, @tweak[1].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[1].16b + st1 {@data[0].4s},[$outp],#16 + + sub $lastBlk,$outp,16 + .loop${std}: + subs $remain,$remain,1 + ldrb $wtmp0,[$lastBlk,$remain] + ldrb $wtmp1,[$inp,$remain] + strb $wtmp1,[$lastBlk,$remain] + strb $wtmp0,[$outp,$remain] + b.gt .loop${std} + ld1 {@data[0].4s}, [$lastBlk] + eor @data[0].16b, @data[0].16b, @tweak[2].16b +___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); +$code.=<<___; + eor @data[0].16b, @data[0].16b, @tweak[2].16b + st1 {@data[0].4s}, [$lastBlk] +.return${std}: + ldp d14, d15, [sp], #0x10 + ldp d12, d13, [sp], #0x10 + ldp d10, d11, [sp], #0x10 + ldp d8, d9, [sp], #0x10 + ldp x29, x30, [sp], #0x10 + ldp x27, x28, [sp], #0x10 + ldp x25, x26, [sp], #0x10 + ldp x23, x24, [sp], #0x10 + ldp x21, x22, [sp], #0x10 + ldp x19, x20, [sp], #0x10 + ldp x17, x18, [sp], #0x10 + ldp x15, x16, [sp], #0x10 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std} +___ +} # end of gen_xts_cipher +&gen_xts_cipher("_gb"); +&gen_xts_cipher(""); +}}} + +######################################## +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + +foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + print $_,"\n"; +} + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info index 75a215ab80..73ffe5ea09 100644 --- a/crypto/sm4/build.info +++ b/crypto/sm4/build.info @@ -2,7 +2,7 @@ LIBS=../../libcrypto IF[{- !$disabled{asm} -}] $SM4DEF_aarch64=SM4_ASM VPSM4_ASM - $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S + $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S vpsm4_ex-armv8.S # Now that we have defined all the arch specific variables, use the # appropriate one, and define the appropriate macros @@ -30,5 +30,7 @@ ENDIF GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl +GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl INCLUDE[sm4-armv8.o]=.. INCLUDE[vpsm4-armv8.o]=.. +INCLUDE[vpsm4_ex-armv8.o]=.. 
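The remaining hunks wire the new vpsm4_ex implementation into the build and the provider layer: sm4_platform.h splits CPU detection so that the AESE-based path is selected on HiSilicon KP920 while the existing vpsm4 ASIMD path keeps the ARM V1/N1 parts, cipher_sm4_hw.c adds the corresponding dispatch branches, and the XTS stream callback gains a const int enc argument, which the assembly's .check_dec path uses to swap the last two tweaks when decrypting with ciphertext stealing. A condensed, illustrative sketch of the resulting backend selection order (macro names are from sm4_platform.h; the ordering assumes the HWSM4 branch precedes the new ones, as the #ifdef chain in cipher_sm4_hw.c suggests, and the returned strings are placeholders, not part of the patch):

/*
 * Illustrative only: report which SM4 backend the provider's key-init code
 * would pick on this build/CPU, following the #ifdef/if chain added to
 * cipher_sm4_hw.c.
 */
static const char *pick_sm4_backend(void)
{
#ifdef HWSM4_CAPABLE
    if (HWSM4_CAPABLE)
        return "sm4_v8 (dedicated SM4 instructions)";
#endif
#ifdef VPSM4_EX_CAPABLE
    if (VPSM4_EX_CAPABLE)        /* AESE-based path, e.g. HiSilicon KP920 */
        return "vpsm4_ex";
#endif
#ifdef VPSM4_CAPABLE
    if (VPSM4_CAPABLE)           /* plain ASIMD path, e.g. ARM V1/N1 */
        return "vpsm4";
#endif
    return "generic C implementation";
}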
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h index 15d8abbcb1..8b9cd10f97 100644 --- a/include/crypto/sm4_platform.h +++ b/include/crypto/sm4_platform.h @@ -20,11 +20,16 @@ static inline int vpsm4_capable(void) { return (OPENSSL_armcap_P & ARMV8_CPUID) && (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) || - MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1) || - MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920)); + MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)); +} +static inline int vpsm4_ex_capable(void) +{ + return (OPENSSL_armcap_P & ARMV8_CPUID) && + (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920)); } # if defined(VPSM4_ASM) # define VPSM4_CAPABLE vpsm4_capable() +# define VPSM4_EX_CAPABLE vpsm4_ex_capable() # endif # define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4) # define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key @@ -56,7 +61,7 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, const unsigned char ivec[16]); # endif /* HWSM4_CAPABLE */ -#ifdef VPSM4_CAPABLE +# ifdef VPSM4_CAPABLE int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); void vpsm4_encrypt(const unsigned char *in, unsigned char *out, @@ -72,7 +77,37 @@ void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out, void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, size_t len, const void *key, const unsigned char ivec[16]); +void vpsm4_xts_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char ivec[16], const int enc); +void vpsm4_xts_encrypt_gb(const unsigned char *in, unsigned char *out, + size_t len, const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char ivec[16], const int enc); # endif /* VPSM4_CAPABLE */ +# ifdef VPSM4_EX_CAPABLE +int vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); +int vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); +void vpsm4_ex_encrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void vpsm4_ex_decrypt(const unsigned char *in, unsigned char *out, + const SM4_KEY *key); +void vpsm4_ex_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + unsigned char *ivec, const int enc); +void vpsm4_ex_ecb_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const SM4_KEY *key, + const int enc); +void vpsm4_ex_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + const unsigned char ivec[16]); +void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const SM4_KEY *key1, const SM4_KEY *key2, + const unsigned char ivec[16], const int enc); +void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, + size_t len, const SM4_KEY *key1, + const SM4_KEY *key2, const unsigned char ivec[16], + const int enc); +# endif /* VPSM4_EX_CAPABLE */ #endif /* OSSL_SM4_PLATFORM_H */ diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c index 9a2e99f67c..8cabd78266 100644 --- a/providers/implementations/ciphers/cipher_sm4_hw.c +++ b/providers/implementations/ciphers/cipher_sm4_hw.c @@ -42,6 +42,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, (void)0; /* terminate 
potentially open 'else' */ } else #endif +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_encrypt_key(key, ks); + ctx->block = (block128_f)vpsm4_ex_encrypt; + ctx->stream.cbc = NULL; + if (ctx->mode == EVP_CIPH_CBC_MODE) + ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt; + else if (ctx->mode == EVP_CIPH_ECB_MODE) + ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt; + else if (ctx->mode == EVP_CIPH_CTR_MODE) + ctx->stream.ctr = (ctr128_f)vpsm4_ex_ctr32_encrypt_blocks; + } else +#endif #ifdef VPSM4_CAPABLE if (VPSM4_CAPABLE) { vpsm4_set_encrypt_key(key, ks); @@ -75,6 +88,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, #endif } else #endif +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + vpsm4_ex_set_decrypt_key(key, ks); + ctx->block = (block128_f)vpsm4_ex_decrypt; + ctx->stream.cbc = NULL; + if (ctx->mode == EVP_CIPH_CBC_MODE) + ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt; + else if (ctx->mode == EVP_CIPH_ECB_MODE) + ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt; + } else +#endif #ifdef VPSM4_CAPABLE if (VPSM4_CAPABLE) { vpsm4_set_decrypt_key(key, ks); @@ -82,7 +106,7 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, ctx->stream.cbc = NULL; if (ctx->mode == EVP_CIPH_CBC_MODE) ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt; - else if (ctx->mode == EVP_CIPH_ECB_MODE) + else if (ctx->mode == EVP_CIPH_ECB_MODE) ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt; } else #endif diff --git a/providers/implementations/ciphers/cipher_sm4_xts.c b/providers/implementations/ciphers/cipher_sm4_xts.c index 3c568d4d18..037055fce8 100644 --- a/providers/implementations/ciphers/cipher_sm4_xts.c +++ b/providers/implementations/ciphers/cipher_sm4_xts.c @@ -145,14 +145,14 @@ static int sm4_xts_cipher(void *vctx, unsigned char *out, size_t *outl, if (ctx->xts_standard) { if (ctx->stream != NULL) (*ctx->stream)(in, out, inl, ctx->xts.key1, ctx->xts.key2, - ctx->base.iv); + ctx->base.iv, ctx->base.enc); else if (CRYPTO_xts128_encrypt(&ctx->xts, ctx->base.iv, in, out, inl, ctx->base.enc)) return 0; } else { if (ctx->stream_gb != NULL) (*ctx->stream_gb)(in, out, inl, ctx->xts.key1, ctx->xts.key2, - ctx->base.iv); + ctx->base.iv, ctx->base.enc); else if (ossl_crypto_xts128gb_encrypt(&ctx->xts, ctx->base.iv, in, out, inl, ctx->base.enc)) return 0; diff --git a/providers/implementations/ciphers/cipher_sm4_xts.h b/providers/implementations/ciphers/cipher_sm4_xts.h index 4c369183e2..cfca596979 100644 --- a/providers/implementations/ciphers/cipher_sm4_xts.h +++ b/providers/implementations/ciphers/cipher_sm4_xts.h @@ -14,7 +14,7 @@ PROV_CIPHER_FUNC(void, xts_stream, (const unsigned char *in, unsigned char *out, size_t len, const SM4_KEY *key1, const SM4_KEY *key2, - const unsigned char iv[16])); + const unsigned char iv[16], const int enc)); typedef struct prov_sm4_xts_ctx_st { /* Must be first */ diff --git a/providers/implementations/ciphers/cipher_sm4_xts_hw.c b/providers/implementations/ciphers/cipher_sm4_xts_hw.c index 403eb879b1..67a9923d94 100644 --- a/providers/implementations/ciphers/cipher_sm4_xts_hw.c +++ b/providers/implementations/ciphers/cipher_sm4_xts_hw.c @@ -11,8 +11,7 @@ #define XTS_SET_KEY_FN(fn_set_enc_key, fn_set_dec_key, \ fn_block_enc, fn_block_dec, \ - fn_stream_enc, fn_stream_dec, \ - fn_stream_gb_enc, fn_stream_gb_dec) { \ + fn_stream, fn_stream_gb) { \ size_t bytes = keylen / 2; \ \ if (ctx->enc) { \ @@ -26,8 +25,8 @@ xctx->xts.block2 = (block128_f)fn_block_enc; \ xctx->xts.key1 = &xctx->ks1; \ xctx->xts.key2 = &xctx->ks2; \ - xctx->stream = ctx->enc ? 
fn_stream_enc : fn_stream_dec; \ - xctx->stream_gb = ctx->enc ? fn_stream_gb_enc : fn_stream_gb_dec; \ + xctx->stream = fn_stream; \ + xctx->stream_gb = fn_stream_gb; \ } static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx, @@ -35,23 +34,30 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx, size_t keylen) { PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)ctx; - OSSL_xts_stream_fn stream_enc = NULL; - OSSL_xts_stream_fn stream_dec = NULL; - OSSL_xts_stream_fn stream_gb_enc = NULL; - OSSL_xts_stream_fn stream_gb_dec = NULL; + OSSL_xts_stream_fn stream = NULL; + OSSL_xts_stream_fn stream_gb = NULL; #ifdef HWSM4_CAPABLE if (HWSM4_CAPABLE) { XTS_SET_KEY_FN(HWSM4_set_encrypt_key, HWSM4_set_decrypt_key, - HWSM4_encrypt, HWSM4_decrypt, stream_enc, stream_dec, - stream_gb_enc, stream_gb_dec); + HWSM4_encrypt, HWSM4_decrypt, stream, stream_gb); return 1; } else #endif /* HWSM4_CAPABLE */ +#ifdef VPSM4_EX_CAPABLE + if (VPSM4_EX_CAPABLE) { + stream = vpsm4_ex_xts_encrypt; + stream_gb = vpsm4_ex_xts_encrypt_gb; + XTS_SET_KEY_FN(vpsm4_ex_set_encrypt_key, vpsm4_ex_set_decrypt_key, + vpsm4_ex_encrypt, vpsm4_ex_decrypt, stream, stream_gb); + return 1; + } else +#endif /* VPSM4_EX_CAPABLE */ #ifdef VPSM4_CAPABLE if (VPSM4_CAPABLE) { + stream = vpsm4_xts_encrypt; + stream_gb = vpsm4_xts_encrypt_gb; XTS_SET_KEY_FN(vpsm4_set_encrypt_key, vpsm4_set_decrypt_key, - vpsm4_encrypt, vpsm4_decrypt, stream_enc, stream_dec, - stream_gb_enc, stream_gb_dec); + vpsm4_encrypt, vpsm4_decrypt, stream, stream_gb); return 1; } else #endif /* VPSM4_CAPABLE */ @@ -60,8 +66,7 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx, } { XTS_SET_KEY_FN(ossl_sm4_set_key, ossl_sm4_set_key, ossl_sm4_encrypt, - ossl_sm4_decrypt, stream_enc, stream_dec, stream_gb_enc, - stream_gb_dec); + ossl_sm4_decrypt, stream, stream_gb); } return 1; } -- 2.37.3.windows.1
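On the provider side, the XTS stream callback in cipher_sm4_xts.h now carries the direction flag directly (const int enc), so cipher_sm4_xts_hw.c keeps a single stream/stream_gb pointer per context instead of separate encrypt and decrypt pointers. The fragment below is a condensed, hypothetical illustration of that dispatch shape, not the provider's actual code; the struct and type names are stand-ins.

#include <stddef.h>

/*
 * Sketch of the single-callback dispatch introduced by this patch:
 * one XTS stream function serves both directions because the 'enc'
 * flag is passed through at call time.
 */
typedef void (*xts_stream_fn)(const unsigned char *in, unsigned char *out,
                              size_t len, const void *key1, const void *key2,
                              const unsigned char iv[16], const int enc);

struct xts_ctx_sketch {
    xts_stream_fn stream;   /* the GB/T 17964 variant would sit alongside */
    int enc;                /* 1 = encrypt, 0 = decrypt */
};

int xts_cipher_sketch(struct xts_ctx_sketch *ctx,
                      const unsigned char *in, unsigned char *out,
                      size_t len, const void *k1, const void *k2,
                      const unsigned char iv[16])
{
    if (ctx->stream == NULL)
        return 0;                        /* caller falls back to the generic path */
    /* direction comes from the flag, not from which pointer was installed */
    ctx->stream(in, out, len, k1, k2, iv, ctx->enc);
    return 1;
}

In the assembly tail shown earlier, enc is consulted in the .check_dec path only to decide whether the last two tweaks must be swapped for decryption during ciphertext stealing; the bulk direction is otherwise determined by the key schedule passed in.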