File _service:tar_scm:8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch of Package openjdk-1.8.0
From 02b097417275acaad294d71a852c2def2222be25 Mon Sep 17 00:00:00 2001 From: kuenking111 <wangkun49@huawei.com> Date: Sat, 3 Sep 2022 14:17:50 +0000 Subject: [PATCH 1/6] 8143925-enhancing-CounterMode.crypt-for-AESCrypt --- .../src/cpu/aarch64/vm/assembler_aarch64.hpp | 35 +- .../cpu/aarch64/vm/macroAssembler_aarch64.hpp | 17 + .../aarch64/vm/macroAssembler_aarch64_aes.cpp | 685 ++++++++++++++++++ .../cpu/aarch64/vm/stubGenerator_aarch64.cpp | 324 ++++++++- .../cpu/aarch64/vm/stubRoutines_aarch64.hpp | 2 +- .../src/cpu/aarch64/vm/vm_version_aarch64.cpp | 13 +- hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp | 5 + hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp | 5 + hotspot/src/cpu/x86/vm/assembler_x86.cpp | 74 +- hotspot/src/cpu/x86/vm/assembler_x86.hpp | 12 + .../src/cpu/x86/vm/stubGenerator_x86_32.cpp | 344 +++++++++ .../src/cpu/x86/vm/stubGenerator_x86_64.cpp | 340 ++++++++- hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp | 1 + hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp | 5 + .../src/cpu/x86/vm/stubRoutines_x86_32.hpp | 2 +- .../src/cpu/x86/vm/stubRoutines_x86_64.hpp | 2 +- hotspot/src/cpu/x86/vm/vm_version_x86.cpp | 36 + hotspot/src/share/vm/classfile/vmSymbols.hpp | 4 + hotspot/src/share/vm/opto/escape.cpp | 1 + hotspot/src/share/vm/opto/library_call.cpp | 174 +++++ hotspot/src/share/vm/opto/runtime.cpp | 29 + hotspot/src/share/vm/opto/runtime.hpp | 1 + hotspot/src/share/vm/runtime/globals.hpp | 3 + hotspot/src/share/vm/runtime/stubRoutines.cpp | 1 + hotspot/src/share/vm/runtime/stubRoutines.hpp | 2 + hotspot/src/share/vm/runtime/vmStructs.cpp | 1 + .../test/compiler/7184394/TestAESBase.java | 4 +- .../test/compiler/7184394/TestAESMain.java | 7 + .../com/sun/crypto/provider/CounterMode.java | 11 +- .../classes/com/sun/crypto/provider/GCTR.java | 89 +-- .../com/sun/crypto/provider/GHASH.java | 20 +- .../sun/security/ssl/SSLSocketImpl.java | 14 +- .../security/ssl/SSLSocketInputRecord.java | 215 +++--- .../sun/security/ssl/SSLTransport.java | 4 + .../bench/javax/crypto/full/AESGCMBench.java | 128 ++++ .../javax/crypto/full/AESGCMByteBuffer.java | 163 +++++ .../bench/javax/crypto/full/CryptoBase.java | 102 +++ .../bench/javax/crypto/small/AESGCMBench.java | 36 + .../javax/crypto/small/AESGCMByteBuffer.java | 36 + .../ssl/SSLSocketImpl/ClientTimeout.java | 3 +- .../SSLSocketImpl/SSLExceptionForIOIssue.java | 4 +- 41 files changed, 2738 insertions(+), 216 deletions(-) create mode 100644 hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java diff --git a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp index b0fa9b5fc..9202e61f8 100644 --- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp +++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp @@ -146,6 +146,21 @@ REGISTER_DECLARATION(Register, esp, r20); #define assert_cond(ARG1) assert(ARG1, #ARG1) +// In many places we've added C-style casts to silence compiler +// warnings, for example when truncating a size_t to an int when we +// know the size_t is a small struct. Such casts are risky because +// they effectively disable useful compiler warnings. 
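
// A standalone sketch (not part of the patch) of the hazard this
// comment describes: on LP64 targets a plain C-style cast from a
// 64-bit value to int compiles cleanly and silently drops the high
// bits.  The checked_cast template defined just below turns that
// silent truncation into an assertion failure in debug builds.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t big = 0x100000001ULL; // needs 33 bits
  int narrowed = (int)big;       // no warning with a C-style cast
  std::printf("%d\n", narrowed); // prints 1: the high bits are gone
  return 0;
}
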
We can make our +// lives safer with this function, which ensures that any cast is +// reversible without loss of information. It doesn't check +// everything: it isn't intended to make sure that pointer types are +// compatible, for example. +template <typename T2, typename T1> +T2 checked_cast(T1 thing) { + T2 result = static_cast<T2>(thing); + assert(static_cast<T1>(result) == thing, "must be"); + return result; +} + namespace asm_util { uint32_t encode_logical_immediate(bool is32, uint64_t imm); }; @@ -193,7 +208,7 @@ public: static inline uint32_t extract(uint32_t val, int msb, int lsb) { int nbits = msb - lsb + 1; assert_cond(msb >= lsb); - uint32_t mask = (1U << nbits) - 1; + uint32_t mask = checked_cast<uint32_t>(right_n_bits(nbits)); uint32_t result = val >> lsb; result &= mask; return result; @@ -208,7 +223,7 @@ public: int nbits = msb - lsb + 1; guarantee(val < (1U << nbits), "Field too big for insn"); assert_cond(msb >= lsb); - unsigned mask = (1U << nbits) - 1; + unsigned mask = checked_cast<unsigned>(right_n_bits(nbits)); val <<= lsb; mask <<= lsb; unsigned target = *(unsigned *)a; @@ -222,7 +237,7 @@ public: long chk = val >> (nbits - 1); guarantee (chk == -1 || chk == 0, "Field too big for insn"); unsigned uval = val; - unsigned mask = (1U << nbits) - 1; + unsigned mask = checked_cast<unsigned>(right_n_bits(nbits)); uval &= mask; uval <<= lsb; mask <<= lsb; @@ -234,9 +249,9 @@ public: void f(unsigned val, int msb, int lsb) { int nbits = msb - lsb + 1; - guarantee(val < (1U << nbits), "Field too big for insn"); + guarantee(val < (1ULL << nbits), "Field too big for insn"); assert_cond(msb >= lsb); - unsigned mask = (1U << nbits) - 1; + unsigned mask = checked_cast<unsigned>(right_n_bits(nbits)); val <<= lsb; mask <<= lsb; insn |= val; @@ -255,7 +270,7 @@ public: long chk = val >> (nbits - 1); guarantee (chk == -1 || chk == 0, "Field too big for insn"); unsigned uval = val; - unsigned mask = (1U << nbits) - 1; + unsigned mask = checked_cast<unsigned>(right_n_bits(nbits)); uval &= mask; f(uval, lsb + nbits - 1, lsb); } @@ -280,7 +295,7 @@ public: unsigned get(int msb = 31, int lsb = 0) { int nbits = msb - lsb + 1; - unsigned mask = ((1U << nbits) - 1) << lsb; + unsigned mask = checked_cast<unsigned>(right_n_bits(nbits)) << lsb; assert_cond((bits & mask) == mask); return (insn & mask) >> lsb; } @@ -1991,21 +2006,21 @@ public: starti; f(0,31), f((int)T & 1, 30); f(op1, 29, 21), f(0, 20, 16), f(op2, 15, 12); - f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0); + f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0); } void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int imm, int op1, int op2) { starti; f(0,31), f((int)T & 1, 30); f(op1 | 0b100, 29, 21), f(0b11111, 20, 16), f(op2, 15, 12); - f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0); + f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0); } void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn, Register Xm, int op1, int op2) { starti; f(0,31), f((int)T & 1, 30); f(op1 | 0b100, 29, 21), rf(Xm, 16), f(op2, 15, 12); - f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0); + f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0); } void ld_st(FloatRegister Vt, SIMD_Arrangement T, Address a, int op1, int op2) { diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp index 0ca694038..d334f1b69 100644 --- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp @@ -1240,6 +1240,23 @@ public: void multiply_to_len(Register x, 
Register xlen, Register y, Register ylen, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6, Register tmp7); + void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, + FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3); + void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, + FloatRegister p, FloatRegister z, FloatRegister t1); + void ghash_processBlocks_wide(address p, Register state, Register subkeyH, + Register data, Register blocks, int unrolls); + void ghash_modmul (FloatRegister result, + FloatRegister result_lo, FloatRegister result_hi, FloatRegister b, + FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p, + FloatRegister t1, FloatRegister t2, FloatRegister t3); + + void aesenc_loadkeys(Register key, Register keylen); + void aesecb_encrypt(Register from, Register to, Register keylen, + FloatRegister data = v0, int unrolls = 1); + void aesecb_decrypt(Register from, Register to, Register key, Register keylen); + void aes_round(FloatRegister input, FloatRegister subkey); // ISB may be needed because of a safepoint void maybe_isb() { isb(); } diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp new file mode 100644 index 000000000..1db79c97a --- /dev/null +++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "precompiled.hpp" + +#include "asm/assembler.hpp" +#include "asm/assembler.inline.hpp" +#include "macroAssembler_aarch64.hpp" +#include "memory/resourceArea.hpp" +#include "runtime/stubRoutines.hpp" + +void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) { + Label L_doLast; + + ld1(v0, T16B, from); // get 16 bytes of input + + ld1(v5, T16B, post(key, 16)); + rev32(v5, T16B, v5); + + ld1(v1, v2, v3, v4, T16B, post(key, 64)); + rev32(v1, T16B, v1); + rev32(v2, T16B, v2); + rev32(v3, T16B, v3); + rev32(v4, T16B, v4); + aesd(v0, v1); + aesimc(v0, v0); + aesd(v0, v2); + aesimc(v0, v0); + aesd(v0, v3); + aesimc(v0, v0); + aesd(v0, v4); + aesimc(v0, v0); + + ld1(v1, v2, v3, v4, T16B, post(key, 64)); + rev32(v1, T16B, v1); + rev32(v2, T16B, v2); + rev32(v3, T16B, v3); + rev32(v4, T16B, v4); + aesd(v0, v1); + aesimc(v0, v0); + aesd(v0, v2); + aesimc(v0, v0); + aesd(v0, v3); + aesimc(v0, v0); + aesd(v0, v4); + aesimc(v0, v0); + + ld1(v1, v2, T16B, post(key, 32)); + rev32(v1, T16B, v1); + rev32(v2, T16B, v2); + + cmpw(keylen, 44); + br(Assembler::EQ, L_doLast); + + aesd(v0, v1); + aesimc(v0, v0); + aesd(v0, v2); + aesimc(v0, v0); + + ld1(v1, v2, T16B, post(key, 32)); + rev32(v1, T16B, v1); + rev32(v2, T16B, v2); + + cmpw(keylen, 52); + br(Assembler::EQ, L_doLast); + + aesd(v0, v1); + aesimc(v0, v0); + aesd(v0, v2); + aesimc(v0, v0); + + ld1(v1, v2, T16B, post(key, 32)); + rev32(v1, T16B, v1); + rev32(v2, T16B, v2); + + bind(L_doLast); + + aesd(v0, v1); + aesimc(v0, v0); + aesd(v0, v2); + + eor(v0, T16B, v0, v5); + + st1(v0, T16B, to); + + // Preserve the address of the start of the key + sub(key, key, keylen, LSL, exact_log2(sizeof (jint))); +} + +// Load expanded key into v17..v31 +void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) { + Label L_loadkeys_44, L_loadkeys_52; + cmpw(keylen, 52); + br(Assembler::LO, L_loadkeys_44); + br(Assembler::EQ, L_loadkeys_52); + + ld1(v17, v18, T16B, post(key, 32)); + rev32(v17, T16B, v17); + rev32(v18, T16B, v18); + bind(L_loadkeys_52); + ld1(v19, v20, T16B, post(key, 32)); + rev32(v19, T16B, v19); + rev32(v20, T16B, v20); + bind(L_loadkeys_44); + ld1(v21, v22, v23, v24, T16B, post(key, 64)); + rev32(v21, T16B, v21); + rev32(v22, T16B, v22); + rev32(v23, T16B, v23); + rev32(v24, T16B, v24); + ld1(v25, v26, v27, v28, T16B, post(key, 64)); + rev32(v25, T16B, v25); + rev32(v26, T16B, v26); + rev32(v27, T16B, v27); + rev32(v28, T16B, v28); + ld1(v29, v30, v31, T16B, post(key, 48)); + rev32(v29, T16B, v29); + rev32(v30, T16B, v30); + rev32(v31, T16B, v31); + + // Preserve the address of the start of the key + sub(key, key, keylen, LSL, exact_log2(sizeof (jint))); +} + +// NeoverseTM N1Software Optimization Guide: +// Adjacent AESE/AESMC instruction pairs and adjacent AESD/AESIMC +// instruction pairs will exhibit the performance characteristics +// described in Section 4.6. +void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) { + aese(input, subkey); aesmc(input, input); +} + +// KernelGenerator +// +// The abstract base class of an unrolled function generator. +// Subclasses override generate(), length(), and next() to generate +// unrolled and interleaved functions. +// +// The core idea is that a subclass defines a method which generates +// the base case of a function and a method to generate a clone of it, +// shifted to a different set of registers. 
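
// A standalone note (not part of the patch) on the keylen comparisons
// in aesecb_decrypt/aesenc_loadkeys above: HotSpot passes the expanded
// key as a Java int[] of length 4*(rounds+1), so the stubs recover the
// AES variant from the array length alone by comparing against 44 and 52.

#include <cstdio>

static int rounds_for_keylen(int keylen_ints) {
  switch (keylen_ints) {
    case 44: return 10;  // AES-128
    case 52: return 12;  // AES-192
    case 60: return 14;  // AES-256
    default: return -1;  // not an expanded AES key
  }
}

int main() {
  const int lens[] = {44, 52, 60};
  for (int len : lens)
    std::printf("keylen=%d ints -> %d rounds\n", len, rounds_for_keylen(len));
  return 0;
}
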
KernelGenerator will then +// generate several interleaved copies of the function, with each one +// using a different set of registers. + +// The subclass must implement three methods: length(), which is the +// number of instruction bundles in the intrinsic, generate(int n) +// which emits the nth instruction bundle in the intrinsic, and next() +// which takes an instance of the generator and returns a version of it, +// shifted to a new set of registers. + +class KernelGenerator: public MacroAssembler { +protected: + const int _unrolls; +public: + KernelGenerator(Assembler *as, int unrolls) + : MacroAssembler(as->code()), _unrolls(unrolls) { } + virtual void generate(int index) = 0; + virtual int length() = 0; + virtual KernelGenerator *next() = 0; + int unrolls() { return _unrolls; } + void unroll(); +}; + +void KernelGenerator::unroll() { + ResourceMark rm; + KernelGenerator **generators + = NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls()); + + generators[0] = this; + for (int i = 1; i < unrolls(); i++) { + generators[i] = generators[i-1]->next(); + } + + for (int j = 0; j < length(); j++) { + for (int i = 0; i < unrolls(); i++) { + generators[i]->generate(j); + } + } +} + +// An unrolled and interleaved generator for AES encryption. +class AESKernelGenerator: public KernelGenerator { + Register _from, _to; + const Register _keylen; + FloatRegister _data; + const FloatRegister _subkeys; + bool _once; + Label _rounds_44, _rounds_52; + +public: + AESKernelGenerator(Assembler *as, int unrolls, + Register from, Register to, Register keylen, FloatRegister data, + FloatRegister subkeys, bool once = true) + : KernelGenerator(as, unrolls), + _from(from), _to(to), _keylen(keylen), _data(data), + _subkeys(subkeys), _once(once) { + } + + virtual void generate(int index) { + switch (index) { + case 0: + if (_from != noreg) { + ld1(_data, T16B, _from); // get 16 bytes of input + } + break; + case 1: + if (_once) { + cmpw(_keylen, 52); + br(Assembler::LO, _rounds_44); + br(Assembler::EQ, _rounds_52); + } + break; + case 2: aes_round(_data, _subkeys + 0); break; + case 3: aes_round(_data, _subkeys + 1); break; + case 4: + if (_once) bind(_rounds_52); + break; + case 5: aes_round(_data, _subkeys + 2); break; + case 6: aes_round(_data, _subkeys + 3); break; + case 7: + if (_once) bind(_rounds_44); + break; + case 8: aes_round(_data, _subkeys + 4); break; + case 9: aes_round(_data, _subkeys + 5); break; + case 10: aes_round(_data, _subkeys + 6); break; + case 11: aes_round(_data, _subkeys + 7); break; + case 12: aes_round(_data, _subkeys + 8); break; + case 13: aes_round(_data, _subkeys + 9); break; + case 14: aes_round(_data, _subkeys + 10); break; + case 15: aes_round(_data, _subkeys + 11); break; + case 16: aes_round(_data, _subkeys + 12); break; + case 17: aese(_data, _subkeys + 13); break; + case 18: eor(_data, T16B, _data, _subkeys + 14); break; + case 19: + if (_to != noreg) { + st1(_data, T16B, _to); + } + break; + default: ShouldNotReachHere(); + } + } + + virtual KernelGenerator *next() { + return new AESKernelGenerator(this, _unrolls, + _from, _to, _keylen, + _data + 1, _subkeys, /*once*/false); + } + + virtual int length() { return 20; } +}; + +// Uses expanded key in v17..v31 +// Returns encrypted values in inputs. +// If to != noreg, store value at to; likewise from +// Preserves key, keylen +// Increments from, to +// Input data in v0, v1, ... 
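
// A standalone sketch (not part of the patch) of the emission order
// produced by KernelGenerator::unroll() above, with printf standing in
// for instruction emission: bundle j of every clone is emitted before
// bundle j+1 of any clone, so the independent instruction streams are
// interleaved and AESE/AESMC latency is hidden.

#include <cstdio>

struct ToyKernel {
  int reg;                              // base register of this clone
  void generate(int bundle) const {
    std::printf("bundle %d of clone v%d\n", bundle, reg);
  }
  ToyKernel next() const { return ToyKernel{reg + 1}; }
};

int main() {
  const int unrolls = 2, length = 3;
  ToyKernel gen[unrolls] = {{0}, {0}};
  for (int i = 1; i < unrolls; i++) gen[i] = gen[i - 1].next();
  for (int j = 0; j < length; j++)      // same double loop as unroll()
    for (int i = 0; i < unrolls; i++)
      gen[i].generate(j);
  return 0;
}
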
+// unrolls controls the number of times to unroll the generated function +void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen, + FloatRegister data, int unrolls) { + AESKernelGenerator(this, unrolls, from, to, keylen, data, v17) .unroll(); +} + +// ghash_multiply and ghash_reduce are the non-unrolled versions of +// the GHASH function generators. +void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, + FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) { + // Karatsuba multiplication performs a 128*128 -> 256-bit + // multiplication in three 128-bit multiplications and a few + // additions. + // + // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) + // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 + // + // Inputs: + // + // A0 in a.d[0] (subkey) + // A1 in a.d[1] + // (A1+A0) in a1_xor_a0.d[0] + // + // B0 in b.d[0] (state) + // B1 in b.d[1] + + ext(tmp1, T16B, b, b, 0x08); + pmull2(result_hi, T1Q, b, a, T2D); // A1*B1 + eor(tmp1, T16B, tmp1, b); // (B1+B0) + pmull(result_lo, T1Q, b, a, T1D); // A0*B0 + pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D); // (A1+A0)(B1+B0) + + ext(tmp1, T16B, result_lo, result_hi, 0x08); + eor(tmp3, T16B, result_hi, result_lo); // A1*B1+A0*B0 + eor(tmp2, T16B, tmp2, tmp1); + eor(tmp2, T16B, tmp2, tmp3); + + // Register pair <result_hi:result_lo> holds the result of carry-less multiplication + ins(result_hi, D, tmp2, 0, 1); + ins(result_lo, D, tmp2, 1, 0); +} + +void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, + FloatRegister p, FloatRegister vzr, FloatRegister t1) { + const FloatRegister t0 = result; + + // The GCM field polynomial f is z^128 + p(z), where p = + // z^7+z^2+z+1. + // + // z^128 === -p(z) (mod (z^128 + p(z))) + // + // so, given that the product we're reducing is + // a == lo + hi * z^128 + // substituting, + // === lo - hi * p(z) (mod (z^128 + p(z))) + // + // we reduce by multiplying hi by p(z) and subtracting the result + // from (i.e. XORing it with) lo. Because p has no nonzero high + // bits we can do this with two 64-bit multiplications, lo*p and + // hi*p. + + pmull2(t0, T1Q, hi, p, T2D); + ext(t1, T16B, t0, vzr, 8); + eor(hi, T16B, hi, t1); + ext(t1, T16B, vzr, t0, 8); + eor(lo, T16B, lo, t1); + pmull(t0, T1Q, hi, p, T1D); + eor(result, T16B, lo, t0); +} + +class GHASHMultiplyGenerator: public KernelGenerator { + FloatRegister _result_lo, _result_hi, _b, + _a, _vzr, _a1_xor_a0, _p, + _tmp1, _tmp2, _tmp3; + +public: + GHASHMultiplyGenerator(Assembler *as, int unrolls, + FloatRegister result_lo, FloatRegister result_hi, + /* offsetted registers */ + FloatRegister b, + /* non-offsetted (shared) registers */ + FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr, + /* offseted (temp) registers */ + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) + : KernelGenerator(as, unrolls), + _result_lo(result_lo), _result_hi(result_hi), _b(b), + _a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p), + _tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { } + + static const int register_stride = 7; + + virtual void generate(int index) { + // Karatsuba multiplication performs a 128*128 -> 256-bit + // multiplication in three 128-bit multiplications and a few + // additions. 
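
// A standalone arithmetic check (not part of the patch) of the
// Karatsuba identity used here, on 16-bit halves of 32-bit integers:
// with C = A1*B1, D = A0*B0 and E = (A1+A0)*(B1+B0), the cross term
// A1*B0 + A0*B1 equals E - C - D, so three multiplies suffice.  The
// stub does the same thing in GF(2)[z], where + and - are both XOR.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t A = 0x9abcdef0u, B = 0x87654321u;
  const uint64_t A1 = A >> 16, A0 = A & 0xffff;
  const uint64_t B1 = B >> 16, B0 = B & 0xffff;
  const uint64_t C = A1 * B1, D = A0 * B0;
  const uint64_t E = (A1 + A0) * (B1 + B0);
  const uint64_t product = (C << 32) + ((E - C - D) << 16) + D;
  assert(product == (uint64_t)A * B);
  return 0;
}
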
+ // + // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) + // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 + // + // Inputs: + // + // A0 in a.d[0] (subkey) + // A1 in a.d[1] + // (A1+A0) in a1_xor_a0.d[0] + // + // B0 in b.d[0] (state) + // B1 in b.d[1] + + switch (index) { + case 0: ext(_tmp1, T16B, _b, _b, 0x08); break; + case 1: pmull2(_result_hi, T1Q, _b, _a, T2D); // A1*B1 + break; + case 2: eor(_tmp1, T16B, _tmp1, _b); // (B1+B0) + break; + case 3: pmull(_result_lo, T1Q, _b, _a, T1D); // A0*B0 + break; + case 4: pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D); // (A1+A0)(B1+B0) + break; + + case 5: ext(_tmp1, T16B, _result_lo, _result_hi, 0x08); break; + case 6: eor(_tmp3, T16B, _result_hi, _result_lo); // A1*B1+A0*B0 + break; + case 7: eor(_tmp2, T16B, _tmp2, _tmp1); break; + case 8: eor(_tmp2, T16B, _tmp2, _tmp3); break; + + // Register pair <_result_hi:_result_lo> holds the _result of carry-less multiplication + case 9: ins(_result_hi, D, _tmp2, 0, 1); break; + case 10: ins(_result_lo, D, _tmp2, 1, 0); break; + default: ShouldNotReachHere(); + } + } + + virtual KernelGenerator *next() { + GHASHMultiplyGenerator *result + = new GHASHMultiplyGenerator(this, _unrolls, _result_lo, _result_hi, + _b, _a, _a1_xor_a0, _p, _vzr, + _tmp1, _tmp2, _tmp3); + result->_result_lo += register_stride; + result->_result_hi += register_stride; + result->_b += register_stride; + result->_tmp1 += register_stride; + result->_tmp2 += register_stride; + result->_tmp3 += register_stride; + return result; + } + + virtual int length() { return 11; } +}; + +// Reduce the 128-bit product in hi:lo by the GCM field polynomial. +// The FloatRegister argument called data is optional: if it is a +// valid register, we interleave LD1 instructions with the +// reduction. This is to reduce latency next time around the loop. +class GHASHReduceGenerator: public KernelGenerator { + FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1; + int _once; +public: + GHASHReduceGenerator(Assembler *as, int unrolls, + /* offsetted registers */ + FloatRegister result, FloatRegister lo, FloatRegister hi, + /* non-offsetted (shared) registers */ + FloatRegister p, FloatRegister vzr, FloatRegister data, + /* offseted (temp) registers */ + FloatRegister t1) + : KernelGenerator(as, unrolls), + _result(result), _lo(lo), _hi(hi), + _p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { } + + static const int register_stride = 7; + + virtual void generate(int index) { + const FloatRegister t0 = _result; + + switch (index) { + // The GCM field polynomial f is z^128 + p(z), where p = + // z^7+z^2+z+1. + // + // z^128 === -p(z) (mod (z^128 + p(z))) + // + // so, given that the product we're reducing is + // a == lo + hi * z^128 + // substituting, + // === lo - hi * p(z) (mod (z^128 + p(z))) + // + // we reduce by multiplying hi by p(z) and subtracting the _result + // from (i.e. XORing it with) lo. Because p has no nonzero high + // bits we can do this with two 64-bit multiplications, lo*p and + // hi*p. 
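
// A standalone miniature (not part of the patch) of the same fold in
// AES's 8-bit field GF(2^8), small enough to check by hand: since
// z^8 === p(z) (mod z^8 + p(z)), the bits at or above z^8 in a
// carry-less product are multiplied by p and XORed back into the low
// half, mirroring the two pmull/pmull2 folds above.  {57}*{83} = {c1}
// is the worked example from FIPS-197.

#include <cassert>
#include <cstdint>

static uint32_t clmul(uint32_t a, uint32_t b) {  // carry-less multiply
  uint32_t r = 0;
  for (int i = 0; i < 16; i++)
    if (b & (1u << i)) r ^= a << i;
  return r;
}

static uint8_t gf8_reduce(uint32_t prod) {       // p(z) = z^4+z^3+z+1
  while (prod >> 8)
    prod = (prod & 0xff) ^ clmul(prod >> 8, 0x1b);
  return (uint8_t)prod;
}

int main() {
  assert(gf8_reduce(clmul(0x57, 0x83)) == 0xc1);
  return 0;
}
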
+ + case 0: pmull2(t0, T1Q, _hi, _p, T2D); break; + case 1: ext(_t1, T16B, t0, _vzr, 8); break; + case 2: eor(_hi, T16B, _hi, _t1); break; + case 3: ext(_t1, T16B, _vzr, t0, 8); break; + case 4: eor(_lo, T16B, _lo, _t1); break; + case 5: pmull(t0, T1Q, _hi, _p, T1D); break; + case 6: eor(_result, T16B, _lo, t0); break; + default: ShouldNotReachHere(); + } + + // Sprinkle load instructions into the generated instructions + if (_data->is_valid() && _once) { + assert(length() >= unrolls(), "not enough room for inteleaved loads"); + if (index < unrolls()) { + ld1((_data + index*register_stride), T16B, post(r2, 0x10)); + } + } + } + + virtual KernelGenerator *next() { + GHASHReduceGenerator *result + = new GHASHReduceGenerator(this, _unrolls, + _result, _lo, _hi, _p, _vzr, _data, _t1); + result->_result += register_stride; + result->_hi += register_stride; + result->_lo += register_stride; + result->_t1 += register_stride; + result->_once = false; + return result; + } + + int length() { return 7; } +}; + +// Perform a GHASH multiply/reduce on a single FloatRegister. +void MacroAssembler::ghash_modmul(FloatRegister result, + FloatRegister result_lo, FloatRegister result_hi, FloatRegister b, + FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p, + FloatRegister t1, FloatRegister t2, FloatRegister t3) { + ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3); + ghash_reduce(result, result_lo, result_hi, p, vzr, t1); +} + +// Interleaved GHASH processing. +// +// Clobbers all vector registers. +// +void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state, + Register subkeyH, + Register data, Register blocks, int unrolls) { + int register_stride = 7; + + // Bafflingly, GCM uses little-endian for the byte order, but + // big-endian for the bit order. For example, the polynomial 1 is + // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. + // + // So, we must either reverse the bytes in each word and do + // everything big-endian or reverse the bits in each byte and do + // it little-endian. On AArch64 it's more idiomatic to reverse + // the bits in each byte (we have an instruction, RBIT, to do + // that) and keep the data in little-endian bit order throught the + // calculation, bit-reversing the inputs and outputs. + + assert(unrolls * register_stride < 32, "out of registers"); + + FloatRegister a1_xor_a0 = v28; + FloatRegister Hprime = v29; + FloatRegister vzr = v30; + FloatRegister p = v31; + eor(vzr, T16B, vzr, vzr); // zero register + + ldrq(p, field_polynomial); // The field polynomial + + ldrq(v0, Address(state)); + ldrq(Hprime, Address(subkeyH)); + + rev64(v0, T16B, v0); // Bit-reverse words in state and subkeyH + rbit(v0, T16B, v0); + rev64(Hprime, T16B, Hprime); + rbit(Hprime, T16B, Hprime); + + // Powers of H -> Hprime + + Label already_calculated, done; + { + // The first time around we'll have to calculate H**2, H**3, etc. + // Look at the largest power of H in the subkeyH array to see if + // it's already been calculated. 
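
// A standalone sketch (not part of the patch) of the caching protocol
// above, with plain integers standing in for GF(2^128) elements.  The
// assumption (labelled here, not stated in this hunk) is that the Java
// side zero-fills every slot past H itself, so a nonzero top slot
// proves an earlier call already filled the table.

#include <cassert>
#include <cstdint>

static void fill_powers(uint64_t table[], int n) {  // table[i] = H^(i+1)
  if (table[n - 1] != 0) return;                    // already_calculated
  for (int i = 1; i < n; i++)
    table[i] = table[i - 1] * table[0];
}

int main() {
  uint64_t subkeyH[4] = {3, 0, 0, 0};  // H = 3, higher powers unfilled
  fill_powers(subkeyH, 4);
  assert(subkeyH[3] == 81);            // H^4
  fill_powers(subkeyH, 4);             // second call takes the fast path
  return 0;
}
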
+ ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1))); + orr(rscratch1, rscratch1, rscratch2); + cbnz(rscratch1, already_calculated); + + orr(v6, T16B, Hprime, Hprime); // Start with H in v6 and Hprime + for (int i = 1; i < unrolls; i++) { + ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0 + eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) + ghash_modmul(/*result*/v6, /*result_lo*/v5, /*result_hi*/v4, /*b*/v6, + Hprime, vzr, a1_xor_a0, p, + /*temps*/v1, v3, v2); + rev64(v1, T16B, v6); + rbit(v1, T16B, v1); + strq(v1, Address(subkeyH, 16 * i)); + } + b(done); + } + { + bind(already_calculated); + + // Load the largest power of H we need into v6. + ldrq(v6, Address(subkeyH, 16 * (unrolls - 1))); + rev64(v6, T16B, v6); + rbit(v6, T16B, v6); + } + bind(done); + + orr(Hprime, T16B, v6, v6); // Move H ** unrolls into Hprime + + // Hprime contains (H ** 1, H ** 2, ... H ** unrolls) + // v0 contains the initial state. Clear the others. + for (int i = 1; i < unrolls; i++) { + int ofs = register_stride * i; + eor(ofs+v0, T16B, ofs+v0, ofs+v0); // zero each state register + } + + ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0 + eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) + + // Load #unrolls blocks of data + for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) { + ld1(v2+ofs, T16B, post(data, 0x10)); + } + + // Register assignments, replicated across 4 clones, v0 ... v23 + // + // v0: input / output: current state, result of multiply/reduce + // v1: temp + // v2: input: one block of data (the ciphertext) + // also used as a temp once the data has been consumed + // v3: temp + // v4: output: high part of product + // v5: output: low part ... + // v6: unused + // + // Not replicated: + // + // v28: High part of H xor low part of H' + // v29: H' (hash subkey) + // v30: zero + // v31: Reduction polynomial of the Galois field + + // Inner loop. + // Do the whole load/add/multiply/reduce over all our data except + // the last few rows. + { + Label L_ghash_loop; + bind(L_ghash_loop); + + // Prefetching doesn't help here. In fact, on Neoverse N1 it's worse. + // prfm(Address(data, 128), PLDL1KEEP); + + // Xor data into current state + for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) { + rbit((v2+ofs), T16B, (v2+ofs)); + eor((v2+ofs), T16B, v0+ofs, (v2+ofs)); // bit-swapped data ^ bit-swapped state + } + + // Generate fully-unrolled multiply-reduce in two stages. + + (new GHASHMultiplyGenerator(this, unrolls, + /*result_lo*/v5, /*result_hi*/v4, /*data*/v2, + Hprime, a1_xor_a0, p, vzr, + /*temps*/v1, v3, /* reuse b*/v2))->unroll(); + + // NB: GHASHReduceGenerator also loads the next #unrolls blocks of + // data into v0, v0+ofs, the current state. + (new GHASHReduceGenerator (this, unrolls, + /*result*/v0, /*lo*/v5, /*hi*/v4, p, vzr, + /*data*/v2, /*temp*/v3))->unroll(); + + sub(blocks, blocks, unrolls); + cmp(blocks, (unsigned char)(unrolls * 2)); + br(GE, L_ghash_loop); + } + + // Merge the #unrolls states. Note that the data for the next + // iteration has already been loaded into v4, v4+ofs, etc... + + // First, we multiply/reduce each clone by the appropriate power of H. 
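
// A standalone check (not part of the patch) of the algebra behind
// this merge: iterating the GHASH recurrence Y <- (Y + X)*H once per
// block equals combining each block with a descending power of H, so
// clone i can run independently and be scaled by H^(unrolls-i) at the
// end.  The identity holds in any commutative ring; plain integers
// stand in for GF(2^128), where + would be XOR.

#include <cassert>
#include <cstdint>

int main() {
  const int64_t H = 7, Y0 = 5;
  const int64_t X[4] = {11, 13, 17, 19};
  int64_t serial = Y0;
  for (int i = 0; i < 4; i++) serial = (serial + X[i]) * H;
  const int64_t wide = (Y0 + X[0]) * H * H * H * H
                     + X[1] * H * H * H
                     + X[2] * H * H
                     + X[3] * H;
  assert(serial == wide);
  return 0;
}
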
+ for (int i = 0; i < unrolls; i++) { + int ofs = register_stride * i; + ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1))); + + rbit(v2+ofs, T16B, v2+ofs); + eor(v2+ofs, T16B, ofs+v0, v2+ofs); // bit-swapped data ^ bit-swapped state + + rev64(Hprime, T16B, Hprime); + rbit(Hprime, T16B, Hprime); + ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0 + eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) + ghash_modmul(/*result*/v0+ofs, /*result_lo*/v5+ofs, /*result_hi*/v4+ofs, /*b*/v2+ofs, + Hprime, vzr, a1_xor_a0, p, + /*temps*/v1+ofs, v3+ofs, /* reuse b*/v2+ofs); + } + + // Then we sum the results. + for (int i = 0; i < unrolls - 1; i++) { + int ofs = register_stride * i; + eor(v0, T16B, v0, v0 + register_stride + ofs); + } + + sub(blocks, blocks, (unsigned char)unrolls); + + // And finally bit-reverse the state back to big endian. + rev64(v0, T16B, v0); + rbit(v0, T16B, v0); + st1(v0, T16B, state); +} \ No newline at end of file diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp index 2e2e8ae78..c024dec55 100644 --- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp @@ -2804,6 +2804,266 @@ class StubGenerator: public StubCodeGenerator { return start; } + // CTR AES crypt. + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - counter vector byte array address + // c_rarg4 - input length + // c_rarg5 - saved encryptedCounter start + // c_rarg6 - saved used length + // + // Output: + // r0 - input length + // + address generate_counterMode_AESCrypt() { + const Register in = c_rarg0; + const Register out = c_rarg1; + const Register key = c_rarg2; + const Register counter = c_rarg3; + const Register saved_len = c_rarg4, len = r10; + const Register saved_encrypted_ctr = c_rarg5; + const Register used_ptr = c_rarg6, used = r12; + + const Register offset = r7; + const Register keylen = r11; + + const unsigned char block_size = 16; + const int bulk_width = 4; + // NB: bulk_width can be 4 or 8. 8 gives slightly faster + // performance with larger data sizes, but it also means that the + // fast path isn't used until you have at least 8 blocks, and up + // to 127 bytes of data will be executed on the slow path. For + // that reason, and also so as not to blow away too much icache, 4 + // blocks seems like a sensible compromise. + + // Algorithm: + // + // if (len == 0) { + // goto DONE; + // } + // int result = len; + // do { + // if (used >= blockSize) { + // if (len >= bulk_width * blockSize) { + // CTR_large_block(); + // if (len == 0) + // goto DONE; + // } + // for (;;) { + // 16ByteVector v0 = counter; + // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); + // used = 0; + // if (len < blockSize) + // break; /* goto NEXT */ + // 16ByteVector v1 = load16Bytes(in, offset); + // v1 = v1 ^ encryptedCounter; + // store16Bytes(out, offset); + // used = blockSize; + // offset += blockSize; + // len -= blockSize; + // if (len == 0) + // goto DONE; + // } + // } + // NEXT: + // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); + // len--; + // } while (len != 0); + // DONE: + // return result; + // + // CTR_large_block() + // Wide bulk encryption of whole blocks. 
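
// A standalone C++ rendering (not part of the patch) of the contract
// the pseudocode above describes.  encryptBlock is a toy stand-in for
// AES, present only so the sketch runs, and inc128 does a full-width
// big-endian increment for simplicity.  'used' carries the position
// inside the current encrypted counter block across calls, which is
// why the stub saves both it and the encrypted counter back.

#include <cstddef>
#include <cstdint>

static const int kBlock = 16;

// Toy one-block "cipher" -- NOT AES, just enough to execute the flow.
static void encryptBlock(const uint8_t key[kBlock],
                         const uint8_t in[kBlock], uint8_t out[kBlock]) {
  for (int i = 0; i < kBlock; i++) out[i] = (uint8_t)(in[i] ^ key[i] ^ 0x5a);
}

static void inc128(uint8_t ctr[kBlock]) {  // big-endian increment
  for (int i = kBlock - 1; i >= 0 && ++ctr[i] == 0; i--) {}
}

size_t ctr_crypt(const uint8_t key[kBlock], uint8_t ctr[kBlock],
                 uint8_t encCtr[kBlock], int &used,
                 const uint8_t *in, uint8_t *out, size_t len) {
  const size_t result = len;
  for (size_t i = 0; i < len; i++) {
    if (used >= kBlock) {        // counter block exhausted: make a new one
      encryptBlock(key, ctr, encCtr);
      inc128(ctr);
      used = 0;
    }
    out[i] = (uint8_t)(in[i] ^ encCtr[used++]);
  }
  return result;
}

int main() {
  uint8_t key[kBlock] = {0}, ctr[kBlock] = {0}, encCtr[kBlock];
  int used = kBlock;                   // no buffered keystream yet
  uint8_t msg[5] = {'h', 'e', 'l', 'l', 'o'}, out[5];
  ctr_crypt(key, ctr, encCtr, used, msg, out, 5);
  return 0;
}
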
+ + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); + const address start = __ pc(); + __ enter(); + + Label DONE, CTR_large_block, large_block_return; + __ ldrw(used, Address(used_ptr)); + __ cbzw(saved_len, DONE); + + __ mov(len, saved_len); + __ mov(offset, 0); + + // Compute #rounds for AES based on the length of the key array + __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + + __ aesenc_loadkeys(key, keylen); + + { + Label L_CTR_loop, NEXT; + + __ bind(L_CTR_loop); + + __ cmp(used, block_size); + __ br(__ LO, NEXT); + + // Maybe we have a lot of data + __ subsw(rscratch1, len, bulk_width * block_size); + __ br(__ HS, CTR_large_block); + __ BIND(large_block_return); + __ cbzw(len, DONE); + + // Setup the counter + __ movi(v4, __ T4S, 0); + __ movi(v5, __ T4S, 1); + __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 } + + __ ld1(v0, __ T16B, counter); // Load the counter into v0 + __ rev32(v16, __ T16B, v0); + __ addv(v16, __ T4S, v16, v4); + __ rev32(v16, __ T16B, v16); + __ st1(v16, __ T16B, counter); // Save the incremented counter back + + { + // We have fewer than bulk_width blocks of data left. Encrypt + // them one by one until there is less than a full block + // remaining, being careful to save both the encrypted counter + // and the counter. + + Label inner_loop; + __ bind(inner_loop); + // Counter to encrypt is in v0 + __ aesecb_encrypt(noreg, noreg, keylen); + __ st1(v0, __ T16B, saved_encrypted_ctr); + + // Do we have a remaining full block? + + __ mov(used, 0); + __ cmp(len, block_size); + __ br(__ LO, NEXT); + + // Yes, we have a full block + __ ldrq(v1, Address(in, offset)); + __ eor(v1, __ T16B, v1, v0); + __ strq(v1, Address(out, offset)); + __ mov(used, block_size); + __ add(offset, offset, block_size); + + __ subw(len, len, block_size); + __ cbzw(len, DONE); + + // Increment the counter, store it back + __ orr(v0, __ T16B, v16, v16); + __ rev32(v16, __ T16B, v16); + __ addv(v16, __ T4S, v16, v4); + __ rev32(v16, __ T16B, v16); + __ st1(v16, __ T16B, counter); // Save the incremented counter back + + __ b(inner_loop); + } + + __ BIND(NEXT); + + // Encrypt a single byte, and loop. + // We expect this to be a rare event. 
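
// A standalone scalar rendering (not part of the patch) of the counter
// update used earlier in this stub: rev32 makes each 32-bit word of
// the big-endian counter addable, addv adds {0,0,0,1}, and rev32
// restores the byte order.  Net effect: only the last four bytes of
// the IV are incremented, wrapping modulo 2^32 with no carry into the
// preceding bytes.

#include <cassert>
#include <cstdint>

static void inc32(uint8_t ctr[16]) {
  uint32_t c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
               ((uint32_t)ctr[14] << 8)  |  (uint32_t)ctr[15];
  c += 1;                                       // wraps modulo 2^32
  ctr[12] = (uint8_t)(c >> 24); ctr[13] = (uint8_t)(c >> 16);
  ctr[14] = (uint8_t)(c >> 8);  ctr[15] = (uint8_t)c;
}

int main() {
  uint8_t ctr[16] = {0};
  ctr[15] = 0xff;
  inc32(ctr);
  assert(ctr[14] == 1 && ctr[15] == 0);  // carry stays inside the word
  return 0;
}
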
+ __ ldrb(rscratch1, Address(in, offset)); + __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); + __ eor(rscratch1, rscratch1, rscratch2); + __ strb(rscratch1, Address(out, offset)); + __ add(offset, offset, 1); + __ add(used, used, 1); + __ subw(len, len,1); + __ cbnzw(len, L_CTR_loop); + } + + __ bind(DONE); + __ strw(used, Address(used_ptr)); + __ mov(r0, saved_len); + + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(lr); + + // Bulk encryption + + __ BIND (CTR_large_block); + assert(bulk_width == 4 || bulk_width == 8, "must be"); + + if (bulk_width == 8) { + __ sub(sp, sp, 4 * 16); + __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); + } + __ sub(sp, sp, 4 * 16); + __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); + RegSet saved_regs = (RegSet::of(in, out, offset) + + RegSet::of(saved_encrypted_ctr, used_ptr, len)); + __ push(saved_regs, sp); + __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption + __ add(in, in, offset); + __ add(out, out, offset); + + // Keys should already be loaded into the correct registers + + __ ld1(v0, __ T16B, counter); // v0 contains the first counter + __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter + + // AES/CTR loop + { + Label L_CTR_loop; + __ BIND(L_CTR_loop); + + // Setup the counters + __ movi(v8, __ T4S, 0); + __ movi(v9, __ T4S, 1); + __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } + + for (FloatRegister f = v0; f < v0 + bulk_width; f++) { + __ rev32(f, __ T16B, v16); + __ addv(v16, __ T4S, v16, v8); + } + + __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); + + // Encrypt the counters + __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); + + if (bulk_width == 8) { + __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); + } + + // XOR the encrypted counters with the inputs + for (int i = 0; i < bulk_width; i++) { + __ eor(v0 + i, __ T16B, v0 + i, v8 + i); + } + + // Write the encrypted data + __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); + if (bulk_width == 8) { + __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); + } + + __ subw(len, len, 16 * bulk_width); + __ cbnzw(len, L_CTR_loop); + } + + // Save the counter back where it goes + __ rev32(v16, __ T16B, v16); + __ st1(v16, __ T16B, counter); + + __ pop(saved_regs, sp); + + __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); + if (bulk_width == 8) { + __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); + } + + __ andr(rscratch1, len, -16 * bulk_width); + __ sub(len, len, rscratch1); + __ add(offset, offset, rscratch1); + __ mov(used, 16); + __ strw(used, Address(used_ptr)); + __ b(large_block_return); + + return start; + } + + // Arguments: // // Inputs: @@ -3677,6 +3937,56 @@ class StubGenerator: public StubCodeGenerator { return start; } + address generate_ghash_processBlocks_wide() { + address small = generate_ghash_processBlocks(); + + StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); + __ align(wordSize * 2); + address p = __ pc(); + __ emit_int64(0x87); // The low-order bits of the field + // polynomial (i.e. 
p = z^7+z^2+z+1) + // repeated in the low and high parts of a + // 128-bit vector + __ emit_int64(0x87); + + __ align(CodeEntryAlignment); + address start = __ pc(); + + Register state = c_rarg0; + Register subkeyH = c_rarg1; + Register data = c_rarg2; + Register blocks = c_rarg3; + + const int unroll = 4; + + __ cmp(blocks, (unsigned char)(unroll * 2)); + __ br(__ LT, small); + + if (unroll > 1) { + // Save state before entering routine + __ sub(sp, sp, 4 * 16); + __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); + __ sub(sp, sp, 4 * 16); + __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); + } + + __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); + + if (unroll > 1) { + // And restore state + __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); + __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); + } + + __ cmp(blocks, 0u); + __ br(__ GT, small); + + __ ret(lr); + + return start; + } + + // Continuation point for throwing of implicit exceptions that are // not handled in the current activation. Fabricates an exception // oop and initiates normal exception dispatching in this @@ -4687,6 +4997,15 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_montgomerySquare = g.generate_multiply(); } + // generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + if (UseAESCTRIntrinsics) { + StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); + } else { + StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); + } + } + if (UseAESIntrinsics) { StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); @@ -4694,9 +5013,8 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); } - // generate GHASH intrinsics code - if (UseGHASHIntrinsics) { - StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); + if (UseAESCTRIntrinsics) { + StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); } if (UseSHA1Intrinsics) { diff --git a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp index d1c312ab3..05619ce7f 100644 --- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp +++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp @@ -37,7 +37,7 @@ static bool returns_to_call_stub(address return_pc) { enum platform_dependent_constants { code_size1 = 19000, // simply increase if too small (assembler will crash if too small) - code_size2 = 22000 // simply increase if too small (assembler will crash if too small) + code_size2 = 32000 // simply increase if too small (assembler will crash if too small) }; class aarch64 { diff --git a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp index 9808337a0..de636fb83 100644 --- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp +++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp @@ -233,12 +233,21 @@ void VM_Version::get_processor_features() { warning("UseAESIntrinsics enabled, but UseAES not, enabling"); UseAES = true; } + if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } } else { if (UseAES) { - warning("UseAES specified, but not supported on this CPU"); + warning("AES instructions are not available on this CPU"); + FLAG_SET_DEFAULT(UseAES, false); } if (UseAESIntrinsics) { - warning("UseAESIntrinsics specified, but not supported on 
this CPU"); + warning("AES intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + if (UseAESCTRIntrinsics) { + warning("AES/CTR intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); } } diff --git a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp index b5ce1cfa9..fea8b1f87 100644 --- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp +++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp @@ -194,6 +194,11 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(UseAESIntrinsics, false); } + if (UseAESCTRIntrinsics) { + warning("AES/CTR intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + if (UseGHASHIntrinsics) { warning("GHASH intrinsics are not available on this CPU"); FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); diff --git a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp index bd893e138..08d7a7311 100644 --- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp +++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp @@ -319,6 +319,11 @@ void VM_Version::initialize() { } } + if (UseAESCTRIntrinsics) { + warning("AES/CTR intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + // GHASH/GCM intrinsics if (has_vis3() && (UseVIS > 2)) { if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp index 1759ecdfd..ddc1acfd8 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp @@ -2373,20 +2373,52 @@ void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { void Assembler::pextrd(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, false); + int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, false); emit_int8(0x16); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); } +void Assembler::pextrd(Address dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, false); + emit_int8(0x16); + emit_operand(src, dst); + emit_int8(imm8); +} + void Assembler::pextrq(Register dst, XMMRegister src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); - int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true); + int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, true); emit_int8(0x16); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(imm8); } +void Assembler::pextrq(Address dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, true); + emit_int8(0x16); + emit_operand(src, dst); + emit_int8(imm8); +} + +void Assembler::pextrw(Address dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A); + emit_int8((unsigned char)0x15); + emit_operand(src, dst); + emit_int8(imm8); +} + +void Assembler::pextrb(Address dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A); + 
emit_int8(0x14); + emit_operand(src, dst); + emit_int8(imm8); +} + void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, false); @@ -2395,6 +2427,14 @@ void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { emit_int8(imm8); } +void Assembler::pinsrd(XMMRegister dst, Address src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, false); + emit_int8(0x22); + emit_operand(dst,src); + emit_int8(imm8); +} + void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { assert(VM_Version::supports_sse4_1(), ""); int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, true); @@ -2403,6 +2443,30 @@ void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { emit_int8(imm8); } +void Assembler::pinsrq(XMMRegister dst, Address src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true); + emit_int8(0x22); + emit_operand(dst, src); + emit_int8(imm8); +} + +void Assembler::pinsrw(XMMRegister dst, Address src, int imm8) { + assert(VM_Version::supports_sse2(), ""); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F); + emit_int8((unsigned char)0xC4); + emit_operand(dst, src); + emit_int8(imm8); +} + +void Assembler::pinsrb(XMMRegister dst, Address src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A); + emit_int8(0x20); + emit_operand(dst, src); + emit_int8(imm8); +} + void Assembler::pmovzxbw(XMMRegister dst, Address src) { assert(VM_Version::supports_sse4_1(), ""); InstructionMark im(this); @@ -3075,6 +3139,12 @@ void Assembler::xorl(Register dst, Register src) { emit_arith(0x33, 0xC0, dst, src); } +void Assembler::xorb(Register dst, Address src) { + InstructionMark im(this); + prefix(src, dst); + emit_int8(0x32); + emit_operand(dst, src); +} // AVX 3-operands scalar float-point arithmetic instructions diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.hpp index 5ea01311e..c2e70bc2a 100644 --- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp +++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp @@ -1479,10 +1479,20 @@ private: // SSE 4.1 extract void pextrd(Register dst, XMMRegister src, int imm8); void pextrq(Register dst, XMMRegister src, int imm8); + void pextrd(Address dst, XMMRegister src, int imm8); + void pextrq(Address dst, XMMRegister src, int imm8); + void pextrb(Address dst, XMMRegister src, int imm8); + // SSE 2 extract + void pextrw(Address dst, XMMRegister src, int imm8); // SSE 4.1 insert void pinsrd(XMMRegister dst, Register src, int imm8); void pinsrq(XMMRegister dst, Register src, int imm8); + void pinsrd(XMMRegister dst, Address src, int imm8); + void pinsrq(XMMRegister dst, Address src, int imm8); + void pinsrb(XMMRegister dst, Address src, int imm8); + // SSE 2 insert + void pinsrw(XMMRegister dst, Address src, int imm8); // SSE4.1 packed move void pmovzxbw(XMMRegister dst, XMMRegister src); @@ -1687,6 +1697,8 @@ private: void xorl(Register dst, Address src); void xorl(Register dst, Register src); + void xorb(Register dst, Address src); + void xorq(Register dst, Address src); void xorq(Register dst, Register src); diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp 
b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp index 2e5599807..f555f3326 100644 --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp @@ -2153,6 +2153,17 @@ class StubGenerator: public StubCodeGenerator { return start; } + address generate_counter_shuffle_mask() { + __ align(16); + StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask"); + address start = __ pc(); + __ emit_data(0x0c0d0e0f, relocInfo::none, 0); + __ emit_data(0x08090a0b, relocInfo::none, 0); + __ emit_data(0x04050607, relocInfo::none, 0); + __ emit_data(0x00010203, relocInfo::none, 0); + return start; + } + // Utility routine for loading a 128-bit key word in little endian format // can optionally specify that the shuffle mask is already in an xmmregister void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { @@ -2178,6 +2189,31 @@ class StubGenerator: public StubCodeGenerator { __ aesdec(xmmdst, xmmtmp); } + // Utility routine for increase 128bit counter (iv in CTR mode) + // XMM_128bit, D3, D2, D1, D0 + void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) { + __ pextrd(reg, xmmdst, 0x0); + __ addl(reg, inc_delta); + __ pinsrd(xmmdst, reg, 0x0); + __ jcc(Assembler::carryClear, next_block); // jump if no carry + + __ pextrd(reg, xmmdst, 0x01); // Carry-> D1 + __ addl(reg, 0x01); + __ pinsrd(xmmdst, reg, 0x01); + __ jcc(Assembler::carryClear, next_block); // jump if no carry + + __ pextrd(reg, xmmdst, 0x02); // Carry-> D2 + __ addl(reg, 0x01); + __ pinsrd(xmmdst, reg, 0x02); + __ jcc(Assembler::carryClear, next_block); // jump if no carry + + __ pextrd(reg, xmmdst, 0x03); // Carry -> D3 + __ addl(reg, 0x01); + __ pinsrd(xmmdst, reg, 0x03); + + __ BIND(next_block); // next instruction + } + // Arguments: // @@ -2719,6 +2755,309 @@ class StubGenerator: public StubCodeGenerator { return start; } + + // CTR AES crypt. 
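
// A standalone scalar rendering (not part of the patch) of inc_counter
// above.  After pshufb with counter_shuffle_mask the big-endian
// counter is byte-reversed, so D0 is the least-significant word and a
// 128-bit increment is one 32-bit add plus a carry ripple, which is
// the pextrd/addl/jcc(carryClear)/pinsrd chain; a carry out of D3 is
// dropped.

#include <cassert>
#include <cstdint>

static void inc_counter(uint32_t d[4], uint32_t delta) {  // d[0] = D0
  uint64_t sum = (uint64_t)d[0] + delta;
  d[0] = (uint32_t)sum;
  for (int i = 1; i < 4 && (sum >> 32) != 0; i++) {  // ripple the carry
    sum = (uint64_t)d[i] + 1;
    d[i] = (uint32_t)sum;
  }
}

int main() {
  uint32_t ctr[4] = {0xffffffffu, 0xffffffffu, 0, 0};
  inc_counter(ctr, 1);
  assert(ctr[0] == 0 && ctr[1] == 0 && ctr[2] == 1 && ctr[3] == 0);
  return 0;
}
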
+ // In 32-bit stub, parallelize 4 blocks at a time + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - counter vector byte array address + // c_rarg4 - input length + // + // Output: + // rax - input length + // + address generate_counterMode_AESCrypt_Parallel() { + assert(UseAES, "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); + address start = __ pc(); + const Register from = rsi; // source array address + const Register to = rdx; // destination array address + const Register key = rcx; // key array address + const Register counter = rdi; // counter byte array initialized from initvector array address + + // and left with the results of the last encryption block + const Register len_reg = rbx; + const Register pos = rax; + + __ enter(); // required for proper stackwalking of RuntimeStub frame + handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi + + // load registers from incoming parameters + const Address from_param(rbp, 8+0); + const Address to_param (rbp, 8+4); + const Address key_param (rbp, 8+8); + const Address rvec_param (rbp, 8+12); + const Address len_param (rbp, 8+16); + const Address saved_counter_param(rbp, 8 + 20); + const Address used_addr_param(rbp, 8 + 24); + + __ movptr(from , from_param); + __ movptr(to , to_param); + //__ movptr(key, key_param); + //__ movptr(counter, rvec_param); + __ movptr(len_reg , len_param); + //__ movptr(pos, 0); + + // Use the partially used encrpyted counter from last invocation + Label L_exit_preLoop, L_preLoop_start; + + // Use the registers 'counter' and 'key' here in this preloop + // to hold of last 2 params 'used' and 'saved_encCounter_start' + Register used = counter; + Register saved_encCounter_start = key; + Register used_addr = saved_encCounter_start; + + __ movptr(used_addr, used_addr_param); + __ movptr(used, Address(used_addr, 0)); + __ movptr(saved_encCounter_start, saved_counter_param); + + __ BIND(L_preLoop_start); + __ cmpptr(used, 16); + __ jcc(Assembler::aboveEqual, L_exit_preLoop); + __ cmpptr(len_reg, 0); + __ jcc(Assembler::lessEqual, L_exit_preLoop); + __ movb(rax, Address(saved_encCounter_start, used)); + __ xorb(rax, Address(from, 0)); + __ movb(Address(to, 0), rax); + __ addptr(from, 1); + __ addptr(to, 1); + __ addptr(used, 1); + __ subptr(len_reg, 1); + + __ jmp(L_preLoop_start); + + __ BIND(L_exit_preLoop); + __ movptr(used_addr, used_addr_param); + __ movptr(used_addr, used_addr_param); + __ movl(Address(used_addr, 0), used); + + // load the parameters 'key' and 'counter' + __ movptr(key, key_param); + __ movptr(counter, rvec_param); + + // xmm register assignments for the loops below + const XMMRegister xmm_curr_counter = xmm0; + const XMMRegister xmm_counter_shuf_mask = xmm1; // need to be reloaded + const XMMRegister xmm_key_shuf_mask = xmm2; // need to be reloaded + const XMMRegister xmm_key = xmm3; + const XMMRegister xmm_result0 = xmm4; + const XMMRegister xmm_result1 = xmm5; + const XMMRegister xmm_result2 = xmm6; + const XMMRegister xmm_result3 = xmm7; + const XMMRegister xmm_from0 = xmm1; //reuse XMM register + const XMMRegister xmm_from1 = xmm2; + const XMMRegister xmm_from2 = xmm3; + const XMMRegister xmm_from3 = xmm4; + + //for key_128, key_192, key_256 + const int rounds[3] = {10, 12, 14}; + Label L_singleBlockLoopTop[3]; + Label L_multiBlock_loopTop[3]; + 
Label L_key192_top, L_key256_top; + Label L_incCounter[3][4]; // 3: different key length, 4: 4 blocks at a time + Label L_incCounter_single[3]; //for single block, key128, key192, key256 + Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; + Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; + + Label L_exit; + const int PARALLEL_FACTOR = 4; //because of the limited register number + + // initialize counter with initial counter + __ movdqu(xmm_curr_counter, Address(counter, 0x00)); + __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); + __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase + + // key length could be only {11, 13, 15} * 4 = {44, 52, 60} + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + __ cmpl(rax, 52); + __ jcc(Assembler::equal, L_key192_top); + __ cmpl(rax, 60); + __ jcc(Assembler::equal, L_key256_top); + + //key128 begins here + __ movptr(pos, 0); // init pos before L_multiBlock_loopTop + +#define CTR_DoFour(opc, src_reg) \ + __ opc(xmm_result0, src_reg); \ + __ opc(xmm_result1, src_reg); \ + __ opc(xmm_result2, src_reg); \ + __ opc(xmm_result3, src_reg); + + // k == 0 : generate code for key_128 + // k == 1 : generate code for key_192 + // k == 2 : generate code for key_256 + for (int k = 0; k < 3; ++k) { + //multi blocks starts here + __ align(OptoLoopAlignment); + __ BIND(L_multiBlock_loopTop[k]); + __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left + __ jcc(Assembler::less, L_singleBlockLoopTop[k]); + + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); + + //load, then increase counters + CTR_DoFour(movdqa, xmm_curr_counter); + __ push(rbx); + inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]); + inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]); + inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]); + inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]); + __ pop (rbx); + + load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. 
interleaving for better performance + + CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle the counters back for PXOR + CTR_DoFour(pxor, xmm_key); //PXOR with Round 0 key + + for (int i = 1; i < rounds[k]; ++i) { + load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask); + CTR_DoFour(aesenc, xmm_key); + } + load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask); + CTR_DoFour(aesenclast, xmm_key); + + // get next PARALLEL_FACTOR blocks into xmm_from registers + __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); + __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); + __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); + + // PXOR with input text + __ pxor(xmm_result0, xmm_from0); //result0 is xmm4 + __ pxor(xmm_result1, xmm_from1); + __ pxor(xmm_result2, xmm_from2); + + // store PARALLEL_FACTOR results into the next 64 bytes of output + __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); + __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); + __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); + + // do it here after xmm_result0 is saved, because xmm_from3 reuses the same register as xmm_result0. + __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); + __ pxor(xmm_result3, xmm_from3); + __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); + + __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text + __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length + __ jmp(L_multiBlock_loopTop[k]); + + // singleBlock starts here + __ align(OptoLoopAlignment); + __ BIND(L_singleBlockLoopTop[k]); + __ cmpptr(len_reg, 0); + __ jcc(Assembler::equal, L_exit); + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); + __ movdqa(xmm_result0, xmm_curr_counter); + load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); + __ push(rbx); // rbx is used for increasing the counter + inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]); + __ pop (rbx); + __ pshufb(xmm_result0, xmm_counter_shuf_mask); + __ pxor(xmm_result0, xmm_key); + for (int i = 1; i < rounds[k]; i++) { + load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask); + __ aesenc(xmm_result0, xmm_key); + } + load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask); + __ aesenclast(xmm_result0, xmm_key); + __ cmpptr(len_reg, AESBlockSize); + __ jcc(Assembler::less, L_processTail_insr[k]); + __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); + __ pxor(xmm_result0, xmm_from0); + __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jmp(L_singleBlockLoopTop[k]); + + __ BIND(L_processTail_insr[k]); + __ addptr(pos, len_reg); + __ testptr(len_reg, 8); + __ jcc(Assembler::zero, L_processTail_4_insr[k]); + __ subptr(pos,8); + __ pinsrd(xmm_from0, Address(from, pos), 0); + __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1); + __ BIND(L_processTail_4_insr[k]); + __ testptr(len_reg, 4); + __ jcc(Assembler::zero, L_processTail_2_insr[k]); + __ subptr(pos,4); + __ pslldq(xmm_from0, 4); + __ pinsrd(xmm_from0, Address(from, pos), 0); + __ BIND(L_processTail_2_insr[k]); + __ testptr(len_reg, 2); + __ 
jcc(Assembler::zero, L_processTail_1_insr[k]); + __ subptr(pos, 2); + __ pslldq(xmm_from0, 2); + __ pinsrw(xmm_from0, Address(from, pos), 0); + __ BIND(L_processTail_1_insr[k]); + __ testptr(len_reg, 1); + __ jcc(Assembler::zero, L_processTail_exit_insr[k]); + __ subptr(pos, 1); + __ pslldq(xmm_from0, 1); + __ pinsrb(xmm_from0, Address(from, pos), 0); + __ BIND(L_processTail_exit_insr[k]); + + __ movptr(saved_encCounter_start, saved_counter_param); + __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); + __ pxor(xmm_result0, xmm_from0); + + __ testptr(len_reg, 8); + __ jcc(Assembler::zero, L_processTail_4_extr[k]); + __ pextrd(Address(to, pos), xmm_result0, 0); + __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1); + __ psrldq(xmm_result0, 8); + __ addptr(pos, 8); + __ BIND(L_processTail_4_extr[k]); + __ testptr(len_reg, 4); + __ jcc(Assembler::zero, L_processTail_2_extr[k]); + __ pextrd(Address(to, pos), xmm_result0, 0); + __ psrldq(xmm_result0, 4); + __ addptr(pos, 4); + __ BIND(L_processTail_2_extr[k]); + __ testptr(len_reg, 2); + __ jcc(Assembler::zero, L_processTail_1_extr[k]); + __ pextrb(Address(to, pos), xmm_result0, 0); + __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1); + __ psrldq(xmm_result0, 2); + __ addptr(pos, 2); + __ BIND(L_processTail_1_extr[k]); + __ testptr(len_reg, 1); + __ jcc(Assembler::zero, L_processTail_exit_extr[k]); + __ pextrb(Address(to, pos), xmm_result0, 0); + + __ BIND(L_processTail_exit_extr[k]); + __ movptr(used_addr, used_addr_param); + __ movl(Address(used_addr, 0), len_reg); + __ jmp(L_exit); + } + + __ BIND(L_exit); + __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); + __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. 
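// Editorial sketch, not part of the patch: why the counter is pshufb'ed.
// The IV/counter is big-endian in memory; the stub byte-swaps it once so
// that the increment is a plain integer add, then swaps back before use.
// A portable equivalent of one inc_counter step (GCC/Clang builtins assumed):
#include <cstdint>
#include <cstring>
static void ctr128_increment_be_sketch(unsigned char counter[16], uint64_t delta) {
  uint64_t hi_be, lo_be;
  memcpy(&hi_be, counter, 8);          // big-endian layout: high quadword first
  memcpy(&lo_be, counter + 8, 8);
  uint64_t lo = __builtin_bswap64(lo_be);
  uint64_t hi = __builtin_bswap64(hi_be);
  uint64_t sum = lo + delta;
  if (sum < lo) hi++;                  // propagate the carry, cf. inc_counter
  lo_be = __builtin_bswap64(sum);
  hi_be = __builtin_bswap64(hi);
  memcpy(counter, &hi_be, 8);
  memcpy(counter + 8, &lo_be, 8);
}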
+ __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back + handleSOERegisters(false /*restoring*/); + __ movptr(rax, len_param); // return length + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + + __ BIND (L_key192_top); + __ movptr(pos, 0); // init pos before L_multiBlock_loopTop + __ jmp(L_multiBlock_loopTop[1]); //key192 + + __ BIND (L_key256_top); + __ movptr(pos, 0); // init pos before L_multiBlock_loopTop + __ jmp(L_multiBlock_loopTop[2]); //key256 + + return start; + } + + // byte swap x86 long address generate_ghash_long_swap_mask() { __ align(CodeEntryAlignment); @@ -3181,6 +3520,11 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); } + if (UseAESCTRIntrinsics) { + StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask(); + StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); + } + // Generate GHASH intrinsics code if (UseGHASHIntrinsics) { StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp index c5811b28b..254f63392 100644 --- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp +++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp @@ -3010,6 +3010,15 @@ class StubGenerator: public StubCodeGenerator { return start; } + address generate_counter_shuffle_mask() { + __ align(16); + StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask"); + address start = __ pc(); + __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); + __ emit_data64(0x0001020304050607, relocInfo::none); + return start; + } + // Utility routine for loading a 128-bit key word in little endian format // can optionally specify that the shuffle mask is already in an xmmregister void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { @@ -3021,6 +3030,18 @@ class StubGenerator: public StubCodeGenerator { } } + // Utility routine for incrementing the 128-bit counter (the IV in CTR mode) + void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) { + __ pextrq(reg, xmmdst, 0x0); + __ addq(reg, inc_delta); + __ pinsrq(xmmdst, reg, 0x0); + __ jcc(Assembler::carryClear, next_block); // jump if no carry + __ pextrq(reg, xmmdst, 0x01); // Carry + __ addq(reg, 0x01); + __ pinsrq(xmmdst, reg, 0x01); //Carry end + __ BIND(next_block); // next instruction + } + // Arguments: // // Inputs: @@ -3639,6 +3660,320 @@ class StubGenerator: public StubCodeGenerator { return start; } + // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time + // to hide instruction latency + // + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - counter vector byte array address + // Linux + // c_rarg4 - input length + // c_rarg5 - saved encryptedCounter start + // rbp + 6 * wordSize - saved used length + // Windows + // rbp + 6 * wordSize - input length + // rbp + 7 * wordSize - saved encryptedCounter start + // rbp + 8 * wordSize - saved used length + // + // Output: + // rax - input length + // + address generate_counterMode_AESCrypt_Parallel() { + assert(UseAES, "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", 
"counterMode_AESCrypt"); + address start = __ pc(); + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register counter = c_rarg3; // counter byte array initialized from counter array address + // and left with the results of the last encryption block +#ifndef _WIN64 + const Register len_reg = c_rarg4; + const Register saved_encCounter_start = c_rarg5; + const Register used_addr = r10; + const Address used_mem(rbp, 2 * wordSize); + const Register used = r11; +#else + const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 + const Address saved_encCounter_mem(rbp, 7 * wordSize); // length is on stack on Win64 + const Address used_mem(rbp, 8 * wordSize); // length is on stack on Win64 + const Register len_reg = r10; // pick the first volatile windows register + const Register saved_encCounter_start = r11; + const Register used_addr = r13; + const Register used = r14; +#endif + const Register pos = rax; + + const int PARALLEL_FACTOR = 6; + const XMMRegister xmm_counter_shuf_mask = xmm0; + const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front + const XMMRegister xmm_curr_counter = xmm2; + + const XMMRegister xmm_key_tmp0 = xmm3; + const XMMRegister xmm_key_tmp1 = xmm4; + + // registers holding the four results in the parallelized loop + const XMMRegister xmm_result0 = xmm5; + const XMMRegister xmm_result1 = xmm6; + const XMMRegister xmm_result2 = xmm7; + const XMMRegister xmm_result3 = xmm8; + const XMMRegister xmm_result4 = xmm9; + const XMMRegister xmm_result5 = xmm10; + + const XMMRegister xmm_from0 = xmm11; + const XMMRegister xmm_from1 = xmm12; + const XMMRegister xmm_from2 = xmm13; + const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64. + const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. 
Because xmm_key_tmp0~1 are useless when loading input text + const XMMRegister xmm_from5 = xmm4; + + //for key_128, key_192, key_256 + const int rounds[3] = {10, 12, 14}; + Label L_exit_preLoop, L_preLoop_start; + Label L_multiBlock_loopTop[3]; + Label L_singleBlockLoopTop[3]; + Label L__incCounter[3][6]; //for 6 blocks + Label L__incCounter_single[3]; //for single block, key128, key192, key256 + Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; + Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; + + Label L_exit; + + __ enter(); // required for proper stackwalking of RuntimeStub frame + +#ifdef _WIN64 + // save the xmm registers which must be preserved (xmm6-xmm14) + const int XMM_REG_NUM_KEY_LAST = 14; + __ subptr(rsp, -rsp_after_call_off * wordSize); + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(xmm_save(i), as_XMMRegister(i)); + } + + const Address r13_save(rbp, rdi_off * wordSize); + const Address r14_save(rbp, rsi_off * wordSize); + + __ movptr(r13_save, r13); + __ movptr(r14_save, r14); + + // on win64, fill len_reg from stack position + __ movl(len_reg, len_mem); + __ movptr(saved_encCounter_start, saved_encCounter_mem); + __ movptr(used_addr, used_mem); + __ movl(used, Address(used_addr, 0)); +#else + __ push(len_reg); // Save + __ movptr(used_addr, used_mem); + __ movl(used, Address(used_addr, 0)); +#endif + + __ push(rbx); // Save RBX + __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter + __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); + __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled + __ movptr(pos, 0); + + // Use the partially used encrypted counter from the last invocation + __ BIND(L_preLoop_start); + __ cmpptr(used, 16); + __ jcc(Assembler::aboveEqual, L_exit_preLoop); + __ cmpptr(len_reg, 0); + __ jcc(Assembler::lessEqual, L_exit_preLoop); + __ movb(rbx, Address(saved_encCounter_start, used)); + __ xorb(rbx, Address(from, pos)); + __ movb(Address(to, pos), rbx); + __ addptr(pos, 1); + __ addptr(used, 1); + __ subptr(len_reg, 1); + + __ jmp(L_preLoop_start); + + __ BIND(L_exit_preLoop); + __ movl(Address(used_addr, 0), used); + + // key length could be only {11, 13, 15} * 4 = {44, 52, 60} + __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); + __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + __ cmpl(rbx, 52); + __ jcc(Assembler::equal, L_multiBlock_loopTop[1]); + __ cmpl(rbx, 60); + __ jcc(Assembler::equal, L_multiBlock_loopTop[2]); + +#define CTR_DoSix(opc, src_reg) \ + __ opc(xmm_result0, src_reg); \ + __ opc(xmm_result1, src_reg); \ + __ opc(xmm_result2, src_reg); \ + __ opc(xmm_result3, src_reg); \ + __ opc(xmm_result4, src_reg); \ + __ opc(xmm_result5, src_reg); + + // k == 0 : generate code for key_128 + // k == 1 : generate code for key_192 + // k == 2 : generate code for key_256 + for (int k = 0; k < 3; ++k) { + //multi blocks starts here + __ align(OptoLoopAlignment); + __ BIND(L_multiBlock_loopTop[k]); + __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left + __ jcc(Assembler::less, L_singleBlockLoopTop[k]); + load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); + + //load, then increase counters + CTR_DoSix(movdqa, 
xmm_curr_counter); + inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]); + inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]); + inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]); + inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]); + inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]); + inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]); + CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle the counters back for PXOR + CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key + + //load two ROUND_KEYs at a time + for (int i = 1; i < rounds[k]; ) { + load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask); + load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask); + CTR_DoSix(aesenc, xmm_key_tmp1); + i++; + if (i != rounds[k]) { + CTR_DoSix(aesenc, xmm_key_tmp0); + } else { + CTR_DoSix(aesenclast, xmm_key_tmp0); + } + i++; + } + + // get next PARALLEL_FACTOR blocks into xmm_from registers + __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); + __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); + __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); + __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); + __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize)); + __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize)); + + __ pxor(xmm_result0, xmm_from0); + __ pxor(xmm_result1, xmm_from1); + __ pxor(xmm_result2, xmm_from2); + __ pxor(xmm_result3, xmm_from3); + __ pxor(xmm_result4, xmm_from4); + __ pxor(xmm_result5, xmm_from5); + + // store 6 results into the next 96 bytes of output + __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); + __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); + __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); + __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); + __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4); + __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5); + + __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text + __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length + __ jmp(L_multiBlock_loopTop[k]); + + // singleBlock starts here + __ align(OptoLoopAlignment); + __ BIND(L_singleBlockLoopTop[k]); + __ cmpptr(len_reg, 0); + __ jcc(Assembler::lessEqual, L_exit); + load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); + __ movdqa(xmm_result0, xmm_curr_counter); + inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]); + __ pshufb(xmm_result0, xmm_counter_shuf_mask); + __ pxor(xmm_result0, xmm_key_tmp0); + for (int i = 1; i < rounds[k]; i++) { + load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask); + __ aesenc(xmm_result0, xmm_key_tmp0); + } + load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask); + __ aesenclast(xmm_result0, xmm_key_tmp0); + __ cmpptr(len_reg, AESBlockSize); + __ jcc(Assembler::less, L_processTail_insr[k]); + __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); + __ pxor(xmm_result0, xmm_from0); + __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); + __ addptr(pos, AESBlockSize); + __ subptr(len_reg, AESBlockSize); + __ jmp(L_singleBlockLoopTop[k]); + __ BIND(L_processTail_insr[k]); + __ addptr(pos, len_reg); + __ testptr(len_reg, 
8); + __ jcc(Assembler::zero, L_processTail_4_insr[k]); + __ subptr(pos,8); + __ pinsrq(xmm_from0, Address(from, pos), 0); + __ BIND(L_processTail_4_insr[k]); + __ testptr(len_reg, 4); + __ jcc(Assembler::zero, L_processTail_2_insr[k]); + __ subptr(pos,4); + __ pslldq(xmm_from0, 4); + __ pinsrd(xmm_from0, Address(from, pos), 0); + __ BIND(L_processTail_2_insr[k]); + __ testptr(len_reg, 2); + __ jcc(Assembler::zero, L_processTail_1_insr[k]); + __ subptr(pos, 2); + __ pslldq(xmm_from0, 2); + __ pinsrw(xmm_from0, Address(from, pos), 0); + __ BIND(L_processTail_1_insr[k]); + __ testptr(len_reg, 1); + __ jcc(Assembler::zero, L_processTail_exit_insr[k]); + __ subptr(pos, 1); + __ pslldq(xmm_from0, 1); + __ pinsrb(xmm_from0, Address(from, pos), 0); + __ BIND(L_processTail_exit_insr[k]); + + __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); + __ pxor(xmm_result0, xmm_from0); + + __ testptr(len_reg, 8); + __ jcc(Assembler::zero, L_processTail_4_extr[k]); + __ pextrq(Address(to, pos), xmm_result0, 0); + __ psrldq(xmm_result0, 8); + __ addptr(pos, 8); + __ BIND(L_processTail_4_extr[k]); + __ testptr(len_reg, 4); + __ jcc(Assembler::zero, L_processTail_2_extr[k]); + __ pextrd(Address(to, pos), xmm_result0, 0); + __ psrldq(xmm_result0, 4); + __ addptr(pos, 4); + __ BIND(L_processTail_2_extr[k]); + __ testptr(len_reg, 2); + __ jcc(Assembler::zero, L_processTail_1_extr[k]); + __ pextrw(Address(to, pos), xmm_result0, 0); + __ psrldq(xmm_result0, 2); + __ addptr(pos, 2); + __ BIND(L_processTail_1_extr[k]); + __ testptr(len_reg, 1); + __ jcc(Assembler::zero, L_processTail_exit_extr[k]); + __ pextrb(Address(to, pos), xmm_result0, 0); + + __ BIND(L_processTail_exit_extr[k]); + __ movl(Address(used_addr, 0), len_reg); + __ jmp(L_exit); + + } + + __ BIND(L_exit); + __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. + __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back + __ pop(rbx); // pop the saved RBX. 
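// Editorial sketch, not part of the patch: scalar view of the tail handling
// above, with assumed names. The final partial block (len < 16) is encrypted
// as a full keystream block, the whole block is parked in saved_encCounter
// for the next invocation, and only len bytes are XORed out; used is set to
// len so the next call's pre-loop resumes inside this block.
#include <cstddef>
#include <cstring>
static void ctr_tail_sketch(const unsigned char* from, unsigned char* to,
                            size_t len /* 1..15 */,
                            const unsigned char keystream[16],
                            unsigned char saved_encCounter[16],
                            unsigned int* used) {
  memcpy(saved_encCounter, keystream, 16);  // keep the unconsumed keystream
  for (size_t i = 0; i < len; i++) {
    to[i] = (unsigned char)(from[i] ^ keystream[i]);
  }
  *used = (unsigned int)len;                // written back through used_addr
}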
+#ifdef _WIN64 + // restore regs belonging to calling function + for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { + __ movdqu(as_XMMRegister(i), xmm_save(i)); + } + __ movl(rax, len_mem); + __ movptr(r13, r13_save); + __ movptr(r14, r14_save); +#else + __ pop(rax); // return 'len' +#endif + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(0); + return start; + } // byte swap x86 long address generate_ghash_long_swap_mask() { @@ -4239,12 +4574,15 @@ class StubGenerator: public StubCodeGenerator { // don't bother generating these AES intrinsic stubs unless global flag is set if (UseAESIntrinsics) { StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others - StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); } + if (UseAESCTRIntrinsics){ + StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask(); + StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); + } // Generate GHASH intrinsics code if (UseGHASHIntrinsics) { diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp index 9b0d8fc75..617879377 100644 --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp @@ -33,6 +33,7 @@ address StubRoutines::x86::_verify_mxcsr_entry = NULL; address StubRoutines::x86::_key_shuffle_mask_addr = NULL; +address StubRoutines::x86::_counter_shuffle_mask_addr = NULL; address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL; address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL; diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp index bb160486c..70b5a34ac 100644 --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp @@ -33,6 +33,10 @@ static address _verify_mxcsr_entry; // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers static address _key_shuffle_mask_addr; + + //shuffle mask for big-endian 128-bit integers + static address _counter_shuffle_mask_addr; + // masks and table for CRC32 static uint64_t _crc_by128_masks[]; static juint _crc_table[]; @@ -43,6 +47,7 @@ public: static address verify_mxcsr_entry() { return _verify_mxcsr_entry; } static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; } + static address counter_shuffle_mask_addr() { return _counter_shuffle_mask_addr; } static address crc_by128_masks_addr() { return (address)_crc_by128_masks; } static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; } static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; } diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp index bca5d493c..538f83e69 100644 --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp @@ -31,7 +31,7 @@ enum platform_dependent_constants { code_size1 = 9000, // simply increase if too small (assembler will crash if too small) - code_size2 = 22000 // simply increase if too small (assembler will crash if too small) + code_size2 = 25800 // simply increase if too small (assembler will 
crash if too small) }; class x86 { diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp index b048fd74e..f963cd2f8 100644 --- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp +++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp @@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _ enum platform_dependent_constants { code_size1 = 19000, // simply increase if too small (assembler will crash if too small) - code_size2 = 24000 // simply increase if too small (assembler will crash if too small) + code_size2 = 27000 // simply increase if too small (assembler will crash if too small) }; class x86 { diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp index 46b3e32ea..ce3037d76 100644 --- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp +++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp @@ -573,6 +573,28 @@ void VM_Version::get_processor_features() { } FLAG_SET_DEFAULT(UseAESIntrinsics, false); } + + // --AES-CTR begins-- + if (!UseAESIntrinsics) { + if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { + warning("AES-CTR intrinsics require UseAESIntrinsics flag to be enabled. Intrinsics will be disabled."); + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + } else { + if (supports_sse4_1() && UseSSE >= 4) { + if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, true); + } + } else { + // The AES-CTR intrinsic stubs require AES instruction support (of course) + // but also require sse4.1 mode or higher for the instructions it uses. + if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { + warning("X86 AES-CTR intrinsics require SSE4.1 instructions or higher. Intrinsics will be disabled."); + } + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + } + // --AES-CTR ends-- } } else if (UseAES || UseAESIntrinsics) { if (UseAES && !FLAG_IS_DEFAULT(UseAES)) { @@ -583,6 +605,10 @@ void VM_Version::get_processor_features() { warning("AES intrinsics are not available on this CPU"); FLAG_SET_DEFAULT(UseAESIntrinsics, false); } + if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { + warning("AES-CTR intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } } // Use CLMUL instructions if available. 
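// Editorial sketch, not part of the patch: the effective policy the flag
// plumbing above implements, condensed into one predicate (helper names are
// assumed). UseAESCTRIntrinsics defaults to true only when the AES intrinsics
// are enabled and SSE4.1 is both present and in use; an explicit +flag on an
// unsupported CPU is warned about and forced back off.
static bool aes_ctr_intrinsics_effective(bool use_aes_intrinsics,
                                         bool cpu_has_sse4_1, int use_sse,
                                         bool flag_is_default, bool flag_value) {
  if (!use_aes_intrinsics || !cpu_has_sse4_1 || use_sse < 4) {
    return false;                      // disabled, with a warning if user-set
  }
  return flag_is_default ? true : flag_value;
}
// In practice the resulting value can be checked with
//   java -XX:+PrintFlagsFinal -version | grep UseAESCTRIntrinsics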
@@ -606,6 +632,16 @@ void VM_Version::get_processor_features() { FLAG_SET_DEFAULT(UseCRC32Intrinsics, false); } + if (UseAESIntrinsics) { + if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { + UseAESCTRIntrinsics = true; + } + } else if (UseAESCTRIntrinsics) { + if (!FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) + warning("AES/CTR intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + // GHASH/GCM intrinsics if (UseCLMUL && (UseSSE > 2)) { if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { diff --git a/hotspot/src/share/vm/classfile/vmSymbols.hpp b/hotspot/src/share/vm/classfile/vmSymbols.hpp index 942d172a1..4ca2a3ad4 100644 --- a/hotspot/src/share/vm/classfile/vmSymbols.hpp +++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp @@ -846,6 +846,10 @@ do_name( decrypt_name, "implDecrypt") \ do_signature(byteArray_int_int_byteArray_int_signature, "([BII[BI)I") \ \ + do_class(com_sun_crypto_provider_counterMode, "com/sun/crypto/provider/CounterMode") \ + do_intrinsic(_counterMode_AESCrypt, com_sun_crypto_provider_counterMode, crypt_name, byteArray_int_int_byteArray_int_signature, F_R) \ + do_name( crypt_name, "implCrypt") \ + \ /* support for sun.security.provider.SHA */ \ do_class(sun_security_provider_sha, "sun/security/provider/SHA") \ do_intrinsic(_sha_implCompress, sun_security_provider_sha, implCompress_name, implCompress_signature, F_R) \ diff --git a/hotspot/src/share/vm/opto/escape.cpp b/hotspot/src/share/vm/opto/escape.cpp index 6f8ffe608..a0e497f08 100644 --- a/hotspot/src/share/vm/opto/escape.cpp +++ b/hotspot/src/share/vm/opto/escape.cpp @@ -952,6 +952,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 || strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 || strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "counterMode_AESCrypt") == 0 || strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 || strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 || strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 || diff --git a/hotspot/src/share/vm/opto/library_call.cpp b/hotspot/src/share/vm/opto/library_call.cpp index bb721f6f1..2add82dd1 100644 --- a/hotspot/src/share/vm/opto/library_call.cpp +++ b/hotspot/src/share/vm/opto/library_call.cpp @@ -196,6 +196,7 @@ class LibraryCallKit : public GraphKit { return generate_method_call(method_id, true, false); } Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static); + Node * field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls); Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2); Node* make_string_method_node(int opcode, Node* str1, Node* str2); @@ -309,7 +310,9 @@ class LibraryCallKit : public GraphKit { bool inline_reference_get(); bool inline_aescrypt_Block(vmIntrinsics::ID id); bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id); + bool inline_counterMode_AESCrypt(vmIntrinsics::ID id); Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting); + Node* inline_counterMode_AESCrypt_predicate(); Node* get_key_start_from_aescrypt_object(Node* aescrypt_object); Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object); bool inline_ghash_processBlocks(); @@ -558,6 +561,13 
@@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { predicates = 1; break; + case vmIntrinsics::_counterMode_AESCrypt: + if (!UseAESCTRIntrinsics) { + return NULL; + } + predicates = 1; + break; + case vmIntrinsics::_sha_implCompress: if (!UseSHA1Intrinsics) return NULL; break; @@ -950,6 +960,9 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: return inline_cipherBlockChaining_AESCrypt(intrinsic_id()); + case vmIntrinsics::_counterMode_AESCrypt: + return inline_counterMode_AESCrypt(intrinsic_id()); + case vmIntrinsics::_sha_implCompress: case vmIntrinsics::_sha2_implCompress: case vmIntrinsics::_sha5_implCompress: @@ -1021,6 +1034,8 @@ Node* LibraryCallKit::try_to_predicate(int predicate) { return inline_cipherBlockChaining_AESCrypt_predicate(false); case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: return inline_cipherBlockChaining_AESCrypt_predicate(true); + case vmIntrinsics::_counterMode_AESCrypt: + return inline_counterMode_AESCrypt_predicate(); case vmIntrinsics::_digestBase_implCompressMB: return inline_digestBase_implCompressMB_predicate(predicate); @@ -6581,6 +6596,39 @@ Node * LibraryCallKit::load_field_from_object(Node * fromObj, const char * field return loadedField; } +Node * LibraryCallKit::field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, + bool is_exact = true, bool is_static = false, + ciInstanceKlass * fromKls = NULL) { + if (fromKls == NULL) { + const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr(); + assert(tinst != NULL, "obj is null"); + assert(tinst->klass()->is_loaded(), "obj is not loaded"); + assert(!is_exact || tinst->klass_is_exact(), "klass not exact"); + fromKls = tinst->klass()->as_instance_klass(); + } + else { + assert(is_static, "only for static field access"); + } + ciField* field = fromKls->get_field_by_name(ciSymbol::make(fieldName), + ciSymbol::make(fieldTypeString), + is_static); + + assert(field != NULL, "undefined field"); + assert(!field->is_volatile(), "not defined for volatile fields"); + + if (is_static) { + const TypeInstPtr* tip = TypeInstPtr::make(fromKls->java_mirror()); + fromObj = makecon(tip); + } + + // Next code copied from Parse::do_get_xxx(): + + // Compute address and memory type. + int offset = field->offset_in_bytes(); + Node *adr = basic_plus_adr(fromObj, fromObj, offset); + + return adr; +} //------------------------------inline_aescrypt_Block----------------------- bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) { @@ -6747,6 +6795,90 @@ bool LibraryCallKit::inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id) { return true; } +//------------------------------inline_counterMode_AESCrypt----------------------- +bool LibraryCallKit::inline_counterMode_AESCrypt(vmIntrinsics::ID id) { + assert(UseAES, "need AES instruction support"); + if (!UseAESCTRIntrinsics) return false; + + address stubAddr = NULL; + const char *stubName = NULL; + if (id == vmIntrinsics::_counterMode_AESCrypt) { + stubAddr = StubRoutines::counterMode_AESCrypt(); + stubName = "counterMode_AESCrypt"; + } + if (stubAddr == NULL) return false; + + Node* counterMode_object = argument(0); + Node* src = argument(1); + Node* src_offset = argument(2); + Node* len = argument(3); + Node* dest = argument(4); + Node* dest_offset = argument(5); + + // (1) src and dest are arrays. 
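// Editorial sketch, not part of the patch: the shape of the leaf call the
// intrinsic emits, written as an ordinary C declaration (assumed typedef;
// the real target is generated machine code). It matches the seven-argument
// domain built by OptoRuntime::counterMode_aescrypt_Type() further below and
// returns the processed length.
typedef int (*counterMode_AESCrypt_stub_t)(const unsigned char* src,
                                           unsigned char* dest,
                                           const int* key_schedule,
                                           unsigned char* counter,
                                           int len,
                                           unsigned char* saved_encCounter,
                                           int* used);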
+ const Type* src_type = src->Value(&_gvn); + const Type* dest_type = dest->Value(&_gvn); + const TypeAryPtr* top_src = src_type->isa_aryptr(); + const TypeAryPtr* top_dest = dest_type->isa_aryptr(); + assert(top_src != NULL && top_src->klass() != NULL && + top_dest != NULL && top_dest->klass() != NULL, "args are strange"); + + // checks are the responsibility of the caller + Node* src_start = src; + Node* dest_start = dest; + if (src_offset != NULL || dest_offset != NULL) { + assert(src_offset != NULL && dest_offset != NULL, ""); + src_start = array_element_address(src, src_offset, T_BYTE); + dest_start = array_element_address(dest, dest_offset, T_BYTE); + } + + // if we are in this set of code, we "know" the embeddedCipher is an AESCrypt object + // (because of the predicated logic executed earlier). + // so we cast it here safely. + // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java + Node* embeddedCipherObj = load_field_from_object(counterMode_object, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false); + if (embeddedCipherObj == NULL) return false; + // cast it to what we know it will be at runtime + const TypeInstPtr* tinst = _gvn.type(counterMode_object)->isa_instptr(); + assert(tinst != NULL, "CTR obj is null"); + assert(tinst->klass()->is_loaded(), "CTR obj is not loaded"); + ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt")); + assert(klass_AESCrypt->is_loaded(), "predicate checks that this class is loaded"); + ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass(); + const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_AESCrypt); + const TypeOopPtr* xtype = aklass->as_instance_type(); + Node* aescrypt_object = new (C) CheckCastPPNode(control(), embeddedCipherObj, xtype); + aescrypt_object = _gvn.transform(aescrypt_object); + // we need to get the start of the aescrypt_object's expanded key array + Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object); + if (k_start == NULL) return false; + // similarly, get the start address of the r vector + Node* obj_counter = load_field_from_object(counterMode_object, "counter", "[B", /*is_exact*/ false); + if (obj_counter == NULL) return false; + Node* cnt_start = array_element_address(obj_counter, intcon(0), T_BYTE); + + Node* saved_encCounter = load_field_from_object(counterMode_object, "encryptedCounter", "[B", /*is_exact*/ false); + if (saved_encCounter == NULL) return false; + Node* saved_encCounter_start = array_element_address(saved_encCounter, intcon(0), T_BYTE); + Node* used = field_address_from_object(counterMode_object, "used", "I", /*is_exact*/ false); + + Node* ctrCrypt; + if (Matcher::pass_original_key_for_aes()) { + // no SPARC version for AES/CTR intrinsics now. 
+ return false; + } + // Call the stub, passing src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start and used + ctrCrypt = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::counterMode_aescrypt_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start, used); + + // return cipher length (int) + Node* retvalue = _gvn.transform(new (C) ProjNode(ctrCrypt, TypeFunc::Parms)); + set_result(retvalue); + return true; +} + //------------------------------get_key_start_from_aescrypt_object----------------------- Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) { #ifdef PPC64 @@ -6841,6 +6973,48 @@ Node* LibraryCallKit::inline_cipherBlockChaining_AESCrypt_predicate(bool decrypt return _gvn.transform(region); } +//----------------------------inline_counterMode_AESCrypt_predicate---------------------------- +// Return node representing slow path of predicate check. +// the pseudo code we want to emulate with this predicate is: +// for encryption: +// if (embeddedCipherObj instanceof AESCrypt) do_intrinsic, else do_javapath +// for decryption: +// if ((embeddedCipherObj instanceof AESCrypt) && (cipher!=plain)) do_intrinsic, else do_javapath +// note cipher==plain is more conservative than the original java code but that's OK +// + +Node* LibraryCallKit::inline_counterMode_AESCrypt_predicate() { + // The receiver was checked for NULL already. + Node* objCTR = argument(0); + + // Load embeddedCipher field of the CounterMode object. + Node* embeddedCipherObj = load_field_from_object(objCTR, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false); + + // get AESCrypt klass for instanceOf check + // AESCrypt might not be loaded yet if some other SymmetricCipher got us to this compile point + // will have the same classloader as the CounterMode object + const TypeInstPtr* tinst = _gvn.type(objCTR)->isa_instptr(); + assert(tinst != NULL, "CTRobj is null"); + assert(tinst->klass()->is_loaded(), "CTRobj is not loaded"); + + // we want to do an instanceof comparison against the AESCrypt class + ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt")); + if (!klass_AESCrypt->is_loaded()) { + // if AESCrypt is not even loaded, we never take the intrinsic fast path + Node* ctrl = control(); + set_control(top()); // no regular fast path + return ctrl; + } + + ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass(); + Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt))); + Node* cmp_instof = _gvn.transform(new (C) CmpINode(instof, intcon(1))); + Node* bool_instof = _gvn.transform(new (C) BoolNode(cmp_instof, BoolTest::ne)); + Node* instof_false = generate_guard(bool_instof, NULL, PROB_MIN); + + return instof_false; // even if it is NULL +} + //------------------------------inline_ghash_processBlocks bool LibraryCallKit::inline_ghash_processBlocks() { address stubAddr; diff --git a/hotspot/src/share/vm/opto/runtime.cpp b/hotspot/src/share/vm/opto/runtime.cpp index 0a86211ba..1c51be19b 100644 --- a/hotspot/src/share/vm/opto/runtime.cpp +++ b/hotspot/src/share/vm/opto/runtime.cpp @@ -1021,6 +1021,35 @@ const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { return TypeFunc::make(domain, range); } +// for counterMode calls of aescrypt encrypt/decrypt, six pointers and a length, returning int +const TypeFunc* OptoRuntime::counterMode_aescrypt_Type() { + // create input 
type (domain) + int num_args = 7; + if (Matcher::pass_original_key_for_aes()) { + num_args = 8; + } + int argcnt = num_args; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // src + fields[argp++] = TypePtr::NOTNULL; // dest + fields[argp++] = TypePtr::NOTNULL; // k array + fields[argp++] = TypePtr::NOTNULL; // counter array + fields[argp++] = TypeInt::INT; // src len + fields[argp++] = TypePtr::NOTNULL; // saved_encCounter + fields[argp++] = TypePtr::NOTNULL; // saved used addr + if (Matcher::pass_original_key_for_aes()) { + fields[argp++] = TypePtr::NOTNULL; // original k array + } + assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields); + // returning cipher len (int) + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms + 0] = TypeInt::INT; + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields); + return TypeFunc::make(domain, range); +} + /* * void implCompress(byte[] buf, int ofs) */ diff --git a/hotspot/src/share/vm/opto/runtime.hpp b/hotspot/src/share/vm/opto/runtime.hpp index 47133d58c..f27e7d507 100644 --- a/hotspot/src/share/vm/opto/runtime.hpp +++ b/hotspot/src/share/vm/opto/runtime.hpp @@ -299,6 +299,7 @@ private: static const TypeFunc* aescrypt_block_Type(); static const TypeFunc* cipherBlockChaining_aescrypt_Type(); + static const TypeFunc* counterMode_aescrypt_Type(); static const TypeFunc* sha_implCompress_Type(); static const TypeFunc* digestBase_implCompressMB_Type(); diff --git a/hotspot/src/share/vm/runtime/globals.hpp b/hotspot/src/share/vm/runtime/globals.hpp index 65dfcf69b..91e52f033 100644 --- a/hotspot/src/share/vm/runtime/globals.hpp +++ b/hotspot/src/share/vm/runtime/globals.hpp @@ -734,6 +734,9 @@ class CommandLineFlags { product(bool, UseAESIntrinsics, false, \ "Use intrinsics for AES versions of crypto") \ \ + product(bool, UseAESCTRIntrinsics, false, \ + "Use intrinsics for the parallelized version of AES/CTR crypto") \ + \ product(bool, UseSHA1Intrinsics, false, \ "Use intrinsics for SHA-1 crypto hash function") \ \ diff --git a/hotspot/src/share/vm/runtime/stubRoutines.cpp b/hotspot/src/share/vm/runtime/stubRoutines.cpp index f2106d13a..d66237137 100644 --- a/hotspot/src/share/vm/runtime/stubRoutines.cpp +++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp @@ -124,6 +124,7 @@ address StubRoutines::_aescrypt_encryptBlock = NULL; address StubRoutines::_aescrypt_decryptBlock = NULL; address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL; address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL; +address StubRoutines::_counterMode_AESCrypt = NULL; address StubRoutines::_ghash_processBlocks = NULL; address StubRoutines::_sha1_implCompress = NULL; diff --git a/hotspot/src/share/vm/runtime/stubRoutines.hpp b/hotspot/src/share/vm/runtime/stubRoutines.hpp index 16075d9f4..9fb589540 100644 --- a/hotspot/src/share/vm/runtime/stubRoutines.hpp +++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp @@ -202,6 +202,7 @@ class StubRoutines: AllStatic { static address _aescrypt_decryptBlock; static address _cipherBlockChaining_encryptAESCrypt; static address _cipherBlockChaining_decryptAESCrypt; + static address _counterMode_AESCrypt; static address _ghash_processBlocks; static address _sha1_implCompress; @@ -370,6 +371,7 @@ class StubRoutines: AllStatic { static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; } static address cipherBlockChaining_encryptAESCrypt() { 
return _cipherBlockChaining_encryptAESCrypt; } static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; } + static address counterMode_AESCrypt() { return _counterMode_AESCrypt; } static address ghash_processBlocks() { return _ghash_processBlocks; } static address sha1_implCompress() { return _sha1_implCompress; } diff --git a/hotspot/src/share/vm/runtime/vmStructs.cpp b/hotspot/src/share/vm/runtime/vmStructs.cpp index 3f2bfeb74..842b5840d 100644 --- a/hotspot/src/share/vm/runtime/vmStructs.cpp +++ b/hotspot/src/share/vm/runtime/vmStructs.cpp @@ -815,6 +815,7 @@ typedef TwoOopHashtable<Symbol*, mtClass> SymbolTwoOopHashtable; static_field(StubRoutines, _aescrypt_decryptBlock, address) \ static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \ static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \ + static_field(StubRoutines, _counterMode_AESCrypt, address) \ static_field(StubRoutines, _ghash_processBlocks, address) \ static_field(StubRoutines, _updateBytesCRC32, address) \ static_field(StubRoutines, _crc_table_adr, address) \ diff --git a/hotspot/test/compiler/7184394/TestAESBase.java b/hotspot/test/compiler/7184394/TestAESBase.java index 5c3e6881e..afda2a1f7 100644 --- a/hotspot/test/compiler/7184394/TestAESBase.java +++ b/hotspot/test/compiler/7184394/TestAESBase.java @@ -106,8 +106,8 @@ abstract public class TestAESBase { cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); - // CBC init - if (mode.equals("CBC")) { + // CBC or CTR init + if (mode.equals("CBC") || mode.equals("CTR")) { IvParameterSpec initVector = new IvParameterSpec(iv); cipher.init(Cipher.ENCRYPT_MODE, key, initVector); algParams = cipher.getParameters(); diff --git a/hotspot/test/compiler/7184394/TestAESMain.java b/hotspot/test/compiler/7184394/TestAESMain.java index ddd8eeaef..65949420a 100644 --- a/hotspot/test/compiler/7184394/TestAESMain.java +++ b/hotspot/test/compiler/7184394/TestAESMain.java @@ -48,6 +48,13 @@ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DdecOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain * * @author Tom Deneau */ diff --git a/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java 
b/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java index aea9336c9..c2bd38a71 100644 --- a/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java +++ b/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java @@ -39,10 +39,10 @@ import java.security.InvalidKeyException; * @author Andreas Sterbenz * @since 1.4.2 */ -final class CounterMode extends FeedbackCipher { +class CounterMode extends FeedbackCipher { // current counter value - private final byte[] counter; + final byte[] counter; // encrypted bytes of the previous counter value private final byte[] encryptedCounter; @@ -137,7 +137,7 @@ final class CounterMode extends FeedbackCipher { * <code>cipherOffset</code>. * * @param in the buffer with the input data to be encrypted - * @param inOffset the offset in <code>plain</code> + * @param inOff the offset in <code>plain</code> * @param len the length of the input data * @param out the buffer for the result * @param outOff the offset in <code>cipher</code> @@ -176,6 +176,11 @@ final class CounterMode extends FeedbackCipher { RangeUtil.nullAndBoundsCheck(in, inOff, len); RangeUtil.nullAndBoundsCheck(out, outOff, len); + return implCrypt(in, inOff, len, out, outOff); + } + + // Implementation of the crypt() method. Possibly replaced with a compiler intrinsic. + private int implCrypt(byte[] in, int inOff, int len, byte[] out, int outOff) { int result = len; while (len-- > 0) { if (used >= blockSize) { diff --git a/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java b/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java index f8a3eaa0a..6a394e448 100644 --- a/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java +++ b/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2017 Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -29,52 +29,43 @@ package com.sun.crypto.provider; -import java.security.*; -import javax.crypto.*; +import javax.crypto.IllegalBlockSizeException; import static com.sun.crypto.provider.AESConstants.AES_BLOCK_SIZE; /** * This class represents the GCTR function defined in NIST 800-38D - * under section 6.5. It needs to be constructed w/ an initialized - * cipher object, and initial counter block(ICB). Given an input X - * of arbitrary length, it processes and returns an output which has - * the same length as X. The invariants of this class are: - * - * (1) The length of intialCounterBlk (and also of its clones, e.g., - * fields counter and counterSave) is equal to AES_BLOCK_SIZE. - * - * (2) After construction, the field counter never becomes null, it - * always contains a byte array of length AES_BLOCK_SIZE. + * under section 6.5. With a given cipher object and initial counter + * block, a counter mode operation is performed. Blocksize is limited + * to 16 bytes. * * If any invariant is broken, failures can occur because the * AESCrypt.encryptBlock method can be intrinsified on the HotSpot VM * (see JDK-8067648 for details). * + * The counter mode operations can be intrinsified and parallelized + * by using CounterMode.implCrypt() if HotSpot VM supports it on the + * architecture. + * * <p>This function is used in the implementation of GCM mode. 
* * @since 1.8 */ -final class GCTR { - - // these fields should not change after the object has been constructed - private final SymmetricCipher aes; - private final byte[] icb; - - // the current counter value - private byte[] counter; +final class GCTR extends CounterMode { - // needed for save/restore calls - private byte[] counterSave = null; - - // NOTE: cipher should already be initialized GCTR(SymmetricCipher cipher, byte[] initialCounterBlk) { - this.aes = cipher; + super(cipher); if (initialCounterBlk.length != AES_BLOCK_SIZE) { throw new RuntimeException("length of initial counter block (" + initialCounterBlk.length + ") not equal to AES_BLOCK_SIZE (" + AES_BLOCK_SIZE + ")"); } - this.icb = initialCounterBlk; - this.counter = icb.clone(); + + iv = initialCounterBlk; + reset(); + } + + @Override + String getFeedback() { + return "GCTR"; } // input must be multiples of 128-bit blocks when calling update @@ -89,23 +80,11 @@ final class GCTR { throw new RuntimeException("output buffer too small"); } - byte[] encryptedCntr = new byte[AES_BLOCK_SIZE]; - - int numOfCompleteBlocks = inLen / AES_BLOCK_SIZE; - for (int i = 0; i < numOfCompleteBlocks; i++) { - aes.encryptBlock(counter, 0, encryptedCntr, 0); - for (int n = 0; n < AES_BLOCK_SIZE; n++) { - int index = (i * AES_BLOCK_SIZE + n); - out[outOfs + index] = - (byte) ((in[inOfs + index] ^ encryptedCntr[n])); - } - GaloisCounterMode.increment32(counter); - } - return inLen; + return encrypt(in, inOfs, inLen, out, outOfs); } // input can be arbitrary size when calling doFinal - protected int doFinal(byte[] in, int inOfs, int inLen, byte[] out, + int doFinal(byte[] in, int inOfs, int inLen, byte[] out, int outOfs) throws IllegalBlockSizeException { try { if (inLen < 0) { @@ -118,7 +97,7 @@ final class GCTR { if (lastBlockSize != 0) { // do the last partial block byte[] encryptedCntr = new byte[AES_BLOCK_SIZE]; - aes.encryptBlock(counter, 0, encryptedCntr, 0); + embeddedCipher.encryptBlock(counter, 0, encryptedCntr, 0); for (int n = 0; n < lastBlockSize; n++) { out[outOfs + completeBlkLen + n] = (byte) ((in[inOfs + completeBlkLen + n] ^ @@ -131,28 +110,4 @@ final class GCTR { } return inLen; } - - /** - * Resets the content of this object to when it's first constructed. - */ - void reset() { - System.arraycopy(icb, 0, counter, 0, icb.length); - counterSave = null; - } - - /** - * Save the current content of this object. - */ - void save() { - this.counterSave = this.counter.clone(); - } - - /** - * Restores the content of this object to the previous saved one. 
- */ - void restore() { - if (this.counterSave != null) { - this.counter = this.counterSave; - } - } } diff --git a/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java b/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java index dc42e6bbf..78f0723d7 100644 --- a/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java +++ b/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java @@ -122,10 +122,10 @@ final class GHASH { } - /* subkeyH and state are stored in long[] for GHASH intrinsic use */ + /* subkeyHtbl and state are stored in long[] for GHASH intrinsic use */ - // hash subkey H; should not change after the object has been constructed - private final long[] subkeyH; + // hashtable subkeyHtbl; holds 2*9 powers of subkeyH computed using carry-less multiplication + private long[] subkeyHtbl; // buffer for storing hash private final long[] state; @@ -147,9 +147,9 @@ final class GHASH { throw new ProviderException("Internal error"); } state = new long[2]; - this.subkeyH = new long[2]; - this.subkeyH[0] = getLong(subkeyH, 0); - this.subkeyH[1] = getLong(subkeyH, 8); + subkeyHtbl = new long[2*9]; + subkeyHtbl[0] = getLong(subkeyH, 0); + subkeyHtbl[1] = getLong(subkeyH, 8); } /** @@ -192,8 +192,8 @@ final class GHASH { if (inLen == 0) { return; } - ghashRangeCheck(in, inOfs, inLen, state, subkeyH); - processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyH); + ghashRangeCheck(in, inOfs, inLen, state, subkeyHtbl); + processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyHtbl); } private static void ghashRangeCheck(byte[] in, int inOfs, int inLen, long[] st, long[] subH) { @@ -217,8 +217,8 @@ final class GHASH { throw new RuntimeException("internal state has invalid length: " + st.length); } - if (subH.length != 2) { - throw new RuntimeException("internal subkeyH has invalid length: " + + if (subH.length != 18) { + throw new RuntimeException("internal subkeyHtbl has invalid length: " + subH.length); } } diff --git a/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java b/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java index ab93e3097..dd2618455 100644 --- a/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java +++ b/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java @@ -439,6 +439,8 @@ public final class SSLSocketImpl if (!conContext.isNegotiated) { readHandshakeRecord(); } + } catch (InterruptedIOException iioe) { + handleException(iioe); } catch (IOException ioe) { throw conContext.fatal(Alert.HANDSHAKE_FAILURE, "Couldn't kickstart handshaking", ioe); @@ -1309,12 +1311,11 @@ public final class SSLSocketImpl } } catch (SSLException ssle) { throw ssle; + } catch (InterruptedIOException iioe) { + // don't change exception in case of timeouts or interrupts + throw iioe; } catch (IOException ioe) { - if (!(ioe instanceof SSLException)) { - throw new SSLException("readHandshakeRecord", ioe); - } else { - throw ioe; - } + throw new SSLException("readHandshakeRecord", ioe); } } @@ -1375,6 +1376,9 @@ public final class SSLSocketImpl } } catch (SSLException ssle) { throw ssle; + } catch (InterruptedIOException iioe) { + // don't change exception in case of timeouts or interrupts + throw iioe; } catch (IOException ioe) { if (!(ioe instanceof SSLException)) { throw new SSLException("readApplicationRecord", ioe); diff --git a/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java b/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java index 401822759..ab5712acc 100644 --- a/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java +++ 
b/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java @@ -26,6 +26,7 @@ package sun.security.ssl; import java.io.EOFException; +import java.io.InterruptedIOException; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -47,37 +48,31 @@ import sun.security.ssl.SSLCipher.SSLReadCipher; final class SSLSocketInputRecord extends InputRecord implements SSLRecord { private InputStream is = null; private OutputStream os = null; - private final byte[] temporary = new byte[1024]; + private final byte[] header = new byte[headerSize]; + private int headerOff = 0; + // Cache for incomplete record body. + private ByteBuffer recordBody = ByteBuffer.allocate(1024); private boolean formatVerified = false; // SSLv2 ruled out? // Cache for incomplete handshake messages. private ByteBuffer handshakeBuffer = null; - private boolean hasHeader = false; // Had read the record header - SSLSocketInputRecord(HandshakeHash handshakeHash) { super(handshakeHash, SSLReadCipher.nullTlsReadCipher()); } @Override int bytesInCompletePacket() throws IOException { - if (!hasHeader) { - // read exactly one record - try { - int really = read(is, temporary, 0, headerSize); - if (really < 0) { - // EOF: peer shut down incorrectly - return -1; - } - } catch (EOFException eofe) { - // The caller will handle EOF. - return -1; - } - hasHeader = true; + // read header + try { + readHeader(); + } catch (EOFException eofe) { + // The caller will handle EOF. + return -1; } - byte byteZero = temporary[0]; + byte byteZero = header[0]; int len = 0; /* @@ -93,9 +88,9 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { * Last sanity check that it's not a wild record */ if (!ProtocolVersion.isNegotiable( - temporary[1], temporary[2], false)) { + header[1], header[2], false)) { throw new SSLException("Unrecognized record version " + - ProtocolVersion.nameOf(temporary[1], temporary[2]) + + ProtocolVersion.nameOf(header[1], header[2]) + " , plaintext connection?"); } @@ -109,8 +104,8 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { /* * One of the SSLv3/TLS message types. */ - len = ((temporary[3] & 0xFF) << 8) + - (temporary[4] & 0xFF) + headerSize; + len = ((header[3] & 0xFF) << 8) + + (header[4] & 0xFF) + headerSize; } else { /* * Must be SSLv2 or something unknown. @@ -121,11 +116,11 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { */ boolean isShort = ((byteZero & 0x80) != 0); - if (isShort && ((temporary[2] == 1) || (temporary[2] == 4))) { + if (isShort && ((header[2] == 1) || (header[2] == 4))) { if (!ProtocolVersion.isNegotiable( - temporary[3], temporary[4], false)) { + header[3], header[4], false)) { throw new SSLException("Unrecognized record version " + - ProtocolVersion.nameOf(temporary[3], temporary[4]) + + ProtocolVersion.nameOf(header[3], header[4]) + " , plaintext connection?"); } @@ -138,9 +133,9 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { // // int mask = (isShort ? 0x7F : 0x3F); // len = ((byteZero & mask) << 8) + - // (temporary[1] & 0xFF) + (isShort ? 2 : 3); + // (header[1] & 0xFF) + (isShort ? 2 : 3); // - len = ((byteZero & 0x7F) << 8) + (temporary[1] & 0xFF) + 2; + len = ((byteZero & 0x7F) << 8) + (header[1] & 0xFF) + 2; } else { // Gobblygook! 
throw new SSLException( @@ -160,34 +155,41 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { return null; } - if (!hasHeader) { - // read exactly one record - int really = read(is, temporary, 0, headerSize); - if (really < 0) { - throw new EOFException("SSL peer shut down incorrectly"); - } - hasHeader = true; - } + // read header + readHeader(); - Plaintext plaintext = null; - if (!formatVerified) { - formatVerified = true; + Plaintext[] plaintext = null; + boolean cleanInBuffer = true; + try { + if (!formatVerified) { + formatVerified = true; - /* - * The first record must either be a handshake record or an - * alert message. If it's not, it is either invalid or an - * SSLv2 message. - */ - if ((temporary[0] != ContentType.HANDSHAKE.id) && - (temporary[0] != ContentType.ALERT.id)) { - hasHeader = false; - return handleUnknownRecord(temporary); + /* + * The first record must either be a handshake record or an + * alert message. If it's not, it is either invalid or an + * SSLv2 message. + */ + if ((header[0] != ContentType.HANDSHAKE.id) && + (header[0] != ContentType.ALERT.id)) { + plaintext = handleUnknownRecord(); + } } - } - // The record header should has consumed. - hasHeader = false; - return decodeInputRecord(temporary); + // The record header should have been consumed. + if (plaintext == null) { + plaintext = decodeInputRecord(); + } + } catch (InterruptedIOException e) { + // do not clean header and recordBody in case of Socket Timeout + cleanInBuffer = false; + throw e; + } finally { + if (cleanInBuffer) { + headerOff = 0; + recordBody.clear(); + } + } + return plaintext; } @Override @@ -200,9 +202,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { this.os = outputStream; } - // Note that destination may be null - private Plaintext[] decodeInputRecord( - byte[] header) throws IOException, BadPaddingException { + private Plaintext[] decodeInputRecord() throws IOException, BadPaddingException { byte contentType = header[0]; // pos: 0 byte majorVersion = header[1]; // pos: 1 byte minorVersion = header[2]; // pos: 2 @@ -227,30 +227,27 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { } // - // Read a complete record.
+ // Read a complete record and store in the recordBody + // recordBody is used to cache the incoming record and restore it in + // case a read operation times out // - ByteBuffer destination = ByteBuffer.allocate(headerSize + contentLen); - int dstPos = destination.position(); - destination.put(temporary, 0, headerSize); - while (contentLen > 0) { - int howmuch = Math.min(temporary.length, contentLen); - int really = read(is, temporary, 0, howmuch); - if (really < 0) { - throw new EOFException("SSL peer shut down incorrectly"); + if (recordBody.position() == 0) { + if (recordBody.capacity() < contentLen) { + recordBody = ByteBuffer.allocate(contentLen); } - - destination.put(temporary, 0, howmuch); - contentLen -= howmuch; + recordBody.limit(contentLen); + } else { + contentLen = recordBody.remaining(); } - destination.flip(); - destination.position(dstPos + headerSize); + readFully(contentLen); + recordBody.flip(); if (SSLLogger.isOn && SSLLogger.isOn("record")) { SSLLogger.fine( "READ: " + ProtocolVersion.nameOf(majorVersion, minorVersion) + " " + ContentType.nameOf(contentType) + ", length = " + - destination.remaining()); + recordBody.remaining()); } // @@ -259,7 +256,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { ByteBuffer fragment; try { Plaintext plaintext = - readCipher.decrypt(contentType, destination, null); + readCipher.decrypt(contentType, recordBody, null); fragment = plaintext.fragment; contentType = plaintext.contentType; } catch (BadPaddingException bpe) { @@ -368,8 +365,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { }; } - private Plaintext[] handleUnknownRecord( - byte[] header) throws IOException, BadPaddingException { + private Plaintext[] handleUnknownRecord() throws IOException, BadPaddingException { byte firstByte = header[0]; byte thirdByte = header[2]; @@ -411,32 +407,29 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { } int msgLen = ((header[0] & 0x7F) << 8) | (header[1] & 0xFF); - - ByteBuffer destination = ByteBuffer.allocate(headerSize + msgLen); - destination.put(temporary, 0, headerSize); - msgLen -= 3; // had read 3 bytes of content as header - while (msgLen > 0) { - int howmuch = Math.min(temporary.length, msgLen); - int really = read(is, temporary, 0, howmuch); - if (really < 0) { - throw new EOFException("SSL peer shut down incorrectly"); + if (recordBody.position() == 0) { + if (recordBody.capacity() < (headerSize + msgLen)) { + recordBody = ByteBuffer.allocate(headerSize + msgLen); } - - destination.put(temporary, 0, howmuch); - msgLen -= howmuch; + recordBody.limit(headerSize + msgLen); + recordBody.put(header, 0, headerSize); + } else { + msgLen = recordBody.remaining(); } - destination.flip(); + msgLen -= 3; // had read 3 bytes of content as header + readFully(msgLen); + recordBody.flip(); /* * If we can map this into a V3 ClientHello, read and * hash the rest of the V2 handshake, turn it into a * V3 ClientHello message, and pass it up.
*/ - destination.position(2); // exclude the header - handshakeHash.receive(destination); - destination.position(0); + recordBody.position(2); // exclude the header + handshakeHash.receive(recordBody); + recordBody.position(0); - ByteBuffer converted = convertToClientHello(destination); + ByteBuffer converted = convertToClientHello(recordBody); if (SSLLogger.isOn && SSLLogger.isOn("packet")) { SSLLogger.fine( @@ -456,28 +449,42 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { } } - // Read the exact bytes of data, otherwise, return -1. - private static int read(InputStream is, - byte[] buffer, int offset, int len) throws IOException { - int n = 0; - while (n < len) { - int readLen = is.read(buffer, offset + n, len - n); - if (readLen < 0) { - if (SSLLogger.isOn && SSLLogger.isOn("packet")) { - SSLLogger.fine("Raw read: EOF"); - } - return -1; + // Read the exact bytes of data, otherwise, throw IOException. + private int readFully(int len) throws IOException { + int end = len + recordBody.position(); + int off = recordBody.position(); + try { + while (off < end) { + off += read(is, recordBody.array(), off, end - off); } + } finally { + recordBody.position(off); + } + return len; + } + + // Read SSL record header, otherwise, throw IOException. + private int readHeader() throws IOException { + while (headerOff < headerSize) { + headerOff += read(is, header, headerOff, headerSize - headerOff); + } + return headerSize; + } + private static int read(InputStream is, byte[] buf, int off, int len) throws IOException { + int readLen = is.read(buf, off, len); + if (readLen < 0) { if (SSLLogger.isOn && SSLLogger.isOn("packet")) { - ByteBuffer bb = ByteBuffer.wrap(buffer, offset + n, readLen); - SSLLogger.fine("Raw read", bb); + SSLLogger.fine("Raw read: EOF"); } - - n += readLen; + throw new EOFException("SSL peer shut down incorrectly"); } - return n; + if (SSLLogger.isOn && SSLLogger.isOn("packet")) { + ByteBuffer bb = ByteBuffer.wrap(buf, off, readLen); + SSLLogger.fine("Raw read", bb); + } + return readLen; } // Try to use up the input stream without impact the performance too much. diff --git a/jdk/src/share/classes/sun/security/ssl/SSLTransport.java b/jdk/src/share/classes/sun/security/ssl/SSLTransport.java index b3d03b370..78e13ea2c 100644 --- a/jdk/src/share/classes/sun/security/ssl/SSLTransport.java +++ b/jdk/src/share/classes/sun/security/ssl/SSLTransport.java @@ -27,6 +27,7 @@ package sun.security.ssl; import java.io.EOFException; import java.io.IOException; +import java.io.InterruptedIOException; import java.nio.ByteBuffer; import javax.crypto.AEADBadTagException; import javax.crypto.BadPaddingException; @@ -134,6 +135,9 @@ interface SSLTransport { } catch (EOFException eofe) { // rethrow EOFException, the call will handle it if neede. throw eofe; + } catch (InterruptedIOException iioe) { + // don't close the Socket in case of timeouts or interrupts. + throw iioe; } catch (IOException ioe) { throw context.fatal(Alert.UNEXPECTED_MESSAGE, ioe); } diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java new file mode 100644 index 000000000..258672f59 --- /dev/null +++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.javax.crypto.full; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Setup; + +import javax.crypto.Cipher; +import javax.crypto.spec.GCMParameterSpec; +import javax.crypto.spec.SecretKeySpec; + +/** + * This performance test runs AES/GCM encryption and decryption using byte[] + * as input and output buffers for single and multi-part testing. + * + * This test rotates the IV and creates a new GCMParameterSpec for each encrypt + * benchmark operation. + */ + +public class AESGCMBench extends CryptoBase { + + @Param({"128"}) + private int keyLength; + + @Param({"1024", "1500", "4096", "16384"}) + private int dataSize; + + byte[] encryptedData; + byte[] in, out; + private Cipher encryptCipher; + private Cipher decryptCipher; + SecretKeySpec ks; + GCMParameterSpec gcm_spec; + byte[] iv; + + private static final int IV_BUFFER_SIZE = 32; + private static final int IV_MODULO = IV_BUFFER_SIZE - 16; + int iv_index = 0; + int updateLen = 0; + + private int next_iv_index() { + int r = iv_index; + iv_index = (iv_index + 1) % IV_MODULO; + return r; + } + + @Setup + public void setup() throws Exception { + setupProvider(); + + // Setup key material + byte[] keystring = fillSecureRandom(new byte[keyLength / 8]); + ks = new SecretKeySpec(keystring, "AES"); + iv = fillSecureRandom(new byte[IV_BUFFER_SIZE]); + gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); + + // Setup Cipher classes + encryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); + encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); + decryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); + decryptCipher.init(Cipher.DECRYPT_MODE, ks, + encryptCipher.getParameters().
+ getParameterSpec(GCMParameterSpec.class)); + + // Setup input/output buffers + in = fillRandom(new byte[dataSize]); + encryptedData = new byte[encryptCipher.getOutputSize(in.length)]; + out = new byte[encryptedData.length]; + encryptCipher.doFinal(in, 0, in.length, encryptedData, 0); + updateLen = in.length / 2; + + } + + @Benchmark + public void encrypt() throws Exception { + gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); + encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); + encryptCipher.doFinal(in, 0, in.length, out, 0); + } + + @Benchmark + public void encryptMultiPart() throws Exception { + gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); + encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); + int outOfs = encryptCipher.update(in, 0, updateLen, out, 0); + encryptCipher.doFinal(in, updateLen, in.length - updateLen, + out, outOfs); + } + + @Benchmark + public void decrypt() throws Exception { + decryptCipher.init(Cipher.DECRYPT_MODE, ks, + encryptCipher.getParameters(). + getParameterSpec(GCMParameterSpec.class)); + decryptCipher.doFinal(encryptedData, 0, encryptedData.length, out, 0); + } + + @Benchmark + public void decryptMultiPart() throws Exception { + decryptCipher.init(Cipher.DECRYPT_MODE, ks, + encryptCipher.getParameters(). + getParameterSpec(GCMParameterSpec.class)); + decryptCipher.update(encryptedData, 0, updateLen, out, 0); + decryptCipher.doFinal(encryptedData, updateLen, + encryptedData.length - updateLen, out, 0); + } +} \ No newline at end of file diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java new file mode 100644 index 000000000..cb6d20c51 --- /dev/null +++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package org.openjdk.bench.javax.crypto.full; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Setup; + +import javax.crypto.Cipher; +import javax.crypto.spec.GCMParameterSpec; +import javax.crypto.spec.SecretKeySpec; +import java.nio.ByteBuffer; + +/** + * This performance test runs AES/GCM encryption and decryption using heap and + * direct ByteBuffers as input and output buffers for single and multi-part + * operations. + * + * This test rotates the IV and creates a new GCMParameterSpec for each encrypt + * benchmark operation. + */ + +public class AESGCMByteBuffer extends CryptoBase { + + @Param({"128"}) + private int keyLength; + + @Param({"1024", "1500", "4096", "16384"}) + private int dataSize; + + @Param({"direct", "heap"}) + private String dataMethod; + + byte[] data; + ByteBuffer encryptedData; + ByteBuffer in, out; + private Cipher encryptCipher; + private Cipher decryptCipher; + SecretKeySpec ks; + GCMParameterSpec gcm_spec; + byte[] iv; + + private static final int IV_BUFFER_SIZE = 32; + private static final int IV_MODULO = IV_BUFFER_SIZE - 16; + int iv_index = 0; + int updateLen = 0; + + private int next_iv_index() { + int r = iv_index; + iv_index = (iv_index + 1) % IV_MODULO; + return r; + } + + @Setup + public void setup() throws Exception { + setupProvider(); + + // Setup key material + byte[] keystring = fillSecureRandom(new byte[keyLength / 8]); + ks = new SecretKeySpec(keystring, "AES"); + iv = fillSecureRandom(new byte[IV_BUFFER_SIZE]); + gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); + + // Setup Cipher classes + encryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); + encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); + decryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); + decryptCipher.init(Cipher.DECRYPT_MODE, ks, + encryptCipher.getParameters(). + getParameterSpec(GCMParameterSpec.class)); + + // Setup input/output buffers + data = fillRandom(new byte[dataSize]); + if (dataMethod.equalsIgnoreCase("direct")) { + in = ByteBuffer.allocateDirect(data.length); + in.put(data); + in.flip(); + encryptedData = ByteBuffer.allocateDirect( + encryptCipher.getOutputSize(data.length)); + out = ByteBuffer.allocateDirect(encryptedData.capacity()); + } else if (dataMethod.equalsIgnoreCase("heap")) { + in = ByteBuffer.wrap(data); + encryptedData = ByteBuffer.allocate( + encryptCipher.getOutputSize(data.length)); + out = ByteBuffer.allocate(encryptedData.capacity()); + } + + encryptCipher.doFinal(in, encryptedData); + encryptedData.flip(); + in.flip(); + updateLen = in.remaining() / 2; + } + + @Benchmark + public void encrypt() throws Exception { + gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); + encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); + encryptCipher.doFinal(in, out); + out.flip(); + in.flip(); + } + + @Benchmark + public void encryptMultiPart() throws Exception { + gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); + encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); + in.limit(updateLen); + encryptCipher.update(in, out); + in.limit(in.capacity()); + encryptCipher.doFinal(in, out); + out.flip(); + in.flip(); + } + + @Benchmark + public void decrypt() throws Exception { + decryptCipher.init(Cipher.DECRYPT_MODE, ks, + encryptCipher.getParameters().
+ getParameterSpec(GCMParameterSpec.class)); + decryptCipher.doFinal(encryptedData, out); + encryptedData.flip(); + out.flip(); + } + + @Benchmark + public void decryptMultiPart() throws Exception { + decryptCipher.init(Cipher.DECRYPT_MODE, ks, + encryptCipher.getParameters(). + getParameterSpec(GCMParameterSpec.class)); + + int len = encryptedData.remaining(); + encryptedData.limit(updateLen); + decryptCipher.update(encryptedData, out); + encryptedData.limit(len); + + decryptCipher.doFinal(encryptedData, out); + encryptedData.flip(); + out.flip(); + } + +} \ No newline at end of file diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java new file mode 100644 index 000000000..4af12703b --- /dev/null +++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package org.openjdk.bench.javax.crypto.full; + +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + +import javax.crypto.BadPaddingException; +import javax.crypto.Cipher; +import javax.crypto.IllegalBlockSizeException; +import javax.crypto.NoSuchPaddingException; +import java.security.NoSuchAlgorithmException; +import java.security.Provider; +import java.security.SecureRandom; +import java.security.Security; +import java.util.Random; +import java.util.concurrent.TimeUnit; + + +@Fork(jvmArgsAppend = {"-XX:+AlwaysPreTouch"}, value = 5) +@Warmup(iterations = 3, time = 3) +@Measurement(iterations = 8, time = 2) +@OutputTimeUnit(TimeUnit.SECONDS) +@State(Scope.Thread) +@BenchmarkMode(Mode.Throughput) +public class CryptoBase { + + @Param({""}) + private String provider; + + public Provider prov = null; + + @Setup + public void setupProvider() { + if (provider != null && !provider.isEmpty()) { + prov = Security.getProvider(provider); + if (prov == null) { + throw new RuntimeException("Can't find provider \"" + provider + "\""); + } + } + } + + public static Cipher makeCipher(Provider prov, String algorithm) throws NoSuchPaddingException, NoSuchAlgorithmException { + return (prov == null) ? Cipher.getInstance(algorithm) : Cipher.getInstance(algorithm, prov); + } + + public static byte[][] fillRandom(byte[][] data) { + Random rnd = new Random(); + for (byte[] d : data) { + rnd.nextBytes(d); + } + return data; + } + + public static byte[] fillRandom(byte[] data) { + Random rnd = new Random(); + rnd.nextBytes(data); + return data; + } + + public static byte[] fillSecureRandom(byte[] data) { + SecureRandom rnd = new SecureRandom(); + rnd.nextBytes(data); + return data; + } + + public static byte[][] fillEncrypted(byte[][] data, Cipher encryptCipher) throws BadPaddingException, IllegalBlockSizeException { + byte[][] encryptedData = new byte[data.length][]; + for (int i = 0; i < encryptedData.length; i++) { + encryptedData[i] = encryptCipher.doFinal(data[i]); + } + return encryptedData; + } +} \ No newline at end of file diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java new file mode 100644 index 000000000..a21b0c87f --- /dev/null +++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code).
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.javax.crypto.small; + +import org.openjdk.jmh.annotations.Param; + +public class AESGCMBench extends + org.openjdk.bench.javax.crypto.full.AESGCMBench { + + @Param({"128"}) + private int keyLength; + + @Param({"1024"}) + private int dataSize; + +} \ No newline at end of file diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java new file mode 100644 index 000000000..2e389d300 --- /dev/null +++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +package org.openjdk.bench.javax.crypto.small; + +import org.openjdk.jmh.annotations.Param; + +public class AESGCMByteBuffer extends + org.openjdk.bench.javax.crypto.full.AESGCMByteBuffer { + + @Param({"128"}) + private int keyLength; + + @Param({"1024"}) + private int dataSize; + +} \ No newline at end of file diff --git a/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java b/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java index 3eb1d7b89..7678cc71f 100644 --- a/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java +++ b/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java @@ -26,8 +26,7 @@ /* * @test - * @bug 4836493 - * @ignore need further evaluation + * @bug 4836493 8239798 * @summary Socket timeouts for SSLSockets causes data corruption. 
* @run main/othervm ClientTimeout */ diff --git a/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java b/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java index 3e626a257..5578ea725 100644 --- a/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java +++ b/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java @@ -36,7 +36,7 @@ import javax.net.ssl.*; import java.io.*; -import java.net.InetAddress; +import java.net.*; public class SSLExceptionForIOIssue implements SSLContextTemplate { @@ -139,7 +139,7 @@ public class SSLExceptionForIOIssue implements SSLContextTemplate { } catch (SSLProtocolException | SSLHandshakeException sslhe) { clientException = sslhe; System.err.println("unexpected client exception: " + sslhe); - } catch (SSLException ssle) { + } catch (SSLException | SocketTimeoutException ssle) { // the expected exception, ignore it System.err.println("expected client exception: " + ssle); } catch (Exception e) { -- 2.17.1
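The GHASH hunk in this patch widens the subkey storage from two longs (H itself) to an 18-long subkeyHtbl so that the processBlocks intrinsic has room for precomputed powers of the hash subkey H and can fold several input blocks per carry-less-multiply pass. The real table layout is defined by each platform's intrinsic; the sketch below only illustrates the precomputation idea in plain Java, using a GF(2^128) multiply in GCM's bit-reflected convention (the gcmMult helper, the H^1..H^9 layout, and the sample subkey value are illustrative assumptions, not the intrinsic's actual format).

    public class GhashTableSketch {
        // GF(2^128) multiply in GCM's bit-reflected convention (NIST SP 800-38D,
        // Algorithm 1). A 128-bit element is two longs: x0 holds bits 0..63
        // (leftmost), x1 holds bits 64..127.
        static long[] gcmMult(long x0, long x1, long y0, long y1) {
            long z0 = 0, z1 = 0;
            long v0 = y0, v1 = y1;
            for (int i = 0; i < 128; i++) {
                long bit = (i < 64) ? (x0 >>> (63 - i)) & 1 : (x1 >>> (127 - i)) & 1;
                if (bit != 0) { z0 ^= v0; z1 ^= v1; }   // Z ^= V when bit i of X is set
                boolean lsb = (v1 & 1) != 0;
                v1 = (v1 >>> 1) | (v0 << 63);           // V >>= 1
                v0 >>>= 1;
                if (lsb) v0 ^= 0xe100000000000000L;     // xor the reduction polynomial R
            }
            return new long[] { z0, z1 };
        }

        // Hypothetical 2*9-long layout: H^1..H^9, two longs per power.
        static long[] buildTable(long h0, long h1) {
            long[] tbl = new long[2 * 9];
            long p0 = h0, p1 = h1;
            tbl[0] = p0;
            tbl[1] = p1;
            for (int i = 1; i < 9; i++) {
                long[] p = gcmMult(p0, p1, h0, h1);     // H^(i+1) = H^i * H
                p0 = p[0];
                p1 = p[1];
                tbl[2 * i] = p0;
                tbl[2 * i + 1] = p1;
            }
            return tbl;
        }

        public static void main(String[] args) {
            // Sample subkey: AES-128 encryption of the zero block under the zero key.
            long[] tbl = buildTable(0x66e94bd4ef8a2c3bL, 0x884cfa59ca342b2eL);
            System.out.printf("H^2 = %016x %016x%n", tbl[2], tbl[3]);
        }
    }

With such a table, an intrinsic can combine up to eight pending blocks with one reduction instead of reducing after every block, which is where the throughput win comes from.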
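The SSLSocketInputRecord rework hinges on one idea: the record header and body now live in fields (header/headerOff and recordBody) that survive a SocketTimeoutException, so the next read resumes exactly where the timed-out one stopped instead of discarding half a record, which was the data-corruption failure mode referenced by the re-enabled ClientTimeout test. A self-contained sketch of that resume pattern (ResumableRecordReader and its fixed 5-byte TLS header are illustrative, not code from the patch):

    import java.io.EOFException;
    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.ByteBuffer;

    final class ResumableRecordReader {
        private final InputStream in;
        private final byte[] header = new byte[5];           // TLS record header
        private int headerOff = 0;                           // survives timeouts
        private ByteBuffer body = ByteBuffer.allocate(1024); // survives timeouts

        ResumableRecordReader(InputStream in) { this.in = in; }

        // Returns one complete record body. If a socket read times out,
        // SocketTimeoutException propagates, but header/body keep the bytes
        // read so far, so calling readRecord() again resumes mid-record.
        byte[] readRecord() throws IOException {
            while (headerOff < header.length) {
                int n = in.read(header, headerOff, header.length - headerOff);
                if (n < 0) throw new EOFException("peer shut down incorrectly");
                headerOff += n;
            }
            int len = ((header[3] & 0xFF) << 8) | (header[4] & 0xFF);
            if (body.position() == 0) {                      // first attempt only
                if (body.capacity() < len) body = ByteBuffer.allocate(len);
                body.limit(len);
            }
            while (body.hasRemaining()) {
                int n = in.read(body.array(), body.position(), body.remaining());
                if (n < 0) throw new EOFException("peer shut down incorrectly");
                body.position(body.position() + n);
            }
            body.flip();
            byte[] record = new byte[len];
            body.get(record);
            headerOff = 0;                                   // record fully consumed:
            body.clear();                                    // reset for the next one
            return record;
        }
    }

This is also why the patch threads InterruptedIOException through SSLSocketImpl and SSLTransport untouched: wrapping it in an SSLException (or treating it as fatal) would close the connection and throw away the buffered partial record that makes the retry safe.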
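In both benchmark classes, every encrypt operation rebuilds the GCMParameterSpec by sliding a 16-byte window through the 32-byte IV buffer (next_iv_index). The rotation is not cosmetic: GCM's security collapses under key/IV reuse, and SunJCE refuses to re-initialize a GCM cipher for encryption with the same key and IV it just used. A small standalone demonstration of the sliding-window pattern (hedged: the exact rejection behavior and message on actual reuse vary by provider and JDK build):

    import javax.crypto.Cipher;
    import javax.crypto.spec.GCMParameterSpec;
    import javax.crypto.spec.SecretKeySpec;
    import java.security.SecureRandom;

    public class GcmIvRotationDemo {
        public static void main(String[] args) throws Exception {
            byte[] key = new byte[16];
            byte[] iv = new byte[32];                  // same IV_BUFFER_SIZE idea
            SecureRandom rnd = new SecureRandom();
            rnd.nextBytes(key);
            rnd.nextBytes(iv);
            SecretKeySpec ks = new SecretKeySpec(key, "AES");
            Cipher c = Cipher.getInstance("AES/GCM/NoPadding");
            // Slide a fresh 16-byte IV window out of the buffer per operation,
            // just like next_iv_index() in the benchmarks.
            for (int op = 0; op < 4; op++) {
                int off = op % (iv.length - 16);
                c.init(Cipher.ENCRYPT_MODE, ks, new GCMParameterSpec(96, iv, off, 16));
                byte[] ct = c.doFinal(new byte[1024]);
                System.out.println("op " + op + ": " + ct.length + " bytes");
            }
            // Re-initializing with the same key and the same IV window would be
            // rejected by SunJCE for encryption; the rotation avoids that.
        }
    }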
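The micros themselves follow stock JMH conventions: CryptoBase supplies @Fork/@Warmup/@Measurement and provider selection, the full.* classes sweep dataSize, and the small.* subclasses pin keyLength and dataSize for quick runs. Assuming the benchmark classes and the JMH runtime are on the classpath, a launcher along these lines runs them programmatically (the class name RunAesGcmMicros and the include regex are illustrative, not part of the patch):

    import org.openjdk.jmh.runner.Runner;
    import org.openjdk.jmh.runner.RunnerException;
    import org.openjdk.jmh.runner.options.Options;
    import org.openjdk.jmh.runner.options.OptionsBuilder;

    public class RunAesGcmMicros {
        public static void main(String[] args) throws RunnerException {
            Options opts = new OptionsBuilder()
                    .include("org\\.openjdk\\.bench\\.javax\\.crypto\\.small\\.AESGCM.*")
                    .param("dataSize", "1024")   // matches the small.* @Param default
                    .forks(1)                    // quicker than CryptoBase's @Fork(5)
                    .build();
            new Runner(opts).run();
        }
    }

Comparing the full.AESGCMBench throughput before and after the patch (with and without -XX:-UseAESCTRIntrinsics) is the intended way to measure the CounterMode.crypt speedup this change delivers.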