diff --git a/8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch b/8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch new file mode 100644 index 0000000000000000000000000000000000000000..81acb5c4ec2e49c9990e1cf2ed85d1ecad7211ac --- /dev/null +++ b/8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch @@ -0,0 +1,3938 @@ +From 02b097417275acaad294d71a852c2def2222be25 Mon Sep 17 00:00:00 2001 +From: kuenking111 +Date: Sat, 3 Sep 2022 14:17:50 +0000 +Subject: [PATCH 1/6] 8143925-enhancing-CounterMode.crypt-for-AESCrypt + +--- + .../src/cpu/aarch64/vm/assembler_aarch64.hpp | 35 +- + .../cpu/aarch64/vm/macroAssembler_aarch64.hpp | 17 + + .../aarch64/vm/macroAssembler_aarch64_aes.cpp | 685 ++++++++++++++++++ + .../cpu/aarch64/vm/stubGenerator_aarch64.cpp | 324 ++++++++- + .../cpu/aarch64/vm/stubRoutines_aarch64.hpp | 2 +- + .../src/cpu/aarch64/vm/vm_version_aarch64.cpp | 13 +- + hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp | 5 + + hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp | 5 + + hotspot/src/cpu/x86/vm/assembler_x86.cpp | 74 +- + hotspot/src/cpu/x86/vm/assembler_x86.hpp | 12 + + .../src/cpu/x86/vm/stubGenerator_x86_32.cpp | 344 +++++++++ + .../src/cpu/x86/vm/stubGenerator_x86_64.cpp | 340 ++++++++- + hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp | 1 + + hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp | 5 + + .../src/cpu/x86/vm/stubRoutines_x86_32.hpp | 2 +- + .../src/cpu/x86/vm/stubRoutines_x86_64.hpp | 2 +- + hotspot/src/cpu/x86/vm/vm_version_x86.cpp | 36 + + hotspot/src/share/vm/classfile/vmSymbols.hpp | 4 + + hotspot/src/share/vm/opto/escape.cpp | 1 + + hotspot/src/share/vm/opto/library_call.cpp | 174 +++++ + hotspot/src/share/vm/opto/runtime.cpp | 29 + + hotspot/src/share/vm/opto/runtime.hpp | 1 + + hotspot/src/share/vm/runtime/globals.hpp | 3 + + hotspot/src/share/vm/runtime/stubRoutines.cpp | 1 + + hotspot/src/share/vm/runtime/stubRoutines.hpp | 2 + + hotspot/src/share/vm/runtime/vmStructs.cpp | 1 + + .../test/compiler/7184394/TestAESBase.java | 4 +- + .../test/compiler/7184394/TestAESMain.java | 7 + + .../com/sun/crypto/provider/CounterMode.java | 11 +- + .../classes/com/sun/crypto/provider/GCTR.java | 89 +-- + .../com/sun/crypto/provider/GHASH.java | 20 +- + .../sun/security/ssl/SSLSocketImpl.java | 14 +- + .../security/ssl/SSLSocketInputRecord.java | 215 +++--- + .../sun/security/ssl/SSLTransport.java | 4 + + .../bench/javax/crypto/full/AESGCMBench.java | 128 ++++ + .../javax/crypto/full/AESGCMByteBuffer.java | 163 +++++ + .../bench/javax/crypto/full/CryptoBase.java | 102 +++ + .../bench/javax/crypto/small/AESGCMBench.java | 36 + + .../javax/crypto/small/AESGCMByteBuffer.java | 36 + + .../ssl/SSLSocketImpl/ClientTimeout.java | 3 +- + .../SSLSocketImpl/SSLExceptionForIOIssue.java | 4 +- + 41 files changed, 2738 insertions(+), 216 deletions(-) + create mode 100644 hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp + create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java + create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java + create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java + create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java + create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java + +diff --git a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp +index b0fa9b5fc..9202e61f8 100644 +--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp ++++ 
b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
+@@ -146,6 +146,21 @@ REGISTER_DECLARATION(Register, esp, r20);
+ 
+ #define assert_cond(ARG1) assert(ARG1, #ARG1)
+ 
++// In many places we've added C-style casts to silence compiler
++// warnings, for example when truncating a size_t to an int when we
++// know the size_t is a small number. Such casts are risky because
++// they effectively disable useful compiler warnings. We can make our
++// lives safer with this function, which ensures that any cast is
++// reversible without loss of information. It doesn't check
++// everything: it isn't intended to make sure that pointer types are
++// compatible, for example.
++template <class T2, class T1>
++T2 checked_cast(T1 thing) {
++  T2 result = static_cast<T2>(thing);
++  assert(static_cast<T1>(result) == thing, "must be");
++  return result;
++}
++
+ namespace asm_util {
+   uint32_t encode_logical_immediate(bool is32, uint64_t imm);
+ };
+@@ -193,7 +208,7 @@ public:
+   static inline uint32_t extract(uint32_t val, int msb, int lsb) {
+     int nbits = msb - lsb + 1;
+     assert_cond(msb >= lsb);
+-    uint32_t mask = (1U << nbits) - 1;
++    uint32_t mask = checked_cast<uint32_t>(right_n_bits(nbits));
+     uint32_t result = val >> lsb;
+     result &= mask;
+     return result;
+@@ -208,7 +223,7 @@ public:
+     int nbits = msb - lsb + 1;
+     guarantee(val < (1U << nbits), "Field too big for insn");
+     assert_cond(msb >= lsb);
+-    unsigned mask = (1U << nbits) - 1;
++    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
+     val <<= lsb;
+     mask <<= lsb;
+     unsigned target = *(unsigned *)a;
+@@ -222,7 +237,7 @@ public:
+     long chk = val >> (nbits - 1);
+     guarantee (chk == -1 || chk == 0, "Field too big for insn");
+     unsigned uval = val;
+-    unsigned mask = (1U << nbits) - 1;
++    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
+     uval &= mask;
+     uval <<= lsb;
+     mask <<= lsb;
+@@ -234,9 +249,9 @@ public:
+ 
+   void f(unsigned val, int msb, int lsb) {
+     int nbits = msb - lsb + 1;
+-    guarantee(val < (1U << nbits), "Field too big for insn");
++    guarantee(val < (1ULL << nbits), "Field too big for insn");
+     assert_cond(msb >= lsb);
+-    unsigned mask = (1U << nbits) - 1;
++    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
+     val <<= lsb;
+     mask <<= lsb;
+     insn |= val;
+@@ -255,7 +270,7 @@ public:
+     long chk = val >> (nbits - 1);
+     guarantee (chk == -1 || chk == 0, "Field too big for insn");
+     unsigned uval = val;
+-    unsigned mask = (1U << nbits) - 1;
++    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
+     uval &= mask;
+     f(uval, lsb + nbits - 1, lsb);
+   }
+@@ -280,7 +295,7 @@ public:
+ 
+   unsigned get(int msb = 31, int lsb = 0) {
+     int nbits = msb - lsb + 1;
+-    unsigned mask = ((1U << nbits) - 1) << lsb;
++    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits)) << lsb;
+     assert_cond((bits & mask) == mask);
+     return (insn & mask) >> lsb;
+   }
+@@ -1991,21 +2006,21 @@ public:
+     starti;
+     f(0,31), f((int)T & 1, 30);
+     f(op1, 29, 21), f(0, 20, 16), f(op2, 15, 12);
+-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
++    f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0);
+   }
+   void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
+              int imm, int op1, int op2) {
+     starti;
+     f(0,31), f((int)T & 1, 30);
+     f(op1 | 0b100, 29, 21), f(0b11111, 20, 16), f(op2, 15, 12);
+-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
++    f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0);
+   }
+   void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
+              Register Xm, int op1, int op2) {
+     starti;
+     f(0,31), f((int)T & 1, 30);
+     f(op1 | 0b100, 29, 21), rf(Xm, 16), f(op2, 15, 12);
+-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 
0); ++ f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0); + } + + void ld_st(FloatRegister Vt, SIMD_Arrangement T, Address a, int op1, int op2) { +diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp +index 0ca694038..d334f1b69 100644 +--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp ++++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp +@@ -1240,6 +1240,23 @@ public: + void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, + Register zlen, Register tmp1, Register tmp2, Register tmp3, + Register tmp4, Register tmp5, Register tmp6, Register tmp7); ++ void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, ++ FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, ++ FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3); ++ void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, ++ FloatRegister p, FloatRegister z, FloatRegister t1); ++ void ghash_processBlocks_wide(address p, Register state, Register subkeyH, ++ Register data, Register blocks, int unrolls); ++ void ghash_modmul (FloatRegister result, ++ FloatRegister result_lo, FloatRegister result_hi, FloatRegister b, ++ FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p, ++ FloatRegister t1, FloatRegister t2, FloatRegister t3); ++ ++ void aesenc_loadkeys(Register key, Register keylen); ++ void aesecb_encrypt(Register from, Register to, Register keylen, ++ FloatRegister data = v0, int unrolls = 1); ++ void aesecb_decrypt(Register from, Register to, Register key, Register keylen); ++ void aes_round(FloatRegister input, FloatRegister subkey); + // ISB may be needed because of a safepoint + void maybe_isb() { isb(); } + +diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp +new file mode 100644 +index 000000000..1db79c97a +--- /dev/null ++++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp +@@ -0,0 +1,685 @@ ++/* ++ * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. 
++ * ++ */ ++ ++#include "precompiled.hpp" ++ ++#include "asm/assembler.hpp" ++#include "asm/assembler.inline.hpp" ++#include "macroAssembler_aarch64.hpp" ++#include "memory/resourceArea.hpp" ++#include "runtime/stubRoutines.hpp" ++ ++void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) { ++ Label L_doLast; ++ ++ ld1(v0, T16B, from); // get 16 bytes of input ++ ++ ld1(v5, T16B, post(key, 16)); ++ rev32(v5, T16B, v5); ++ ++ ld1(v1, v2, v3, v4, T16B, post(key, 64)); ++ rev32(v1, T16B, v1); ++ rev32(v2, T16B, v2); ++ rev32(v3, T16B, v3); ++ rev32(v4, T16B, v4); ++ aesd(v0, v1); ++ aesimc(v0, v0); ++ aesd(v0, v2); ++ aesimc(v0, v0); ++ aesd(v0, v3); ++ aesimc(v0, v0); ++ aesd(v0, v4); ++ aesimc(v0, v0); ++ ++ ld1(v1, v2, v3, v4, T16B, post(key, 64)); ++ rev32(v1, T16B, v1); ++ rev32(v2, T16B, v2); ++ rev32(v3, T16B, v3); ++ rev32(v4, T16B, v4); ++ aesd(v0, v1); ++ aesimc(v0, v0); ++ aesd(v0, v2); ++ aesimc(v0, v0); ++ aesd(v0, v3); ++ aesimc(v0, v0); ++ aesd(v0, v4); ++ aesimc(v0, v0); ++ ++ ld1(v1, v2, T16B, post(key, 32)); ++ rev32(v1, T16B, v1); ++ rev32(v2, T16B, v2); ++ ++ cmpw(keylen, 44); ++ br(Assembler::EQ, L_doLast); ++ ++ aesd(v0, v1); ++ aesimc(v0, v0); ++ aesd(v0, v2); ++ aesimc(v0, v0); ++ ++ ld1(v1, v2, T16B, post(key, 32)); ++ rev32(v1, T16B, v1); ++ rev32(v2, T16B, v2); ++ ++ cmpw(keylen, 52); ++ br(Assembler::EQ, L_doLast); ++ ++ aesd(v0, v1); ++ aesimc(v0, v0); ++ aesd(v0, v2); ++ aesimc(v0, v0); ++ ++ ld1(v1, v2, T16B, post(key, 32)); ++ rev32(v1, T16B, v1); ++ rev32(v2, T16B, v2); ++ ++ bind(L_doLast); ++ ++ aesd(v0, v1); ++ aesimc(v0, v0); ++ aesd(v0, v2); ++ ++ eor(v0, T16B, v0, v5); ++ ++ st1(v0, T16B, to); ++ ++ // Preserve the address of the start of the key ++ sub(key, key, keylen, LSL, exact_log2(sizeof (jint))); ++} ++ ++// Load expanded key into v17..v31 ++void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) { ++ Label L_loadkeys_44, L_loadkeys_52; ++ cmpw(keylen, 52); ++ br(Assembler::LO, L_loadkeys_44); ++ br(Assembler::EQ, L_loadkeys_52); ++ ++ ld1(v17, v18, T16B, post(key, 32)); ++ rev32(v17, T16B, v17); ++ rev32(v18, T16B, v18); ++ bind(L_loadkeys_52); ++ ld1(v19, v20, T16B, post(key, 32)); ++ rev32(v19, T16B, v19); ++ rev32(v20, T16B, v20); ++ bind(L_loadkeys_44); ++ ld1(v21, v22, v23, v24, T16B, post(key, 64)); ++ rev32(v21, T16B, v21); ++ rev32(v22, T16B, v22); ++ rev32(v23, T16B, v23); ++ rev32(v24, T16B, v24); ++ ld1(v25, v26, v27, v28, T16B, post(key, 64)); ++ rev32(v25, T16B, v25); ++ rev32(v26, T16B, v26); ++ rev32(v27, T16B, v27); ++ rev32(v28, T16B, v28); ++ ld1(v29, v30, v31, T16B, post(key, 48)); ++ rev32(v29, T16B, v29); ++ rev32(v30, T16B, v30); ++ rev32(v31, T16B, v31); ++ ++ // Preserve the address of the start of the key ++ sub(key, key, keylen, LSL, exact_log2(sizeof (jint))); ++} ++ ++// NeoverseTM N1Software Optimization Guide: ++// Adjacent AESE/AESMC instruction pairs and adjacent AESD/AESIMC ++// instruction pairs will exhibit the performance characteristics ++// described in Section 4.6. ++void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) { ++ aese(input, subkey); aesmc(input, input); ++} ++ ++// KernelGenerator ++// ++// The abstract base class of an unrolled function generator. ++// Subclasses override generate(), length(), and next() to generate ++// unrolled and interleaved functions. 
++// ++// The core idea is that a subclass defines a method which generates ++// the base case of a function and a method to generate a clone of it, ++// shifted to a different set of registers. KernelGenerator will then ++// generate several interleaved copies of the function, with each one ++// using a different set of registers. ++ ++// The subclass must implement three methods: length(), which is the ++// number of instruction bundles in the intrinsic, generate(int n) ++// which emits the nth instruction bundle in the intrinsic, and next() ++// which takes an instance of the generator and returns a version of it, ++// shifted to a new set of registers. ++ ++class KernelGenerator: public MacroAssembler { ++protected: ++ const int _unrolls; ++public: ++ KernelGenerator(Assembler *as, int unrolls) ++ : MacroAssembler(as->code()), _unrolls(unrolls) { } ++ virtual void generate(int index) = 0; ++ virtual int length() = 0; ++ virtual KernelGenerator *next() = 0; ++ int unrolls() { return _unrolls; } ++ void unroll(); ++}; ++ ++void KernelGenerator::unroll() { ++ ResourceMark rm; ++ KernelGenerator **generators ++ = NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls()); ++ ++ generators[0] = this; ++ for (int i = 1; i < unrolls(); i++) { ++ generators[i] = generators[i-1]->next(); ++ } ++ ++ for (int j = 0; j < length(); j++) { ++ for (int i = 0; i < unrolls(); i++) { ++ generators[i]->generate(j); ++ } ++ } ++} ++ ++// An unrolled and interleaved generator for AES encryption. ++class AESKernelGenerator: public KernelGenerator { ++ Register _from, _to; ++ const Register _keylen; ++ FloatRegister _data; ++ const FloatRegister _subkeys; ++ bool _once; ++ Label _rounds_44, _rounds_52; ++ ++public: ++ AESKernelGenerator(Assembler *as, int unrolls, ++ Register from, Register to, Register keylen, FloatRegister data, ++ FloatRegister subkeys, bool once = true) ++ : KernelGenerator(as, unrolls), ++ _from(from), _to(to), _keylen(keylen), _data(data), ++ _subkeys(subkeys), _once(once) { ++ } ++ ++ virtual void generate(int index) { ++ switch (index) { ++ case 0: ++ if (_from != noreg) { ++ ld1(_data, T16B, _from); // get 16 bytes of input ++ } ++ break; ++ case 1: ++ if (_once) { ++ cmpw(_keylen, 52); ++ br(Assembler::LO, _rounds_44); ++ br(Assembler::EQ, _rounds_52); ++ } ++ break; ++ case 2: aes_round(_data, _subkeys + 0); break; ++ case 3: aes_round(_data, _subkeys + 1); break; ++ case 4: ++ if (_once) bind(_rounds_52); ++ break; ++ case 5: aes_round(_data, _subkeys + 2); break; ++ case 6: aes_round(_data, _subkeys + 3); break; ++ case 7: ++ if (_once) bind(_rounds_44); ++ break; ++ case 8: aes_round(_data, _subkeys + 4); break; ++ case 9: aes_round(_data, _subkeys + 5); break; ++ case 10: aes_round(_data, _subkeys + 6); break; ++ case 11: aes_round(_data, _subkeys + 7); break; ++ case 12: aes_round(_data, _subkeys + 8); break; ++ case 13: aes_round(_data, _subkeys + 9); break; ++ case 14: aes_round(_data, _subkeys + 10); break; ++ case 15: aes_round(_data, _subkeys + 11); break; ++ case 16: aes_round(_data, _subkeys + 12); break; ++ case 17: aese(_data, _subkeys + 13); break; ++ case 18: eor(_data, T16B, _data, _subkeys + 14); break; ++ case 19: ++ if (_to != noreg) { ++ st1(_data, T16B, _to); ++ } ++ break; ++ default: ShouldNotReachHere(); ++ } ++ } ++ ++ virtual KernelGenerator *next() { ++ return new AESKernelGenerator(this, _unrolls, ++ _from, _to, _keylen, ++ _data + 1, _subkeys, /*once*/false); ++ } ++ ++ virtual int length() { return 20; } ++}; ++ ++// Uses expanded key in v17..v31 ++// Returns 
encrypted values in inputs.
++// If to != noreg, store value at to; likewise from
++// Preserves key, keylen
++// Increments from, to
++// Input data in v0, v1, ...
++// unrolls controls the number of times to unroll the generated function
++void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen,
++                                    FloatRegister data, int unrolls) {
++  AESKernelGenerator(this, unrolls, from, to, keylen, data, v17).unroll();
++}
++
++// ghash_multiply and ghash_reduce are the non-unrolled versions of
++// the GHASH function generators.
++void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
++                                    FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
++                                    FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) {
++  // Karatsuba multiplication performs a 128*128 -> 256-bit
++  // multiplication in three 128-bit multiplications and a few
++  // additions.
++  //
++  // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
++  // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
++  //
++  // Inputs:
++  //
++  // A0 in a.d[0]     (subkey)
++  // A1 in a.d[1]
++  // (A1+A0) in a1_xor_a0.d[0]
++  //
++  // B0 in b.d[0]     (state)
++  // B1 in b.d[1]
++
++  ext(tmp1, T16B, b, b, 0x08);
++  pmull2(result_hi, T1Q, b, a, T2D);      // A1*B1
++  eor(tmp1, T16B, tmp1, b);               // (B1+B0)
++  pmull(result_lo, T1Q, b, a, T1D);       // A0*B0
++  pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D); // (A1+A0)(B1+B0)
++
++  ext(tmp1, T16B, result_lo, result_hi, 0x08);
++  eor(tmp3, T16B, result_hi, result_lo);  // A1*B1+A0*B0
++  eor(tmp2, T16B, tmp2, tmp1);
++  eor(tmp2, T16B, tmp2, tmp3);
++
++  // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
++  ins(result_hi, D, tmp2, 0, 1);
++  ins(result_lo, D, tmp2, 1, 0);
++}
++
++void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
++                                  FloatRegister p, FloatRegister vzr, FloatRegister t1) {
++  const FloatRegister t0 = result;
++
++  // The GCM field polynomial f is z^128 + p(z), where p =
++  // z^7+z^2+z+1.
++  //
++  //    z^128 === -p(z)  (mod (z^128 + p(z)))
++  //
++  // so, given that the product we're reducing is
++  //    a == lo + hi * z^128
++  // substituting,
++  //      === lo - hi * p(z)  (mod (z^128 + p(z)))
++  //
++  // we reduce by multiplying hi by p(z) and subtracting the result
++  // from (i.e. XORing it with) lo.  Because p has no nonzero high
++  // bits we can do this with two 64-bit multiplications, lo*p and
++  // hi*p.
++
++  pmull2(t0, T1Q, hi, p, T2D);
++  ext(t1, T16B, t0, vzr, 8);
++  eor(hi, T16B, hi, t1);
++  ext(t1, T16B, vzr, t0, 8);
++  eor(lo, T16B, lo, t1);
++  pmull(t0, T1Q, hi, p, T1D);
++  eor(result, T16B, lo, t0);
++}
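
Since the two comments above carry the real content of ghash_multiply and ghash_reduce, it is worth noting that the Karatsuba identity they quote can be checked mechanically. Below is a stand-alone sketch (plain C++, not part of the patch; clmul64 is a software stand-in for PMULL) verifying that the three-multiply decomposition matches the four-multiply schoolbook product, with "+" meaning XOR throughout:

    #include <cassert>
    #include <cstdint>

    // Software stand-in for PMULL: carry-less 64x64 -> 128-bit multiply.
    static void clmul64(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi) {
      lo = hi = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          lo ^= a << i;
          if (i > 0) hi ^= a >> (64 - i);
        }
      }
    }

    int main() {
      uint64_t A0 = 0x0123456789abcdefULL, A1 = 0xfedcba9876543210ULL;
      uint64_t B0 = 0xdeadbeefcafebabeULL, B1 = 0x0f1e2d3c4b5a6978ULL;

      uint64_t C0, C1, D0, D1, E0, E1, F0, F1, G0, G1;
      clmul64(A1, B1, C0, C1);            // C = A1*B1
      clmul64(A0, B0, D0, D1);            // D = A0*B0
      clmul64(A0 ^ A1, B0 ^ B1, E0, E1);  // E = (A0+A1)(B0+B1), the third multiply
      clmul64(A1, B0, F0, F1);            // cross terms of the schoolbook product
      clmul64(A0, B1, G0, G1);

      // Schoolbook result, in 64-bit words from most to least significant:
      uint64_t w3 = C1, w2 = C0 ^ F1 ^ G1, w1 = D1 ^ F0 ^ G0, w0 = D0;

      // Karatsuba form from the comment: C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
      assert(w3 == C1);
      assert(w2 == (C0 ^ C1 ^ D1 ^ E1));
      assert(w1 == (D1 ^ C0 ^ D0 ^ E0));
      assert(w0 == D0);
      return 0;
    }
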
++
++class GHASHMultiplyGenerator: public KernelGenerator {
++  FloatRegister _result_lo, _result_hi, _b,
++    _a, _vzr, _a1_xor_a0, _p,
++    _tmp1, _tmp2, _tmp3;
++
++public:
++  GHASHMultiplyGenerator(Assembler *as, int unrolls,
++                         FloatRegister result_lo, FloatRegister result_hi,
++                         /* offsetted registers */
++                         FloatRegister b,
++                         /* non-offsetted (shared) registers */
++                         FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr,
++                         /* offsetted (temp) registers */
++                         FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3)
++    : KernelGenerator(as, unrolls),
++      _result_lo(result_lo), _result_hi(result_hi), _b(b),
++      _a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p),
++      _tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { }
++
++  static const int register_stride = 7;
++
++  virtual void generate(int index) {
++    // Karatsuba multiplication performs a 128*128 -> 256-bit
++    // multiplication in three 128-bit multiplications and a few
++    // additions.
++    //
++    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
++    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
++    //
++    // Inputs:
++    //
++    // A0 in a.d[0]     (subkey)
++    // A1 in a.d[1]
++    // (A1+A0) in a1_xor_a0.d[0]
++    //
++    // B0 in b.d[0]     (state)
++    // B1 in b.d[1]
++
++    switch (index) {
++      case 0: ext(_tmp1, T16B, _b, _b, 0x08); break;
++      case 1: pmull2(_result_hi, T1Q, _b, _a, T2D);  // A1*B1
++        break;
++      case 2: eor(_tmp1, T16B, _tmp1, _b);           // (B1+B0)
++        break;
++      case 3: pmull(_result_lo, T1Q, _b, _a, T1D);   // A0*B0
++        break;
++      case 4: pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D); // (A1+A0)(B1+B0)
++        break;
++
++      case 5: ext(_tmp1, T16B, _result_lo, _result_hi, 0x08); break;
++      case 6: eor(_tmp3, T16B, _result_hi, _result_lo); // A1*B1+A0*B0
++        break;
++      case 7: eor(_tmp2, T16B, _tmp2, _tmp1); break;
++      case 8: eor(_tmp2, T16B, _tmp2, _tmp3); break;
++
++      // Register pair <_result_hi:_result_lo> holds the _result of carry-less multiplication
++      case 9: ins(_result_hi, D, _tmp2, 0, 1); break;
++      case 10: ins(_result_lo, D, _tmp2, 1, 0); break;
++      default: ShouldNotReachHere();
++    }
++  }
++
++  virtual KernelGenerator *next() {
++    GHASHMultiplyGenerator *result
++      = new GHASHMultiplyGenerator(this, _unrolls, _result_lo, _result_hi,
++                                   _b, _a, _a1_xor_a0, _p, _vzr,
++                                   _tmp1, _tmp2, _tmp3);
++    result->_result_lo += register_stride;
++    result->_result_hi += register_stride;
++    result->_b += register_stride;
++    result->_tmp1 += register_stride;
++    result->_tmp2 += register_stride;
++    result->_tmp3 += register_stride;
++    return result;
++  }
++
++  virtual int length() { return 11; }
++};
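
The register-offsetting done by next() above is easiest to see in isolation. Below is a toy model of the schedule KernelGenerator::unroll() emits (stand-alone C++ with illustrative names, no HotSpot types): bundle j of every clone is generated before bundle j+1 of any clone, so dependent instructions are pushed apart and the multi-cycle latencies of PMULL and AESE can overlap:

    #include <cstdio>

    // Toy model of KernelGenerator::unroll().  Each clone owns a disjoint
    // register set; emitting bundle j of every clone before bundle j+1 of
    // any clone puts independent instructions back to back in the stream.
    struct ToyKernel {
      int clone;
      int length() const { return 3; }
      void generate(int bundle) const {
        std::printf("bundle %d of clone %d\n", bundle, clone);
      }
    };

    int main() {
      const int unrolls = 4;
      ToyKernel clones[unrolls];
      for (int i = 0; i < unrolls; i++) clones[i].clone = i;
      for (int j = 0; j < clones[0].length(); j++)   // same loop nest as unroll()
        for (int i = 0; i < unrolls; i++)
          clones[i].generate(j);
      return 0;
    }
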
++
++// Reduce the 128-bit product in hi:lo by the GCM field polynomial.
++// The FloatRegister argument called data is optional: if it is a
++// valid register, we interleave LD1 instructions with the
++// reduction. This is to reduce latency next time around the loop.
++class GHASHReduceGenerator: public KernelGenerator {
++  FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1;
++  int _once;
++public:
++  GHASHReduceGenerator(Assembler *as, int unrolls,
++                       /* offsetted registers */
++                       FloatRegister result, FloatRegister lo, FloatRegister hi,
++                       /* non-offsetted (shared) registers */
++                       FloatRegister p, FloatRegister vzr, FloatRegister data,
++                       /* offsetted (temp) registers */
++                       FloatRegister t1)
++    : KernelGenerator(as, unrolls),
++      _result(result), _lo(lo), _hi(hi),
++      _p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { }
++
++  static const int register_stride = 7;
++
++  virtual void generate(int index) {
++    const FloatRegister t0 = _result;
++
++    switch (index) {
++      // The GCM field polynomial f is z^128 + p(z), where p =
++      // z^7+z^2+z+1.
++      //
++      //    z^128 === -p(z)  (mod (z^128 + p(z)))
++      //
++      // so, given that the product we're reducing is
++      //    a == lo + hi * z^128
++      // substituting,
++      //      === lo - hi * p(z)  (mod (z^128 + p(z)))
++      //
++      // we reduce by multiplying hi by p(z) and subtracting the _result
++      // from (i.e. XORing it with) lo.  Because p has no nonzero high
++      // bits we can do this with two 64-bit multiplications, lo*p and
++      // hi*p.
++
++      case 0: pmull2(t0, T1Q, _hi, _p, T2D); break;
++      case 1: ext(_t1, T16B, t0, _vzr, 8); break;
++      case 2: eor(_hi, T16B, _hi, _t1); break;
++      case 3: ext(_t1, T16B, _vzr, t0, 8); break;
++      case 4: eor(_lo, T16B, _lo, _t1); break;
++      case 5: pmull(t0, T1Q, _hi, _p, T1D); break;
++      case 6: eor(_result, T16B, _lo, t0); break;
++      default: ShouldNotReachHere();
++    }
++
++    // Sprinkle load instructions into the generated instructions
++    if (_data->is_valid() && _once) {
++      assert(length() >= unrolls(), "not enough room for interleaved loads");
++      if (index < unrolls()) {
++        ld1((_data + index*register_stride), T16B, post(r2, 0x10));
++      }
++    }
++  }
++
++  virtual KernelGenerator *next() {
++    GHASHReduceGenerator *result
++      = new GHASHReduceGenerator(this, _unrolls,
++                                 _result, _lo, _hi, _p, _vzr, _data, _t1);
++    result->_result += register_stride;
++    result->_hi += register_stride;
++    result->_lo += register_stride;
++    result->_t1 += register_stride;
++    result->_once = false;
++    return result;
++  }
++
++  int length() { return 7; }
++};
++
++// Perform a GHASH multiply/reduce on a single FloatRegister.
++void MacroAssembler::ghash_modmul(FloatRegister result,
++                                  FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
++                                  FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
++                                  FloatRegister t1, FloatRegister t2, FloatRegister t3) {
++  ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3);
++  ghash_reduce(result, result_lo, result_hi, p, vzr, t1);
++}
++
++// Interleaved GHASH processing.
++//
++// Clobbers all vector registers.
++//
++void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
++                                              Register subkeyH,
++                                              Register data, Register blocks, int unrolls) {
++  int register_stride = 7;
++
++  // Bafflingly, GCM uses little-endian for the byte order, but
++  // big-endian for the bit order.  For example, the polynomial 1 is
++  // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
++  //
++  // So, we must either reverse the bytes in each word and do
++  // everything big-endian or reverse the bits in each byte and do
++  // it little-endian.  On AArch64 it's more idiomatic to reverse
++  // the bits in each byte (we have an instruction, RBIT, to do
++  // that) and keep the data in little-endian bit order through the
++  // calculation, bit-reversing the inputs and outputs.
++
++  assert(unrolls * register_stride < 32, "out of registers");
++
++  FloatRegister a1_xor_a0 = v28;
++  FloatRegister Hprime = v29;
++  FloatRegister vzr = v30;
++  FloatRegister p = v31;
++  eor(vzr, T16B, vzr, vzr); // zero register
++
++  ldrq(p, field_polynomial); // The field polynomial
++
++  ldrq(v0, Address(state));
++  ldrq(Hprime, Address(subkeyH));
++
++  rev64(v0, T16B, v0);      // Bit-reverse words in state and subkeyH
++  rbit(v0, T16B, v0);
++  rev64(Hprime, T16B, Hprime);
++  rbit(Hprime, T16B, Hprime);
++
++  // Powers of H -> Hprime
++
++  Label already_calculated, done;
++  {
++    // The first time around we'll have to calculate H**2, H**3, etc.
++    // Look at the largest power of H in the subkeyH array to see if
++    // it's already been calculated.
++    ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1)));
++    orr(rscratch1, rscratch1, rscratch2);
++    cbnz(rscratch1, already_calculated);
++
++    orr(v6, T16B, Hprime, Hprime); // Start with H in v6 and Hprime
++    for (int i = 1; i < unrolls; i++) {
++      ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
++      eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
++      ghash_modmul(/*result*/v6, /*result_lo*/v5, /*result_hi*/v4, /*b*/v6,
++                   Hprime, vzr, a1_xor_a0, p,
++                   /*temps*/v1, v3, v2);
++      rev64(v1, T16B, v6);
++      rbit(v1, T16B, v1);
++      strq(v1, Address(subkeyH, 16 * i));
++    }
++    b(done);
++  }
++  {
++    bind(already_calculated);
++
++    // Load the largest power of H we need into v6.
++    ldrq(v6, Address(subkeyH, 16 * (unrolls - 1)));
++    rev64(v6, T16B, v6);
++    rbit(v6, T16B, v6);
++  }
++  bind(done);
++
++  orr(Hprime, T16B, v6, v6); // Move H ** unrolls into Hprime
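
A note on why the powers H**2 .. H**unrolls computed above enable the wide path at all: the serial GHASH recurrence Y = (Y ^ X[i]) * H unrolls into independent multiplies against distinct powers of H whose results are merged with a single XOR. The identity is pure field algebra, so it can be sanity-checked in a tiny field; the sketch below (illustrative C++, not patch code) uses GF(2^8) with the AES polynomial 0x11B as a stand-in for GHASH's GF(2^128):

    #include <cassert>
    #include <cstdint>

    // GF(2^8) multiply with the AES polynomial 0x11B -- a small stand-in
    // field; the identity being checked holds in GHASH's GF(2^128) too.
    static uint8_t gmul(uint8_t a, uint8_t b) {
      uint8_t p = 0;
      for (int i = 0; i < 8; i++) {
        if (b & 1) p ^= a;
        bool carry = (a & 0x80) != 0;
        a <<= 1;
        if (carry) a ^= 0x1B;    // reduce by x^8 + x^4 + x^3 + x + 1
        b >>= 1;
      }
      return p;
    }

    int main() {
      uint8_t Y = 0x53, H = 0xCA, X[4] = {0x11, 0x22, 0x33, 0x44};

      // Serial GHASH: Y = (Y ^ X[i]) * H, one block at a time.
      uint8_t serial = Y;
      for (int i = 0; i < 4; i++) serial = gmul(serial ^ X[i], H);

      // Parallel form used by the wide stub: one multiply per clone,
      // against H**4 .. H**1, then XORs to merge the partial states.
      uint8_t H2 = gmul(H, H), H3 = gmul(H2, H), H4 = gmul(H3, H);
      uint8_t parallel = gmul(Y ^ X[0], H4) ^ gmul(X[1], H3)
                       ^ gmul(X[2], H2) ^ gmul(X[3], H);

      assert(serial == parallel);
      return 0;
    }
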
++
++  // Hprime contains (H ** 1, H ** 2, ... H ** unrolls)
++  // v0 contains the initial state. Clear the others.
++  for (int i = 1; i < unrolls; i++) {
++    int ofs = register_stride * i;
++    eor(ofs+v0, T16B, ofs+v0, ofs+v0); // zero each state register
++  }
++
++  ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
++  eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
++
++  // Load #unrolls blocks of data
++  for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
++    ld1(v2+ofs, T16B, post(data, 0x10));
++  }
++
++  // Register assignments, replicated across 4 clones, v0 ... v23
++  //
++  // v0: input / output: current state, result of multiply/reduce
++  // v1: temp
++  // v2: input: one block of data (the ciphertext)
++  //     also used as a temp once the data has been consumed
++  // v3: temp
++  // v4: output: high part of product
++  // v5: output: low part ...
++  // v6: unused
++  //
++  // Not replicated:
++  //
++  // v28: High part of H xor low part of H'
++  // v29: H' (hash subkey)
++  // v30: zero
++  // v31: Reduction polynomial of the Galois field
++
++  // Inner loop.
++  // Do the whole load/add/multiply/reduce over all our data except
++  // the last few rows.
++  {
++    Label L_ghash_loop;
++    bind(L_ghash_loop);
++
++    // Prefetching doesn't help here. In fact, on Neoverse N1 it's worse.
++ // prfm(Address(data, 128), PLDL1KEEP); ++ ++ // Xor data into current state ++ for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) { ++ rbit((v2+ofs), T16B, (v2+ofs)); ++ eor((v2+ofs), T16B, v0+ofs, (v2+ofs)); // bit-swapped data ^ bit-swapped state ++ } ++ ++ // Generate fully-unrolled multiply-reduce in two stages. ++ ++ (new GHASHMultiplyGenerator(this, unrolls, ++ /*result_lo*/v5, /*result_hi*/v4, /*data*/v2, ++ Hprime, a1_xor_a0, p, vzr, ++ /*temps*/v1, v3, /* reuse b*/v2))->unroll(); ++ ++ // NB: GHASHReduceGenerator also loads the next #unrolls blocks of ++ // data into v0, v0+ofs, the current state. ++ (new GHASHReduceGenerator (this, unrolls, ++ /*result*/v0, /*lo*/v5, /*hi*/v4, p, vzr, ++ /*data*/v2, /*temp*/v3))->unroll(); ++ ++ sub(blocks, blocks, unrolls); ++ cmp(blocks, (unsigned char)(unrolls * 2)); ++ br(GE, L_ghash_loop); ++ } ++ ++ // Merge the #unrolls states. Note that the data for the next ++ // iteration has already been loaded into v4, v4+ofs, etc... ++ ++ // First, we multiply/reduce each clone by the appropriate power of H. ++ for (int i = 0; i < unrolls; i++) { ++ int ofs = register_stride * i; ++ ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1))); ++ ++ rbit(v2+ofs, T16B, v2+ofs); ++ eor(v2+ofs, T16B, ofs+v0, v2+ofs); // bit-swapped data ^ bit-swapped state ++ ++ rev64(Hprime, T16B, Hprime); ++ rbit(Hprime, T16B, Hprime); ++ ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0 ++ eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) ++ ghash_modmul(/*result*/v0+ofs, /*result_lo*/v5+ofs, /*result_hi*/v4+ofs, /*b*/v2+ofs, ++ Hprime, vzr, a1_xor_a0, p, ++ /*temps*/v1+ofs, v3+ofs, /* reuse b*/v2+ofs); ++ } ++ ++ // Then we sum the results. ++ for (int i = 0; i < unrolls - 1; i++) { ++ int ofs = register_stride * i; ++ eor(v0, T16B, v0, v0 + register_stride + ofs); ++ } ++ ++ sub(blocks, blocks, (unsigned char)unrolls); ++ ++ // And finally bit-reverse the state back to big endian. ++ rev64(v0, T16B, v0); ++ rbit(v0, T16B, v0); ++ st1(v0, T16B, state); ++} +\ No newline at end of file +diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +index 2e2e8ae78..c024dec55 100644 +--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp ++++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +@@ -2804,6 +2804,266 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ // CTR AES crypt. ++ // Arguments: ++ // ++ // Inputs: ++ // c_rarg0 - source byte array address ++ // c_rarg1 - destination byte array address ++ // c_rarg2 - K (key) in little endian int array ++ // c_rarg3 - counter vector byte array address ++ // c_rarg4 - input length ++ // c_rarg5 - saved encryptedCounter start ++ // c_rarg6 - saved used length ++ // ++ // Output: ++ // r0 - input length ++ // ++ address generate_counterMode_AESCrypt() { ++ const Register in = c_rarg0; ++ const Register out = c_rarg1; ++ const Register key = c_rarg2; ++ const Register counter = c_rarg3; ++ const Register saved_len = c_rarg4, len = r10; ++ const Register saved_encrypted_ctr = c_rarg5; ++ const Register used_ptr = c_rarg6, used = r12; ++ ++ const Register offset = r7; ++ const Register keylen = r11; ++ ++ const unsigned char block_size = 16; ++ const int bulk_width = 4; ++ // NB: bulk_width can be 4 or 8. 
8 gives slightly faster ++ // performance with larger data sizes, but it also means that the ++ // fast path isn't used until you have at least 8 blocks, and up ++ // to 127 bytes of data will be executed on the slow path. For ++ // that reason, and also so as not to blow away too much icache, 4 ++ // blocks seems like a sensible compromise. ++ ++ // Algorithm: ++ // ++ // if (len == 0) { ++ // goto DONE; ++ // } ++ // int result = len; ++ // do { ++ // if (used >= blockSize) { ++ // if (len >= bulk_width * blockSize) { ++ // CTR_large_block(); ++ // if (len == 0) ++ // goto DONE; ++ // } ++ // for (;;) { ++ // 16ByteVector v0 = counter; ++ // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); ++ // used = 0; ++ // if (len < blockSize) ++ // break; /* goto NEXT */ ++ // 16ByteVector v1 = load16Bytes(in, offset); ++ // v1 = v1 ^ encryptedCounter; ++ // store16Bytes(out, offset); ++ // used = blockSize; ++ // offset += blockSize; ++ // len -= blockSize; ++ // if (len == 0) ++ // goto DONE; ++ // } ++ // } ++ // NEXT: ++ // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); ++ // len--; ++ // } while (len != 0); ++ // DONE: ++ // return result; ++ // ++ // CTR_large_block() ++ // Wide bulk encryption of whole blocks. ++ ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); ++ const address start = __ pc(); ++ __ enter(); ++ ++ Label DONE, CTR_large_block, large_block_return; ++ __ ldrw(used, Address(used_ptr)); ++ __ cbzw(saved_len, DONE); ++ ++ __ mov(len, saved_len); ++ __ mov(offset, 0); ++ ++ // Compute #rounds for AES based on the length of the key array ++ __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); ++ ++ __ aesenc_loadkeys(key, keylen); ++ ++ { ++ Label L_CTR_loop, NEXT; ++ ++ __ bind(L_CTR_loop); ++ ++ __ cmp(used, block_size); ++ __ br(__ LO, NEXT); ++ ++ // Maybe we have a lot of data ++ __ subsw(rscratch1, len, bulk_width * block_size); ++ __ br(__ HS, CTR_large_block); ++ __ BIND(large_block_return); ++ __ cbzw(len, DONE); ++ ++ // Setup the counter ++ __ movi(v4, __ T4S, 0); ++ __ movi(v5, __ T4S, 1); ++ __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 } ++ ++ __ ld1(v0, __ T16B, counter); // Load the counter into v0 ++ __ rev32(v16, __ T16B, v0); ++ __ addv(v16, __ T4S, v16, v4); ++ __ rev32(v16, __ T16B, v16); ++ __ st1(v16, __ T16B, counter); // Save the incremented counter back ++ ++ { ++ // We have fewer than bulk_width blocks of data left. Encrypt ++ // them one by one until there is less than a full block ++ // remaining, being careful to save both the encrypted counter ++ // and the counter. ++ ++ Label inner_loop; ++ __ bind(inner_loop); ++ // Counter to encrypt is in v0 ++ __ aesecb_encrypt(noreg, noreg, keylen); ++ __ st1(v0, __ T16B, saved_encrypted_ctr); ++ ++ // Do we have a remaining full block? 
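
For reference, the REV32 / ADDV / REV32 sequences in this stub amount to a big-endian increment of the last 32-bit word of the counter block; the lane-wise add of { 0, 0, 0, 1 } only touches lane 3 and cannot carry into the other lanes. A scalar model (illustrative sketch only; __builtin_bswap32 is a GCC/Clang builtin, not HotSpot code):

    #include <cstdint>
    #include <cstring>

    // Scalar model of REV32 / ADDV.4S {0,0,0,1} / REV32: the counter block
    // keeps a big-endian 32-bit counter in its last four bytes, and only
    // that word is bumped -- the lane-wise vector add cannot carry into
    // bytes 0..11.
    static void ctr32_increment(uint8_t block[16]) {
      uint32_t c;
      std::memcpy(&c, block + 12, sizeof(c));
      c = __builtin_bswap32(c) + 1;   // big-endian -> host order, then add
      c = __builtin_bswap32(c);       // back to big-endian
      std::memcpy(block + 12, &c, sizeof(c));
    }
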
++ ++ __ mov(used, 0); ++ __ cmp(len, block_size); ++ __ br(__ LO, NEXT); ++ ++ // Yes, we have a full block ++ __ ldrq(v1, Address(in, offset)); ++ __ eor(v1, __ T16B, v1, v0); ++ __ strq(v1, Address(out, offset)); ++ __ mov(used, block_size); ++ __ add(offset, offset, block_size); ++ ++ __ subw(len, len, block_size); ++ __ cbzw(len, DONE); ++ ++ // Increment the counter, store it back ++ __ orr(v0, __ T16B, v16, v16); ++ __ rev32(v16, __ T16B, v16); ++ __ addv(v16, __ T4S, v16, v4); ++ __ rev32(v16, __ T16B, v16); ++ __ st1(v16, __ T16B, counter); // Save the incremented counter back ++ ++ __ b(inner_loop); ++ } ++ ++ __ BIND(NEXT); ++ ++ // Encrypt a single byte, and loop. ++ // We expect this to be a rare event. ++ __ ldrb(rscratch1, Address(in, offset)); ++ __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); ++ __ eor(rscratch1, rscratch1, rscratch2); ++ __ strb(rscratch1, Address(out, offset)); ++ __ add(offset, offset, 1); ++ __ add(used, used, 1); ++ __ subw(len, len,1); ++ __ cbnzw(len, L_CTR_loop); ++ } ++ ++ __ bind(DONE); ++ __ strw(used, Address(used_ptr)); ++ __ mov(r0, saved_len); ++ ++ __ leave(); // required for proper stackwalking of RuntimeStub frame ++ __ ret(lr); ++ ++ // Bulk encryption ++ ++ __ BIND (CTR_large_block); ++ assert(bulk_width == 4 || bulk_width == 8, "must be"); ++ ++ if (bulk_width == 8) { ++ __ sub(sp, sp, 4 * 16); ++ __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); ++ } ++ __ sub(sp, sp, 4 * 16); ++ __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); ++ RegSet saved_regs = (RegSet::of(in, out, offset) ++ + RegSet::of(saved_encrypted_ctr, used_ptr, len)); ++ __ push(saved_regs, sp); ++ __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption ++ __ add(in, in, offset); ++ __ add(out, out, offset); ++ ++ // Keys should already be loaded into the correct registers ++ ++ __ ld1(v0, __ T16B, counter); // v0 contains the first counter ++ __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter ++ ++ // AES/CTR loop ++ { ++ Label L_CTR_loop; ++ __ BIND(L_CTR_loop); ++ ++ // Setup the counters ++ __ movi(v8, __ T4S, 0); ++ __ movi(v9, __ T4S, 1); ++ __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } ++ ++ for (FloatRegister f = v0; f < v0 + bulk_width; f++) { ++ __ rev32(f, __ T16B, v16); ++ __ addv(v16, __ T4S, v16, v8); ++ } ++ ++ __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); ++ ++ // Encrypt the counters ++ __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); ++ ++ if (bulk_width == 8) { ++ __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); ++ } ++ ++ // XOR the encrypted counters with the inputs ++ for (int i = 0; i < bulk_width; i++) { ++ __ eor(v0 + i, __ T16B, v0 + i, v8 + i); ++ } ++ ++ // Write the encrypted data ++ __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); ++ if (bulk_width == 8) { ++ __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); ++ } ++ ++ __ subw(len, len, 16 * bulk_width); ++ __ cbnzw(len, L_CTR_loop); ++ } ++ ++ // Save the counter back where it goes ++ __ rev32(v16, __ T16B, v16); ++ __ st1(v16, __ T16B, counter); ++ ++ __ pop(saved_regs, sp); ++ ++ __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); ++ if (bulk_width == 8) { ++ __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); ++ } ++ ++ __ andr(rscratch1, len, -16 * bulk_width); ++ __ sub(len, len, rscratch1); ++ __ add(offset, offset, rscratch1); ++ __ mov(used, 16); ++ __ strw(used, Address(used_ptr)); ++ __ b(large_block_return); ++ ++ return start; ++ } ++ ++ + // Arguments: + // + // Inputs: 
+@@ -3677,6 +3937,56 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ address generate_ghash_processBlocks_wide() { ++ address small = generate_ghash_processBlocks(); ++ ++ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); ++ __ align(wordSize * 2); ++ address p = __ pc(); ++ __ emit_int64(0x87); // The low-order bits of the field ++ // polynomial (i.e. p = z^7+z^2+z+1) ++ // repeated in the low and high parts of a ++ // 128-bit vector ++ __ emit_int64(0x87); ++ ++ __ align(CodeEntryAlignment); ++ address start = __ pc(); ++ ++ Register state = c_rarg0; ++ Register subkeyH = c_rarg1; ++ Register data = c_rarg2; ++ Register blocks = c_rarg3; ++ ++ const int unroll = 4; ++ ++ __ cmp(blocks, (unsigned char)(unroll * 2)); ++ __ br(__ LT, small); ++ ++ if (unroll > 1) { ++ // Save state before entering routine ++ __ sub(sp, sp, 4 * 16); ++ __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); ++ __ sub(sp, sp, 4 * 16); ++ __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); ++ } ++ ++ __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); ++ ++ if (unroll > 1) { ++ // And restore state ++ __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); ++ __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); ++ } ++ ++ __ cmp(blocks, 0u); ++ __ br(__ GT, small); ++ ++ __ ret(lr); ++ ++ return start; ++ } ++ ++ + // Continuation point for throwing of implicit exceptions that are + // not handled in the current activation. Fabricates an exception + // oop and initiates normal exception dispatching in this +@@ -4687,6 +4997,15 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_montgomerySquare = g.generate_multiply(); + } + ++ // generate GHASH intrinsics code ++ if (UseGHASHIntrinsics) { ++ if (UseAESCTRIntrinsics) { ++ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); ++ } else { ++ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); ++ } ++ } ++ + if (UseAESIntrinsics) { + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); +@@ -4694,9 +5013,8 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); + } + +- // generate GHASH intrinsics code +- if (UseGHASHIntrinsics) { +- StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); ++ if (UseAESCTRIntrinsics) { ++ StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); + } + + if (UseSHA1Intrinsics) { +diff --git a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp +index d1c312ab3..05619ce7f 100644 +--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp ++++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp +@@ -37,7 +37,7 @@ static bool returns_to_call_stub(address return_pc) { + + enum platform_dependent_constants { + code_size1 = 19000, // simply increase if too small (assembler will crash if too small) +- code_size2 = 22000 // simply increase if too small (assembler will crash if too small) ++ code_size2 = 32000 // simply increase if too small (assembler will crash if too small) + }; + + class aarch64 { +diff --git a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp +index 9808337a0..de636fb83 100644 +--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp ++++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp 
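
To spell out the dispatch that generate_ghash_processBlocks_wide wires up above: fewer than 2 * unroll blocks branch straight to the one-block stub; otherwise the wide loop runs until fewer than 2 * unroll blocks remain, the merge pass consumes one more batch, and any leftovers (at most unroll - 1 blocks) are handed to the small stub. A C-like sketch of that control flow, with illustrative function names rather than HotSpot symbols:

    #include <cstdint>

    // Stand-ins for the two generated stubs (names are illustrative).
    static void ghash_small(uint8_t* state, const uint8_t* subkeyH,
                            const uint8_t* data, int blocks) { /* 1 block/iter */ }
    static void ghash_wide4(uint8_t* state, const uint8_t* subkeyH,
                            const uint8_t* data)              { /* 4-way pass  */ }

    void ghash_dispatch(uint8_t* state, uint8_t* subkeyH,
                        const uint8_t* data, int blocks) {
      const int unroll = 4;
      if (blocks < 2 * unroll) {          // cmp(blocks, unroll * 2); br(LT, small)
        ghash_small(state, subkeyH, data, blocks);
        return;
      }
      do {                                // L_ghash_loop: 4 blocks per pass
        ghash_wide4(state, subkeyH, data);
        data += 16 * unroll;
        blocks -= unroll;
      } while (blocks >= 2 * unroll);
      ghash_wide4(state, subkeyH, data);  // merge pass eats one more batch
      data += 16 * unroll;
      blocks -= unroll;
      if (blocks > 0)                     // cmp(blocks, 0); br(GT, small)
        ghash_small(state, subkeyH, data, blocks);
    }
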
+@@ -233,12 +233,21 @@ void VM_Version::get_processor_features() { + warning("UseAESIntrinsics enabled, but UseAES not, enabling"); + UseAES = true; + } ++ if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } + } else { + if (UseAES) { +- warning("UseAES specified, but not supported on this CPU"); ++ warning("AES instructions are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAES, false); + } + if (UseAESIntrinsics) { +- warning("UseAESIntrinsics specified, but not supported on this CPU"); ++ warning("AES intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESIntrinsics, false); ++ } ++ if (UseAESCTRIntrinsics) { ++ warning("AES/CTR intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + } + +diff --git a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp +index b5ce1cfa9..fea8b1f87 100644 +--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp ++++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp +@@ -194,6 +194,11 @@ void VM_Version::initialize() { + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + ++ if (UseAESCTRIntrinsics) { ++ warning("AES/CTR intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } ++ + if (UseGHASHIntrinsics) { + warning("GHASH intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); +diff --git a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp +index bd893e138..08d7a7311 100644 +--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp ++++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp +@@ -319,6 +319,11 @@ void VM_Version::initialize() { + } + } + ++ if (UseAESCTRIntrinsics) { ++ warning("AES/CTR intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } ++ + // GHASH/GCM intrinsics + if (has_vis3() && (UseVIS > 2)) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { +diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp +index 1759ecdfd..ddc1acfd8 100644 +--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp ++++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp +@@ -2373,20 +2373,52 @@ void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { + + void Assembler::pextrd(Register dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); +- int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, false); ++ int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, false); + emit_int8(0x16); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8); + } + ++void Assembler::pextrd(Address dst, XMMRegister src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, false); ++ emit_int8(0x16); ++ emit_operand(src, dst); ++ emit_int8(imm8); ++} ++ + void Assembler::pextrq(Register dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); +- int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true); ++ int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, true); + emit_int8(0x16); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8); + } + ++void Assembler::pextrq(Address dst, XMMRegister src, int imm8) 
{ ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, true); ++ emit_int8(0x16); ++ emit_operand(src, dst); ++ emit_int8(imm8); ++} ++ ++void Assembler::pextrw(Address dst, XMMRegister src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A); ++ emit_int8((unsigned char)0x15); ++ emit_operand(src, dst); ++ emit_int8(imm8); ++} ++ ++void Assembler::pextrb(Address dst, XMMRegister src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A); ++ emit_int8(0x14); ++ emit_operand(src, dst); ++ emit_int8(imm8); ++} ++ + void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, false); +@@ -2395,6 +2427,14 @@ void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { + emit_int8(imm8); + } + ++void Assembler::pinsrd(XMMRegister dst, Address src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, false); ++ emit_int8(0x22); ++ emit_operand(dst,src); ++ emit_int8(imm8); ++} ++ + void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, true); +@@ -2403,6 +2443,30 @@ void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { + emit_int8(imm8); + } + ++void Assembler::pinsrq(XMMRegister dst, Address src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true); ++ emit_int8(0x22); ++ emit_operand(dst, src); ++ emit_int8(imm8); ++} ++ ++void Assembler::pinsrw(XMMRegister dst, Address src, int imm8) { ++ assert(VM_Version::supports_sse2(), ""); ++ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F); ++ emit_int8((unsigned char)0xC4); ++ emit_operand(dst, src); ++ emit_int8(imm8); ++} ++ ++void Assembler::pinsrb(XMMRegister dst, Address src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A); ++ emit_int8(0x20); ++ emit_operand(dst, src); ++ emit_int8(imm8); ++} ++ + void Assembler::pmovzxbw(XMMRegister dst, Address src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionMark im(this); +@@ -3075,6 +3139,12 @@ void Assembler::xorl(Register dst, Register src) { + emit_arith(0x33, 0xC0, dst, src); + } + ++void Assembler::xorb(Register dst, Address src) { ++ InstructionMark im(this); ++ prefix(src, dst); ++ emit_int8(0x32); ++ emit_operand(dst, src); ++} + + // AVX 3-operands scalar float-point arithmetic instructions + +diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.hpp +index 5ea01311e..c2e70bc2a 100644 +--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp ++++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp +@@ -1479,10 +1479,20 @@ private: + // SSE 4.1 extract + void pextrd(Register dst, XMMRegister src, int imm8); + void pextrq(Register dst, XMMRegister src, int imm8); ++ void pextrd(Address dst, XMMRegister src, int imm8); ++ void pextrq(Address dst, XMMRegister src, int imm8); ++ void pextrb(Address dst, XMMRegister src, int imm8); ++ // SSE 2 extract ++ void pextrw(Address dst, XMMRegister src, int imm8); + + // SSE 4.1 
insert + void pinsrd(XMMRegister dst, Register src, int imm8); + void pinsrq(XMMRegister dst, Register src, int imm8); ++ void pinsrd(XMMRegister dst, Address src, int imm8); ++ void pinsrq(XMMRegister dst, Address src, int imm8); ++ void pinsrb(XMMRegister dst, Address src, int imm8); ++ // SSE 2 insert ++ void pinsrw(XMMRegister dst, Address src, int imm8); + + // SSE4.1 packed move + void pmovzxbw(XMMRegister dst, XMMRegister src); +@@ -1687,6 +1697,8 @@ private: + void xorl(Register dst, Address src); + void xorl(Register dst, Register src); + ++ void xorb(Register dst, Address src); ++ + void xorq(Register dst, Address src); + void xorq(Register dst, Register src); + +diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp +index 2e5599807..f555f3326 100644 +--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp ++++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp +@@ -2153,6 +2153,17 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ address generate_counter_shuffle_mask() { ++ __ align(16); ++ StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask"); ++ address start = __ pc(); ++ __ emit_data(0x0c0d0e0f, relocInfo::none, 0); ++ __ emit_data(0x08090a0b, relocInfo::none, 0); ++ __ emit_data(0x04050607, relocInfo::none, 0); ++ __ emit_data(0x00010203, relocInfo::none, 0); ++ return start; ++ } ++ + // Utility routine for loading a 128-bit key word in little endian format + // can optionally specify that the shuffle mask is already in an xmmregister + void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { +@@ -2178,6 +2189,31 @@ class StubGenerator: public StubCodeGenerator { + __ aesdec(xmmdst, xmmtmp); + } + ++ // Utility routine for increase 128bit counter (iv in CTR mode) ++ // XMM_128bit, D3, D2, D1, D0 ++ void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) { ++ __ pextrd(reg, xmmdst, 0x0); ++ __ addl(reg, inc_delta); ++ __ pinsrd(xmmdst, reg, 0x0); ++ __ jcc(Assembler::carryClear, next_block); // jump if no carry ++ ++ __ pextrd(reg, xmmdst, 0x01); // Carry-> D1 ++ __ addl(reg, 0x01); ++ __ pinsrd(xmmdst, reg, 0x01); ++ __ jcc(Assembler::carryClear, next_block); // jump if no carry ++ ++ __ pextrd(reg, xmmdst, 0x02); // Carry-> D2 ++ __ addl(reg, 0x01); ++ __ pinsrd(xmmdst, reg, 0x02); ++ __ jcc(Assembler::carryClear, next_block); // jump if no carry ++ ++ __ pextrd(reg, xmmdst, 0x03); // Carry -> D3 ++ __ addl(reg, 0x01); ++ __ pinsrd(xmmdst, reg, 0x03); ++ ++ __ BIND(next_block); // next instruction ++ } ++ + + // Arguments: + // +@@ -2719,6 +2755,309 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ ++ // CTR AES crypt. 
++ // In 32-bit stub, parallelize 4 blocks at a time ++ // Arguments: ++ // ++ // Inputs: ++ // c_rarg0 - source byte array address ++ // c_rarg1 - destination byte array address ++ // c_rarg2 - K (key) in little endian int array ++ // c_rarg3 - counter vector byte array address ++ // c_rarg4 - input length ++ // ++ // Output: ++ // rax - input length ++ // ++ address generate_counterMode_AESCrypt_Parallel() { ++ assert(UseAES, "need AES instructions and misaligned SSE support"); ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); ++ address start = __ pc(); ++ const Register from = rsi; // source array address ++ const Register to = rdx; // destination array address ++ const Register key = rcx; // key array address ++ const Register counter = rdi; // counter byte array initialized from initvector array address ++ ++ // and left with the results of the last encryption block ++ const Register len_reg = rbx; ++ const Register pos = rax; ++ ++ __ enter(); // required for proper stackwalking of RuntimeStub frame ++ handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi ++ ++ // load registers from incoming parameters ++ const Address from_param(rbp, 8+0); ++ const Address to_param (rbp, 8+4); ++ const Address key_param (rbp, 8+8); ++ const Address rvec_param (rbp, 8+12); ++ const Address len_param (rbp, 8+16); ++ const Address saved_counter_param(rbp, 8 + 20); ++ const Address used_addr_param(rbp, 8 + 24); ++ ++ __ movptr(from , from_param); ++ __ movptr(to , to_param); ++ //__ movptr(key, key_param); ++ //__ movptr(counter, rvec_param); ++ __ movptr(len_reg , len_param); ++ //__ movptr(pos, 0); ++ ++ // Use the partially used encrpyted counter from last invocation ++ Label L_exit_preLoop, L_preLoop_start; ++ ++ // Use the registers 'counter' and 'key' here in this preloop ++ // to hold of last 2 params 'used' and 'saved_encCounter_start' ++ Register used = counter; ++ Register saved_encCounter_start = key; ++ Register used_addr = saved_encCounter_start; ++ ++ __ movptr(used_addr, used_addr_param); ++ __ movptr(used, Address(used_addr, 0)); ++ __ movptr(saved_encCounter_start, saved_counter_param); ++ ++ __ BIND(L_preLoop_start); ++ __ cmpptr(used, 16); ++ __ jcc(Assembler::aboveEqual, L_exit_preLoop); ++ __ cmpptr(len_reg, 0); ++ __ jcc(Assembler::lessEqual, L_exit_preLoop); ++ __ movb(rax, Address(saved_encCounter_start, used)); ++ __ xorb(rax, Address(from, 0)); ++ __ movb(Address(to, 0), rax); ++ __ addptr(from, 1); ++ __ addptr(to, 1); ++ __ addptr(used, 1); ++ __ subptr(len_reg, 1); ++ ++ __ jmp(L_preLoop_start); ++ ++ __ BIND(L_exit_preLoop); ++ __ movptr(used_addr, used_addr_param); ++ __ movptr(used_addr, used_addr_param); ++ __ movl(Address(used_addr, 0), used); ++ ++ // load the parameters 'key' and 'counter' ++ __ movptr(key, key_param); ++ __ movptr(counter, rvec_param); ++ ++ // xmm register assignments for the loops below ++ const XMMRegister xmm_curr_counter = xmm0; ++ const XMMRegister xmm_counter_shuf_mask = xmm1; // need to be reloaded ++ const XMMRegister xmm_key_shuf_mask = xmm2; // need to be reloaded ++ const XMMRegister xmm_key = xmm3; ++ const XMMRegister xmm_result0 = xmm4; ++ const XMMRegister xmm_result1 = xmm5; ++ const XMMRegister xmm_result2 = xmm6; ++ const XMMRegister xmm_result3 = xmm7; ++ const XMMRegister xmm_from0 = xmm1; //reuse XMM register ++ const XMMRegister xmm_from1 = xmm2; ++ const XMMRegister xmm_from2 = xmm3; ++ const XMMRegister xmm_from3 = xmm4; ++ ++ //for key_128, key_192, key_256 ++ const 
int rounds[3] = {10, 12, 14}; ++ Label L_singleBlockLoopTop[3]; ++ Label L_multiBlock_loopTop[3]; ++ Label L_key192_top, L_key256_top; ++ Label L_incCounter[3][4]; // 3: different key length, 4: 4 blocks at a time ++ Label L_incCounter_single[3]; //for single block, key128, key192, key256 ++ Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; ++ Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; ++ ++ Label L_exit; ++ const int PARALLEL_FACTOR = 4; //because of the limited register number ++ ++ // initialize counter with initial counter ++ __ movdqu(xmm_curr_counter, Address(counter, 0x00)); ++ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); ++ __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase ++ ++ // key length could be only {11, 13, 15} * 4 = {44, 52, 60} ++ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); ++ __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); ++ __ cmpl(rax, 52); ++ __ jcc(Assembler::equal, L_key192_top); ++ __ cmpl(rax, 60); ++ __ jcc(Assembler::equal, L_key256_top); ++ ++ //key128 begins here ++ __ movptr(pos, 0); // init pos before L_multiBlock_loopTop ++ ++#define CTR_DoFour(opc, src_reg) \ ++ __ opc(xmm_result0, src_reg); \ ++ __ opc(xmm_result1, src_reg); \ ++ __ opc(xmm_result2, src_reg); \ ++ __ opc(xmm_result3, src_reg); ++ ++ // k == 0 : generate code for key_128 ++ // k == 1 : generate code for key_192 ++ // k == 2 : generate code for key_256 ++ for (int k = 0; k < 3; ++k) { ++ //multi blocks starts here ++ __ align(OptoLoopAlignment); ++ __ BIND(L_multiBlock_loopTop[k]); ++ __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left ++ __ jcc(Assembler::less, L_singleBlockLoopTop[k]); ++ ++ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); ++ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); ++ ++ //load, then increase counters ++ CTR_DoFour(movdqa, xmm_curr_counter); ++ __ push(rbx); ++ inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]); ++ inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]); ++ inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]); ++ inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]); ++ __ pop (rbx); ++ ++ load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. 
interleaving for better performance ++ ++ CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle the counters back for PXOR ++ CTR_DoFour(pxor, xmm_key); //PXOR with Round 0 key ++ ++ for (int i = 1; i < rounds[k]; ++i) { ++ load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask); ++ CTR_DoFour(aesenc, xmm_key); ++ } ++ load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask); ++ CTR_DoFour(aesenclast, xmm_key); ++ ++ // get next PARALLEL_FACTOR blocks into xmm_from registers ++ __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); ++ __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); ++ __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); ++ ++ // PXOR with input text ++ __ pxor(xmm_result0, xmm_from0); //result0 is xmm4 ++ __ pxor(xmm_result1, xmm_from1); ++ __ pxor(xmm_result2, xmm_from2); ++ ++ // store PARALLEL_FACTOR results into the next 64 bytes of output ++ __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); ++ __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); ++ __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); ++ ++ // do it here after xmm_result0 is saved, because xmm_from3 reuses the same register as xmm_result0. ++ __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); ++ __ pxor(xmm_result3, xmm_from3); ++ __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); ++ ++ __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text ++ __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length ++ __ jmp(L_multiBlock_loopTop[k]); ++ ++ // singleBlock starts here ++ __ align(OptoLoopAlignment); ++ __ BIND(L_singleBlockLoopTop[k]); ++ __ cmpptr(len_reg, 0); ++ __ jcc(Assembler::equal, L_exit); ++ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); ++ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); ++ __ movdqa(xmm_result0, xmm_curr_counter); ++ load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); ++ __ push(rbx); //rbx is used for increasing counter ++ inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]); ++ __ pop (rbx); ++ __ pshufb(xmm_result0, xmm_counter_shuf_mask); ++ __ pxor(xmm_result0, xmm_key); ++ for (int i = 1; i < rounds[k]; i++) { ++ load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask); ++ __ aesenc(xmm_result0, xmm_key); ++ } ++ load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask); ++ __ aesenclast(xmm_result0, xmm_key); ++ __ cmpptr(len_reg, AESBlockSize); ++ __ jcc(Assembler::less, L_processTail_insr[k]); ++ __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); ++ __ pxor(xmm_result0, xmm_from0); ++ __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); ++ __ addptr(pos, AESBlockSize); ++ __ subptr(len_reg, AESBlockSize); ++ __ jmp(L_singleBlockLoopTop[k]); ++ ++ __ BIND(L_processTail_insr[k]); ++ __ addptr(pos, len_reg); ++ __ testptr(len_reg, 8); ++ __ jcc(Assembler::zero, L_processTail_4_insr[k]); ++ __ subptr(pos,8); ++ __ pinsrd(xmm_from0, Address(from, pos), 0); ++ __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1); ++ __ BIND(L_processTail_4_insr[k]); ++ __ testptr(len_reg, 4); ++ __ jcc(Assembler::zero, L_processTail_2_insr[k]); ++ __ subptr(pos,4); ++ __ pslldq(xmm_from0, 4); ++ __ pinsrd(xmm_from0, Address(from, pos), 0); ++ __ 
BIND(L_processTail_2_insr[k]); ++ __ testptr(len_reg, 2); ++ __ jcc(Assembler::zero, L_processTail_1_insr[k]); ++ __ subptr(pos, 2); ++ __ pslldq(xmm_from0, 2); ++ __ pinsrw(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_1_insr[k]); ++ __ testptr(len_reg, 1); ++ __ jcc(Assembler::zero, L_processTail_exit_insr[k]); ++ __ subptr(pos, 1); ++ __ pslldq(xmm_from0, 1); ++ __ pinsrb(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_exit_insr[k]); ++ ++ __ movptr(saved_encCounter_start, saved_counter_param); ++ __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); ++ __ pxor(xmm_result0, xmm_from0); ++ ++ __ testptr(len_reg, 8); ++ __ jcc(Assembler::zero, L_processTail_4_extr[k]); ++ __ pextrd(Address(to, pos), xmm_result0, 0); ++ __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1); ++ __ psrldq(xmm_result0, 8); ++ __ addptr(pos, 8); ++ __ BIND(L_processTail_4_extr[k]); ++ __ testptr(len_reg, 4); ++ __ jcc(Assembler::zero, L_processTail_2_extr[k]); ++ __ pextrd(Address(to, pos), xmm_result0, 0); ++ __ psrldq(xmm_result0, 4); ++ __ addptr(pos, 4); ++ __ BIND(L_processTail_2_extr[k]); ++ __ testptr(len_reg, 2); ++ __ jcc(Assembler::zero, L_processTail_1_extr[k]); ++ __ pextrb(Address(to, pos), xmm_result0, 0); ++ __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1); ++ __ psrldq(xmm_result0, 2); ++ __ addptr(pos, 2); ++ __ BIND(L_processTail_1_extr[k]); ++ __ testptr(len_reg, 1); ++ __ jcc(Assembler::zero, L_processTail_exit_extr[k]); ++ __ pextrb(Address(to, pos), xmm_result0, 0); ++ ++ __ BIND(L_processTail_exit_extr[k]); ++ __ movptr(used_addr, used_addr_param); ++ __ movl(Address(used_addr, 0), len_reg); ++ __ jmp(L_exit); ++ } ++ ++ __ BIND(L_exit); ++ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); ++ __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. 
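++ // Note: xmm_curr_counter is kept byte-reversed for the entire stub so that inc_counter can bump it with plain integer adds; the pshufb above restores the counter's original byte order before it is written back to the Java counter array below.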
++ __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back ++ handleSOERegisters(false /*restoring*/); ++ __ movptr(rax, len_param); // return length ++ __ leave(); // required for proper stackwalking of RuntimeStub frame ++ __ ret(0); ++ ++ __ BIND (L_key192_top); ++ __ movptr(pos, 0); // init pos before L_multiBlock_loopTop ++ __ jmp(L_multiBlock_loopTop[1]); //key192 ++ ++ __ BIND (L_key256_top); ++ __ movptr(pos, 0); // init pos before L_multiBlock_loopTop ++ __ jmp(L_multiBlock_loopTop[2]); //key256 ++ ++ return start; ++ } ++ ++ + // byte swap x86 long + address generate_ghash_long_swap_mask() { + __ align(CodeEntryAlignment); +@@ -3181,6 +3520,11 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); + } + ++ if (UseAESCTRIntrinsics) { ++ StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask(); ++ StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); ++ } ++ + // Generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); +diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp +index c5811b28b..254f63392 100644 +--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp ++++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp +@@ -3010,6 +3010,15 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ address generate_counter_shuffle_mask() { ++ __ align(16); ++ StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask"); ++ address start = __ pc(); ++ __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); ++ __ emit_data64(0x0001020304050607, relocInfo::none); ++ return start; ++ } ++ + // Utility routine for loading a 128-bit key word in little endian format + // can optionally specify that the shuffle mask is already in an xmmregister + void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { +@@ -3021,6 +3030,18 @@ class StubGenerator: public StubCodeGenerator { + } + } + ++ // Utility routine for increasing the 128-bit counter (the iv in CTR mode) ++ void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) { ++ __ pextrq(reg, xmmdst, 0x0); ++ __ addq(reg, inc_delta); ++ __ pinsrq(xmmdst, reg, 0x0); ++ __ jcc(Assembler::carryClear, next_block); // jump if no carry ++ __ pextrq(reg, xmmdst, 0x01); // Carry ++ __ addq(reg, 0x01); ++ __ pinsrq(xmmdst, reg, 0x01); //Carry end ++ __ BIND(next_block); // next instruction ++ } ++ + // Arguments: + // + // Inputs: +@@ -3639,6 +3660,320 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time ++ // to hide instruction latency ++ // ++ // Arguments: ++ // ++ // Inputs: ++ // c_rarg0 - source byte array address ++ // c_rarg1 - destination byte array address ++ // c_rarg2 - K (key) in little endian int array ++ // c_rarg3 - counter vector byte array address ++ // Linux ++ // c_rarg4 - input length ++ // c_rarg5 - saved encryptedCounter start ++ // rbp + 6 * wordSize - saved used length ++ // Windows ++ // rbp + 6 * wordSize - input length ++ // rbp + 7 * wordSize - saved encryptedCounter start ++ // rbp + 8 * wordSize - saved used length ++ // ++ // Output: ++ // rax - input length ++ // ++ address generate_counterMode_AESCrypt_Parallel() { ++ assert(UseAES, "need AES instructions
and misaligned SSE support"); ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); ++ address start = __ pc(); ++ const Register from = c_rarg0; // source array address ++ const Register to = c_rarg1; // destination array address ++ const Register key = c_rarg2; // key array address ++ const Register counter = c_rarg3; // counter byte array initialized from counter array address ++ // and left with the results of the last encryption block ++#ifndef _WIN64 ++ const Register len_reg = c_rarg4; ++ const Register saved_encCounter_start = c_rarg5; ++ const Register used_addr = r10; ++ const Address used_mem(rbp, 2 * wordSize); ++ const Register used = r11; ++#else ++ const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 ++ const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter is on stack on Win64 ++ const Address used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64 ++ const Register len_reg = r10; // pick the first volatile windows register ++ const Register saved_encCounter_start = r11; ++ const Register used_addr = r13; ++ const Register used = r14; ++#endif ++ const Register pos = rax; ++ ++ const int PARALLEL_FACTOR = 6; ++ const XMMRegister xmm_counter_shuf_mask = xmm0; ++ const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front ++ const XMMRegister xmm_curr_counter = xmm2; ++ ++ const XMMRegister xmm_key_tmp0 = xmm3; ++ const XMMRegister xmm_key_tmp1 = xmm4; ++ ++ // registers holding the six results in the parallelized loop ++ const XMMRegister xmm_result0 = xmm5; ++ const XMMRegister xmm_result1 = xmm6; ++ const XMMRegister xmm_result2 = xmm7; ++ const XMMRegister xmm_result3 = xmm8; ++ const XMMRegister xmm_result4 = xmm9; ++ const XMMRegister xmm_result5 = xmm10; ++ ++ const XMMRegister xmm_from0 = xmm11; ++ const XMMRegister xmm_from1 = xmm12; ++ const XMMRegister xmm_from2 = xmm13; ++ const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64. ++ const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. 
Because xmm_key_tmp0~1 are not needed while the input text is loaded ++ const XMMRegister xmm_from5 = xmm4; ++ ++ //for key_128, key_192, key_256 ++ const int rounds[3] = {10, 12, 14}; ++ Label L_exit_preLoop, L_preLoop_start; ++ Label L_multiBlock_loopTop[3]; ++ Label L_singleBlockLoopTop[3]; ++ Label L__incCounter[3][6]; //for 6 blocks ++ Label L__incCounter_single[3]; //for single block, key128, key192, key256 ++ Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; ++ Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; ++ ++ Label L_exit; ++ ++ __ enter(); // required for proper stackwalking of RuntimeStub frame ++ ++#ifdef _WIN64 ++ // save the xmm registers which must be preserved 6-14 ++ const int XMM_REG_NUM_KEY_LAST = 14; ++ __ subptr(rsp, -rsp_after_call_off * wordSize); ++ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { ++ __ movdqu(xmm_save(i), as_XMMRegister(i)); ++ } ++ ++ const Address r13_save(rbp, rdi_off * wordSize); ++ const Address r14_save(rbp, rsi_off * wordSize); ++ ++ __ movptr(r13_save, r13); ++ __ movptr(r14_save, r14); ++ ++ // on win64, fill len_reg from stack position ++ __ movl(len_reg, len_mem); ++ __ movptr(saved_encCounter_start, saved_encCounter_mem); ++ __ movptr(used_addr, used_mem); ++ __ movl(used, Address(used_addr, 0)); ++#else ++ __ push(len_reg); // Save ++ __ movptr(used_addr, used_mem); ++ __ movl(used, Address(used_addr, 0)); ++#endif ++ ++ __ push(rbx); // Save RBX ++ __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter ++ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); ++ __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled ++ __ movptr(pos, 0); ++ ++ // Use the partially used encrypted counter from the last invocation ++ __ BIND(L_preLoop_start); ++ __ cmpptr(used, 16); ++ __ jcc(Assembler::aboveEqual, L_exit_preLoop); ++ __ cmpptr(len_reg, 0); ++ __ jcc(Assembler::lessEqual, L_exit_preLoop); ++ __ movb(rbx, Address(saved_encCounter_start, used)); ++ __ xorb(rbx, Address(from, pos)); ++ __ movb(Address(to, pos), rbx); ++ __ addptr(pos, 1); ++ __ addptr(used, 1); ++ __ subptr(len_reg, 1); ++ ++ __ jmp(L_preLoop_start); ++ ++ __ BIND(L_exit_preLoop); ++ __ movl(Address(used_addr, 0), used); ++ ++ // key length could be only {11, 13, 15} * 4 = {44, 52, 60} ++ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); ++ __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); ++ __ cmpl(rbx, 52); ++ __ jcc(Assembler::equal, L_multiBlock_loopTop[1]); ++ __ cmpl(rbx, 60); ++ __ jcc(Assembler::equal, L_multiBlock_loopTop[2]); ++ ++#define CTR_DoSix(opc, src_reg) \ ++ __ opc(xmm_result0, src_reg); \ ++ __ opc(xmm_result1, src_reg); \ ++ __ opc(xmm_result2, src_reg); \ ++ __ opc(xmm_result3, src_reg); \ ++ __ opc(xmm_result4, src_reg); \ ++ __ opc(xmm_result5, src_reg); ++ ++ // k == 0 : generate code for key_128 ++ // k == 1 : generate code for key_192 ++ // k == 2 : generate code for key_256 ++ for (int k = 0; k < 3; ++k) { ++ //multi blocks starts here ++ __ align(OptoLoopAlignment); ++ __ BIND(L_multiBlock_loopTop[k]); ++ __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left ++ __ jcc(Assembler::less, L_singleBlockLoopTop[k]); ++ load_key(xmm_key_tmp0, key, 
0x00, xmm_key_shuf_mask); ++ ++ //load, then increase counters ++ CTR_DoSix(movdqa, xmm_curr_counter); ++ inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]); ++ inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]); ++ inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]); ++ inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]); ++ inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]); ++ inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]); ++ CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle the counters back for PXOR ++ CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key ++ ++ //load two ROUND_KEYs at a time ++ for (int i = 1; i < rounds[k]; ) { ++ load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask); ++ load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask); ++ CTR_DoSix(aesenc, xmm_key_tmp1); ++ i++; ++ if (i != rounds[k]) { ++ CTR_DoSix(aesenc, xmm_key_tmp0); ++ } else { ++ CTR_DoSix(aesenclast, xmm_key_tmp0); ++ } ++ i++; ++ } ++ ++ // get next PARALLEL_FACTOR blocks into the xmm_from registers ++ __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); ++ __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); ++ __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); ++ __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); ++ __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize)); ++ __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize)); ++ ++ __ pxor(xmm_result0, xmm_from0); ++ __ pxor(xmm_result1, xmm_from1); ++ __ pxor(xmm_result2, xmm_from2); ++ __ pxor(xmm_result3, xmm_from3); ++ __ pxor(xmm_result4, xmm_from4); ++ __ pxor(xmm_result5, xmm_from5); ++ ++ // store 6 results into the next 96 bytes of output ++ __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); ++ __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); ++ __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); ++ __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); ++ __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4); ++ __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5); ++ ++ __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text ++ __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length ++ __ jmp(L_multiBlock_loopTop[k]); ++ ++ // singleBlock starts here ++ __ align(OptoLoopAlignment); ++ __ BIND(L_singleBlockLoopTop[k]); ++ __ cmpptr(len_reg, 0); ++ __ jcc(Assembler::lessEqual, L_exit); ++ load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); ++ __ movdqa(xmm_result0, xmm_curr_counter); ++ inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]); ++ __ pshufb(xmm_result0, xmm_counter_shuf_mask); ++ __ pxor(xmm_result0, xmm_key_tmp0); ++ for (int i = 1; i < rounds[k]; i++) { ++ load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask); ++ __ aesenc(xmm_result0, xmm_key_tmp0); ++ } ++ load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask); ++ __ aesenclast(xmm_result0, xmm_key_tmp0); ++ __ cmpptr(len_reg, AESBlockSize); ++ __ jcc(Assembler::less, L_processTail_insr[k]); ++ __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); ++ __ pxor(xmm_result0, xmm_from0); ++ __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); ++ __ addptr(pos, 
AESBlockSize); ++ __ subptr(len_reg, AESBlockSize); ++ __ jmp(L_singleBlockLoopTop[k]); ++ __ BIND(L_processTail_insr[k]); ++ __ addptr(pos, len_reg); ++ __ testptr(len_reg, 8); ++ __ jcc(Assembler::zero, L_processTail_4_insr[k]); ++ __ subptr(pos,8); ++ __ pinsrq(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_4_insr[k]); ++ __ testptr(len_reg, 4); ++ __ jcc(Assembler::zero, L_processTail_2_insr[k]); ++ __ subptr(pos,4); ++ __ pslldq(xmm_from0, 4); ++ __ pinsrd(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_2_insr[k]); ++ __ testptr(len_reg, 2); ++ __ jcc(Assembler::zero, L_processTail_1_insr[k]); ++ __ subptr(pos, 2); ++ __ pslldq(xmm_from0, 2); ++ __ pinsrw(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_1_insr[k]); ++ __ testptr(len_reg, 1); ++ __ jcc(Assembler::zero, L_processTail_exit_insr[k]); ++ __ subptr(pos, 1); ++ __ pslldq(xmm_from0, 1); ++ __ pinsrb(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_exit_insr[k]); ++ ++ __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); ++ __ pxor(xmm_result0, xmm_from0); ++ ++ __ testptr(len_reg, 8); ++ __ jcc(Assembler::zero, L_processTail_4_extr[k]); ++ __ pextrq(Address(to, pos), xmm_result0, 0); ++ __ psrldq(xmm_result0, 8); ++ __ addptr(pos, 8); ++ __ BIND(L_processTail_4_extr[k]); ++ __ testptr(len_reg, 4); ++ __ jcc(Assembler::zero, L_processTail_2_extr[k]); ++ __ pextrd(Address(to, pos), xmm_result0, 0); ++ __ psrldq(xmm_result0, 4); ++ __ addptr(pos, 4); ++ __ BIND(L_processTail_2_extr[k]); ++ __ testptr(len_reg, 2); ++ __ jcc(Assembler::zero, L_processTail_1_extr[k]); ++ __ pextrw(Address(to, pos), xmm_result0, 0); ++ __ psrldq(xmm_result0, 2); ++ __ addptr(pos, 2); ++ __ BIND(L_processTail_1_extr[k]); ++ __ testptr(len_reg, 1); ++ __ jcc(Assembler::zero, L_processTail_exit_extr[k]); ++ __ pextrb(Address(to, pos), xmm_result0, 0); ++ ++ __ BIND(L_processTail_exit_extr[k]); ++ __ movl(Address(used_addr, 0), len_reg); ++ __ jmp(L_exit); ++ ++ } ++ ++ __ BIND(L_exit); ++ __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. ++ __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back ++ __ pop(rbx); // pop the saved RBX. 
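++ // (The Win64 ABI treats xmm6-xmm15 and r13/r14 as callee-saved, hence the restore block below; on Linux the input length was pushed at entry and is popped straight into rax as the return value.)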
++#ifdef _WIN64 ++ // restore regs belonging to calling function ++ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { ++ __ movdqu(as_XMMRegister(i), xmm_save(i)); ++ } ++ __ movl(rax, len_mem); ++ __ movptr(r13, r13_save); ++ __ movptr(r14, r14_save); ++#else ++ __ pop(rax); // return 'len' ++#endif ++ __ leave(); // required for proper stackwalking of RuntimeStub frame ++ __ ret(0); ++ return start; ++ } + + // byte swap x86 long + address generate_ghash_long_swap_mask() { +@@ -4239,12 +4574,15 @@ class StubGenerator: public StubCodeGenerator { + // don't bother generating these AES intrinsic stubs unless global flag is set + if (UseAESIntrinsics) { + StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others +- + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); + } ++ if (UseAESCTRIntrinsics){ ++ StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask(); ++ StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); ++ } + + // Generate GHASH intrinsics code + if (UseGHASHIntrinsics) { +diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp +index 9b0d8fc75..617879377 100644 +--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp ++++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp +@@ -33,6 +33,7 @@ + + address StubRoutines::x86::_verify_mxcsr_entry = NULL; + address StubRoutines::x86::_key_shuffle_mask_addr = NULL; ++address StubRoutines::x86::_counter_shuffle_mask_addr = NULL; + address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL; + address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL; + +diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp +index bb160486c..70b5a34ac 100644 +--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp ++++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp +@@ -33,6 +33,10 @@ + static address _verify_mxcsr_entry; + // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers + static address _key_shuffle_mask_addr; ++ ++ //shuffle mask for big-endian 128-bit integers ++ static address _counter_shuffle_mask_addr; ++ + // masks and table for CRC32 + static uint64_t _crc_by128_masks[]; + static juint _crc_table[]; +@@ -43,6 +47,7 @@ + public: + static address verify_mxcsr_entry() { return _verify_mxcsr_entry; } + static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; } ++ static address counter_shuffle_mask_addr() { return _counter_shuffle_mask_addr; } + static address crc_by128_masks_addr() { return (address)_crc_by128_masks; } + static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; } + static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; } +diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp +index bca5d493c..538f83e69 100644 +--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp ++++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp +@@ -31,7 +31,7 @@ + + enum platform_dependent_constants { + code_size1 = 9000, // simply increase if too small (assembler will crash if too small) +- code_size2 = 22000 // simply increase if too 
small (assembler will crash if too small) ++ code_size2 = 25800 // simply increase if too small (assembler will crash if too small) + }; + + class x86 { +diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp +index b048fd74e..f963cd2f8 100644 +--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp ++++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp +@@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _ + + enum platform_dependent_constants { + code_size1 = 19000, // simply increase if too small (assembler will crash if too small) +- code_size2 = 24000 // simply increase if too small (assembler will crash if too small) ++ code_size2 = 27000 // simply increase if too small (assembler will crash if too small) + }; + + class x86 { +diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp +index 46b3e32ea..ce3037d76 100644 +--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp ++++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp +@@ -573,6 +573,28 @@ void VM_Version::get_processor_features() { + } + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } ++ ++ // --AES-CTR begins-- ++ if (!UseAESIntrinsics) { ++ if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ warning("AES-CTR intrinsics require UseAESIntrinsics flag to be enabled. Intrinsics will be disabled."); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } ++ } else { ++ if (supports_sse4_1() && UseSSE >= 4) { ++ if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, true); ++ } ++ } else { ++ // The AES-CTR intrinsic stubs require AES instruction support (of course) ++ // but also require SSE4.1 mode or higher for the instructions they use. ++ if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ warning("X86 AES-CTR intrinsics require SSE4.1 instructions or higher. Intrinsics will be disabled."); ++ } ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } ++ } ++ // --AES-CTR ends-- + } + } else if (UseAES || UseAESIntrinsics) { + if (UseAES && !FLAG_IS_DEFAULT(UseAES)) { +@@ -583,6 +605,10 @@ void VM_Version::get_processor_features() { + warning("AES intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } ++ if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ warning("AES-CTR intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } + } + + // Use CLMUL instructions if available. 
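For context, the Java loop that this flag machinery ultimately gates is CounterMode.implCrypt(), which appears later in this patch. A simplified sketch of that loop (field names as in the CounterMode.java diff below; blockSize is 16 for AES, and increment() stands in for the big-endian counter bump that the stubs' inc_counter performs in assembly):

    // Sketch of the scalar Java fallback the counterMode_AESCrypt stub replaces.
    private int implCrypt(byte[] in, int inOff, int len, byte[] out, int outOff) {
        int result = len;
        while (len-- > 0) {
            // refresh the encrypted counter once all 16 of its bytes are consumed
            if (used >= blockSize) {
                embeddedCipher.encryptBlock(counter, 0, encryptedCounter, 0);
                increment(counter);
                used = 0;
            }
            // CTR keystream: XOR one input byte with one encrypted-counter byte
            out[outOff++] = (byte) (in[inOff++] ^ encryptedCounter[used++]);
        }
        return result;
    }

The stubs above vectorize exactly this byte-at-a-time loop, which is why they must keep the used count and the saved encryptedCounter block in sync with the Java fields between invocations.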
+@@ -606,6 +632,16 @@ void VM_Version::get_processor_features() { + FLAG_SET_DEFAULT(UseCRC32Intrinsics, false); + } + ++ if (UseAESIntrinsics) { ++ if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ UseAESCTRIntrinsics = true; ++ } ++ } else if (UseAESCTRIntrinsics) { ++ if (!FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) ++ warning("AES/CTR intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } ++ + // GHASH/GCM intrinsics + if (UseCLMUL && (UseSSE > 2)) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { +diff --git a/hotspot/src/share/vm/classfile/vmSymbols.hpp b/hotspot/src/share/vm/classfile/vmSymbols.hpp +index 942d172a1..4ca2a3ad4 100644 +--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp ++++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp +@@ -846,6 +846,10 @@ + do_name( decrypt_name, "implDecrypt") \ + do_signature(byteArray_int_int_byteArray_int_signature, "([BII[BI)I") \ + \ ++ do_class(com_sun_crypto_provider_counterMode, "com/sun/crypto/provider/CounterMode") \ ++ do_intrinsic(_counterMode_AESCrypt, com_sun_crypto_provider_counterMode, crypt_name, byteArray_int_int_byteArray_int_signature, F_R) \ ++ do_name( crypt_name, "implCrypt") \ ++ \ + /* support for sun.security.provider.SHA */ \ + do_class(sun_security_provider_sha, "sun/security/provider/SHA") \ + do_intrinsic(_sha_implCompress, sun_security_provider_sha, implCompress_name, implCompress_signature, F_R) \ +diff --git a/hotspot/src/share/vm/opto/escape.cpp b/hotspot/src/share/vm/opto/escape.cpp +index 6f8ffe608..a0e497f08 100644 +--- a/hotspot/src/share/vm/opto/escape.cpp ++++ b/hotspot/src/share/vm/opto/escape.cpp +@@ -952,6 +952,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { + strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 || + strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 || ++ strcmp(call->as_CallLeaf()->_name, "counterMode_AESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 || + strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 || + strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 || +diff --git a/hotspot/src/share/vm/opto/library_call.cpp b/hotspot/src/share/vm/opto/library_call.cpp +index bb721f6f1..2add82dd1 100644 +--- a/hotspot/src/share/vm/opto/library_call.cpp ++++ b/hotspot/src/share/vm/opto/library_call.cpp +@@ -196,6 +196,7 @@ class LibraryCallKit : public GraphKit { + return generate_method_call(method_id, true, false); + } + Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static); ++ Node * field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls); + + Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2); + Node* make_string_method_node(int opcode, Node* str1, Node* str2); +@@ -309,7 +310,9 @@ class LibraryCallKit : public GraphKit { + bool inline_reference_get(); + bool inline_aescrypt_Block(vmIntrinsics::ID id); + bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id); ++ bool inline_counterMode_AESCrypt(vmIntrinsics::ID id); + Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting); ++ Node* inline_counterMode_AESCrypt_predicate(); + Node* get_key_start_from_aescrypt_object(Node* aescrypt_object); + Node* 
get_original_key_start_from_aescrypt_object(Node* aescrypt_object); + bool inline_ghash_processBlocks(); +@@ -558,6 +561,13 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { + predicates = 1; + break; + ++ case vmIntrinsics::_counterMode_AESCrypt: ++ if (!UseAESCTRIntrinsics) { ++ return NULL; ++ } ++ predicates = 1; ++ break; ++ + case vmIntrinsics::_sha_implCompress: + if (!UseSHA1Intrinsics) return NULL; + break; +@@ -950,6 +960,9 @@ bool LibraryCallKit::try_to_inline(int predicate) { + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + return inline_cipherBlockChaining_AESCrypt(intrinsic_id()); + ++ case vmIntrinsics::_counterMode_AESCrypt: ++ return inline_counterMode_AESCrypt(intrinsic_id()); ++ + case vmIntrinsics::_sha_implCompress: + case vmIntrinsics::_sha2_implCompress: + case vmIntrinsics::_sha5_implCompress: +@@ -1021,6 +1034,8 @@ Node* LibraryCallKit::try_to_predicate(int predicate) { + return inline_cipherBlockChaining_AESCrypt_predicate(false); + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + return inline_cipherBlockChaining_AESCrypt_predicate(true); ++ case vmIntrinsics::_counterMode_AESCrypt: ++ return inline_counterMode_AESCrypt_predicate(); + case vmIntrinsics::_digestBase_implCompressMB: + return inline_digestBase_implCompressMB_predicate(predicate); + +@@ -6581,6 +6596,39 @@ Node * LibraryCallKit::load_field_from_object(Node * fromObj, const char * field + return loadedField; + } + ++Node * LibraryCallKit::field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, ++ bool is_exact = true, bool is_static = false, ++ ciInstanceKlass * fromKls = NULL) { ++ if (fromKls == NULL) { ++ const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr(); ++ assert(tinst != NULL, "obj is null"); ++ assert(tinst->klass()->is_loaded(), "obj is not loaded"); ++ assert(!is_exact || tinst->klass_is_exact(), "klass not exact"); ++ fromKls = tinst->klass()->as_instance_klass(); ++ } ++ else { ++ assert(is_static, "only for static field access"); ++ } ++ ciField* field = fromKls->get_field_by_name(ciSymbol::make(fieldName), ++ ciSymbol::make(fieldTypeString), ++ is_static); ++ ++ assert(field != NULL, "undefined field"); ++ assert(!field->is_volatile(), "not defined for volatile fields"); ++ ++ if (is_static) { ++ const TypeInstPtr* tip = TypeInstPtr::make(fromKls->java_mirror()); ++ fromObj = makecon(tip); ++ } ++ ++ // Next code copied from Parse::do_get_xxx(): ++ ++ // Compute address and memory type. 
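++ // (Note: unlike load_field_from_object() above, this helper returns the field's address rather than its value; inline_counterMode_AESCrypt() below uses it so the stub can read and update CounterMode.used in place across calls.)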
++ int offset = field->offset_in_bytes(); ++ Node *adr = basic_plus_adr(fromObj, fromObj, offset); ++ ++ return adr; ++} + + //------------------------------inline_aescrypt_Block----------------------- + bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) { +@@ -6747,6 +6795,90 @@ bool LibraryCallKit::inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id) { + return true; + } + ++//------------------------------inline_counterMode_AESCrypt----------------------- ++bool LibraryCallKit::inline_counterMode_AESCrypt(vmIntrinsics::ID id) { ++ assert(UseAES, "need AES instruction support"); ++ if (!UseAESCTRIntrinsics) return false; ++ ++ address stubAddr = NULL; ++ const char *stubName = NULL; ++ if (id == vmIntrinsics::_counterMode_AESCrypt) { ++ stubAddr = StubRoutines::counterMode_AESCrypt(); ++ stubName = "counterMode_AESCrypt"; ++ } ++ if (stubAddr == NULL) return false; ++ ++ Node* counterMode_object = argument(0); ++ Node* src = argument(1); ++ Node* src_offset = argument(2); ++ Node* len = argument(3); ++ Node* dest = argument(4); ++ Node* dest_offset = argument(5); ++ ++ // (1) src and dest are arrays. ++ const Type* src_type = src->Value(&_gvn); ++ const Type* dest_type = dest->Value(&_gvn); ++ const TypeAryPtr* top_src = src_type->isa_aryptr(); ++ const TypeAryPtr* top_dest = dest_type->isa_aryptr(); ++ assert(top_src != NULL && top_src->klass() != NULL && ++ top_dest != NULL && top_dest->klass() != NULL, "args are strange"); ++ ++ // checks are the responsibility of the caller ++ Node* src_start = src; ++ Node* dest_start = dest; ++ if (src_offset != NULL || dest_offset != NULL) { ++ assert(src_offset != NULL && dest_offset != NULL, ""); ++ src_start = array_element_address(src, src_offset, T_BYTE); ++ dest_start = array_element_address(dest, dest_offset, T_BYTE); ++ } ++ ++ // if we are in this set of code, we "know" the embeddedCipher is an AESCrypt object ++ // (because of the predicated logic executed earlier). ++ // so we cast it here safely. 
++ // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java ++ Node* embeddedCipherObj = load_field_from_object(counterMode_object, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false); ++ if (embeddedCipherObj == NULL) return false; ++ // cast it to what we know it will be at runtime ++ const TypeInstPtr* tinst = _gvn.type(counterMode_object)->isa_instptr(); ++ assert(tinst != NULL, "CTR obj is null"); ++ assert(tinst->klass()->is_loaded(), "CTR obj is not loaded"); ++ ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt")); ++ assert(klass_AESCrypt->is_loaded(), "predicate checks that this class is loaded"); ++ ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass(); ++ const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_AESCrypt); ++ const TypeOopPtr* xtype = aklass->as_instance_type(); ++ Node* aescrypt_object = new (C) CheckCastPPNode(control(), embeddedCipherObj, xtype); ++ aescrypt_object = _gvn.transform(aescrypt_object); ++ // we need to get the start of the aescrypt_object's expanded key array ++ Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object); ++ if (k_start == NULL) return false; ++ // similarly, get the start address of the counter vector ++ Node* obj_counter = load_field_from_object(counterMode_object, "counter", "[B", /*is_exact*/ false); ++ if (obj_counter == NULL) return false; ++ Node* cnt_start = array_element_address(obj_counter, intcon(0), T_BYTE); ++ ++ Node* saved_encCounter = load_field_from_object(counterMode_object, "encryptedCounter", "[B", /*is_exact*/ false); ++ if (saved_encCounter == NULL) return false; ++ Node* saved_encCounter_start = array_element_address(saved_encCounter, intcon(0), T_BYTE); ++ Node* used = field_address_from_object(counterMode_object, "used", "I", /*is_exact*/ false); ++ ++ Node* ctrCrypt; ++ if (Matcher::pass_original_key_for_aes()) { ++ // no SPARC version for AES/CTR intrinsics now. ++ return false; ++ } ++ // Call the stub, passing src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start and used ++ ctrCrypt = make_runtime_call(RC_LEAF|RC_NO_FP, ++ OptoRuntime::counterMode_aescrypt_Type(), ++ stubAddr, stubName, TypePtr::BOTTOM, ++ src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start, used); ++ ++ // return cipher length (int) ++ Node* retvalue = _gvn.transform(new (C) ProjNode(ctrCrypt, TypeFunc::Parms)); ++ set_result(retvalue); ++ return true; ++} ++ + //------------------------------get_key_start_from_aescrypt_object----------------------- + Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) { + #ifdef PPC64 +@@ -6841,6 +6973,48 @@ Node* LibraryCallKit::inline_cipherBlockChaining_AESCrypt_predicate(bool decrypt + return _gvn.transform(region); + } + ++//----------------------------inline_counterMode_AESCrypt_predicate---------------------------- ++// Return node representing slow path of predicate check. ++// the pseudo code we want to emulate with this predicate is: ++// for encryption: ++// if (embeddedCipherObj instanceof AESCrypt) do_intrinsic, else do_javapath ++// for decryption: ++// if ((embeddedCipherObj instanceof AESCrypt) && (cipher!=plain)) do_intrinsic, else do_javapath ++// note cipher==plain is more conservative than the original java code but that's OK ++// ++ ++Node* LibraryCallKit::inline_counterMode_AESCrypt_predicate() { ++ // The receiver was checked for NULL already. 
++ Node* objCTR = argument(0); ++ ++ // Load the embeddedCipher field of the CounterMode object. ++ Node* embeddedCipherObj = load_field_from_object(objCTR, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false); ++ ++ // get AESCrypt klass for instanceOf check ++ // AESCrypt might not be loaded yet if some other SymmetricCipher got us to this compile point ++ // will have same classloader as the CounterMode object ++ const TypeInstPtr* tinst = _gvn.type(objCTR)->isa_instptr(); ++ assert(tinst != NULL, "CTRobj is null"); ++ assert(tinst->klass()->is_loaded(), "CTRobj is not loaded"); ++ ++ // we want to do an instanceof comparison against the AESCrypt class ++ ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt")); ++ if (!klass_AESCrypt->is_loaded()) { ++ // if AESCrypt is not even loaded, we never take the intrinsic fast path ++ Node* ctrl = control(); ++ set_control(top()); // no regular fast path ++ return ctrl; ++ } ++ ++ ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass(); ++ Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt))); ++ Node* cmp_instof = _gvn.transform(new (C) CmpINode(instof, intcon(1))); ++ Node* bool_instof = _gvn.transform(new (C) BoolNode(cmp_instof, BoolTest::ne)); ++ Node* instof_false = generate_guard(bool_instof, NULL, PROB_MIN); ++ ++ return instof_false; // even if it is NULL ++} ++ + //------------------------------inline_ghash_processBlocks + bool LibraryCallKit::inline_ghash_processBlocks() { + address stubAddr; +diff --git a/hotspot/src/share/vm/opto/runtime.cpp b/hotspot/src/share/vm/opto/runtime.cpp +index 0a86211ba..1c51be19b 100644 +--- a/hotspot/src/share/vm/opto/runtime.cpp ++++ b/hotspot/src/share/vm/opto/runtime.cpp +@@ -1021,6 +1021,35 @@ const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { + return TypeFunc::make(domain, range); + } + ++//for counterMode calls of aescrypt encrypt/decrypt, six pointers and a length, returning int ++const TypeFunc* OptoRuntime::counterMode_aescrypt_Type() { ++ // create input type (domain) ++ int num_args = 7; ++ if (Matcher::pass_original_key_for_aes()) { ++ num_args = 8; ++ } ++ int argcnt = num_args; ++ const Type** fields = TypeTuple::fields(argcnt); ++ int argp = TypeFunc::Parms; ++ fields[argp++] = TypePtr::NOTNULL; // src ++ fields[argp++] = TypePtr::NOTNULL; // dest ++ fields[argp++] = TypePtr::NOTNULL; // k array ++ fields[argp++] = TypePtr::NOTNULL; // counter array ++ fields[argp++] = TypeInt::INT; // src len ++ fields[argp++] = TypePtr::NOTNULL; // saved_encCounter ++ fields[argp++] = TypePtr::NOTNULL; // saved used addr ++ if (Matcher::pass_original_key_for_aes()) { ++ fields[argp++] = TypePtr::NOTNULL; // original k array ++ } ++ assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); ++ const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields); ++ // returning cipher len (int) ++ fields = TypeTuple::fields(1); ++ fields[TypeFunc::Parms + 0] = TypeInt::INT; ++ const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields); ++ return TypeFunc::make(domain, range); ++} ++ + /* + * void implCompress(byte[] buf, int ofs) + */ +diff --git a/hotspot/src/share/vm/opto/runtime.hpp b/hotspot/src/share/vm/opto/runtime.hpp +index 47133d58c..f27e7d507 100644 +--- a/hotspot/src/share/vm/opto/runtime.hpp ++++ b/hotspot/src/share/vm/opto/runtime.hpp +@@ -299,6 +299,7 @@ private: + + static const 
TypeFunc* aescrypt_block_Type(); + static const TypeFunc* cipherBlockChaining_aescrypt_Type(); ++ static const TypeFunc* counterMode_aescrypt_Type(); + + static const TypeFunc* sha_implCompress_Type(); + static const TypeFunc* digestBase_implCompressMB_Type(); +diff --git a/hotspot/src/share/vm/runtime/globals.hpp b/hotspot/src/share/vm/runtime/globals.hpp +index 65dfcf69b..91e52f033 100644 +--- a/hotspot/src/share/vm/runtime/globals.hpp ++++ b/hotspot/src/share/vm/runtime/globals.hpp +@@ -734,6 +734,9 @@ class CommandLineFlags { + product(bool, UseAESIntrinsics, false, \ + "Use intrinsics for AES versions of crypto") \ + \ ++ product(bool, UseAESCTRIntrinsics, false, \ ++ "Use intrinsics for the paralleled version of AES/CTR crypto") \ ++ \ + product(bool, UseSHA1Intrinsics, false, \ + "Use intrinsics for SHA-1 crypto hash function") \ + \ +diff --git a/hotspot/src/share/vm/runtime/stubRoutines.cpp b/hotspot/src/share/vm/runtime/stubRoutines.cpp +index f2106d13a..d66237137 100644 +--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp ++++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp +@@ -124,6 +124,7 @@ address StubRoutines::_aescrypt_encryptBlock = NULL; + address StubRoutines::_aescrypt_decryptBlock = NULL; + address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL; + address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL; ++address StubRoutines::_counterMode_AESCrypt = NULL; + address StubRoutines::_ghash_processBlocks = NULL; + + address StubRoutines::_sha1_implCompress = NULL; +diff --git a/hotspot/src/share/vm/runtime/stubRoutines.hpp b/hotspot/src/share/vm/runtime/stubRoutines.hpp +index 16075d9f4..9fb589540 100644 +--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp ++++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp +@@ -202,6 +202,7 @@ class StubRoutines: AllStatic { + static address _aescrypt_decryptBlock; + static address _cipherBlockChaining_encryptAESCrypt; + static address _cipherBlockChaining_decryptAESCrypt; ++ static address _counterMode_AESCrypt; + static address _ghash_processBlocks; + + static address _sha1_implCompress; +@@ -370,6 +371,7 @@ class StubRoutines: AllStatic { + static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; } + static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; } + static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; } ++ static address counterMode_AESCrypt() { return _counterMode_AESCrypt; } + static address ghash_processBlocks() { return _ghash_processBlocks; } + + static address sha1_implCompress() { return _sha1_implCompress; } +diff --git a/hotspot/src/share/vm/runtime/vmStructs.cpp b/hotspot/src/share/vm/runtime/vmStructs.cpp +index 3f2bfeb74..842b5840d 100644 +--- a/hotspot/src/share/vm/runtime/vmStructs.cpp ++++ b/hotspot/src/share/vm/runtime/vmStructs.cpp +@@ -815,6 +815,7 @@ typedef TwoOopHashtable SymbolTwoOopHashtable; + static_field(StubRoutines, _aescrypt_decryptBlock, address) \ + static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \ + static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \ ++ static_field(StubRoutines, _counterMode_AESCrypt, address) \ + static_field(StubRoutines, _ghash_processBlocks, address) \ + static_field(StubRoutines, _updateBytesCRC32, address) \ + static_field(StubRoutines, _crc_table_adr, address) \ +diff --git a/hotspot/test/compiler/7184394/TestAESBase.java b/hotspot/test/compiler/7184394/TestAESBase.java +index 
5c3e6881e..afda2a1f7 100644 +--- a/hotspot/test/compiler/7184394/TestAESBase.java ++++ b/hotspot/test/compiler/7184394/TestAESBase.java +@@ -106,8 +106,8 @@ abstract public class TestAESBase { + cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); + dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); + +- // CBC init +- if (mode.equals("CBC")) { ++ // CBC or CTR init ++ if (mode.equals("CBC") || mode.equals("CTR")) { + IvParameterSpec initVector = new IvParameterSpec(iv); + cipher.init(Cipher.ENCRYPT_MODE, key, initVector); + algParams = cipher.getParameters(); +diff --git a/hotspot/test/compiler/7184394/TestAESMain.java b/hotspot/test/compiler/7184394/TestAESMain.java +index ddd8eeaef..65949420a 100644 +--- a/hotspot/test/compiler/7184394/TestAESMain.java ++++ b/hotspot/test/compiler/7184394/TestAESMain.java +@@ -48,6 +48,13 @@ + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencOutputOffset=1 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DdecOutputOffset=1 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain + * + * @author Tom Deneau + */ +diff --git a/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java b/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java +index aea9336c9..c2bd38a71 100644 +--- a/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java ++++ b/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java +@@ -39,10 +39,10 @@ import java.security.InvalidKeyException; + * @author Andreas Sterbenz + * @since 1.4.2 + */ +-final class CounterMode extends FeedbackCipher { ++class CounterMode extends FeedbackCipher { + + // current counter value +- private final byte[] counter; ++ final byte[] counter; + + // encrypted bytes of the previous counter value + private final byte[] encryptedCounter; +@@ -137,7 +137,7 @@ final class CounterMode extends FeedbackCipher { + * cipherOffset. 
+ * + * @param in the buffer with the input data to be encrypted +- * @param inOffset the offset in plain ++ * @param inOff the offset in plain + * @param len the length of the input data + * @param out the buffer for the result + * @param outOff the offset in cipher +@@ -176,6 +176,11 @@ final class CounterMode extends FeedbackCipher { + RangeUtil.nullAndBoundsCheck(in, inOff, len); + RangeUtil.nullAndBoundsCheck(out, outOff, len); + ++ return implCrypt(in, inOff, len, out, outOff); ++ } ++ ++ // Implementation of crypt() method. Possibly replaced with a compiler intrinsic. ++ private int implCrypt(byte[] in, int inOff, int len, byte[] out, int outOff) { + int result = len; + while (len-- > 0) { + if (used >= blockSize) { +diff --git a/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java b/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java +index f8a3eaa0a..6a394e448 100644 +--- a/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java ++++ b/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2013, 2017 Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it +@@ -29,52 +29,43 @@ + + package com.sun.crypto.provider; + +-import java.security.*; +-import javax.crypto.*; ++import javax.crypto.IllegalBlockSizeException; + import static com.sun.crypto.provider.AESConstants.AES_BLOCK_SIZE; + + /** + * This class represents the GCTR function defined in NIST 800-38D +- * under section 6.5. It needs to be constructed w/ an initialized +- * cipher object, and initial counter block(ICB). Given an input X +- * of arbitrary length, it processes and returns an output which has +- * the same length as X. The invariants of this class are: +- * +- * (1) The length of intialCounterBlk (and also of its clones, e.g., +- * fields counter and counterSave) is equal to AES_BLOCK_SIZE. +- * +- * (2) After construction, the field counter never becomes null, it +- * always contains a byte array of length AES_BLOCK_SIZE. ++ * under section 6.5. With a given cipher object and initial counter ++ * block, a counter mode operation is performed. Blocksize is limited ++ * to 16 bytes. + * + * If any invariant is broken, failures can occur because the + * AESCrypt.encryptBlock method can be intrinsified on the HotSpot VM + * (see JDK-8067648 for details). + * ++ * The counter mode operations can be intrinsified and parallelized ++ * by using CounterMode.implCrypt() if the HotSpot VM supports it on the ++ * architecture. ++ * + *

This function is used in the implementation of GCM mode. + * + * @since 1.8 + */ +-final class GCTR { +- +- // these fields should not change after the object has been constructed +- private final SymmetricCipher aes; +- private final byte[] icb; +- +- // the current counter value +- private byte[] counter; ++final class GCTR extends CounterMode { + +- // needed for save/restore calls +- private byte[] counterSave = null; +- +- // NOTE: cipher should already be initialized + GCTR(SymmetricCipher cipher, byte[] initialCounterBlk) { +- this.aes = cipher; ++ super(cipher); + if (initialCounterBlk.length != AES_BLOCK_SIZE) { + throw new RuntimeException("length of initial counter block (" + initialCounterBlk.length + + ") not equal to AES_BLOCK_SIZE (" + AES_BLOCK_SIZE + ")"); + } +- this.icb = initialCounterBlk; +- this.counter = icb.clone(); ++ ++ iv = initialCounterBlk; ++ reset(); ++ } ++ ++ @Override ++ String getFeedback() { ++ return "GCTR"; + } + + // input must be multiples of 128-bit blocks when calling update +@@ -89,23 +80,11 @@ final class GCTR { + throw new RuntimeException("output buffer too small"); + } + +- byte[] encryptedCntr = new byte[AES_BLOCK_SIZE]; +- +- int numOfCompleteBlocks = inLen / AES_BLOCK_SIZE; +- for (int i = 0; i < numOfCompleteBlocks; i++) { +- aes.encryptBlock(counter, 0, encryptedCntr, 0); +- for (int n = 0; n < AES_BLOCK_SIZE; n++) { +- int index = (i * AES_BLOCK_SIZE + n); +- out[outOfs + index] = +- (byte) ((in[inOfs + index] ^ encryptedCntr[n])); +- } +- GaloisCounterMode.increment32(counter); +- } +- return inLen; ++ return encrypt(in, inOfs, inLen, out, outOfs); + } + + // input can be arbitrary size when calling doFinal +- protected int doFinal(byte[] in, int inOfs, int inLen, byte[] out, ++ int doFinal(byte[] in, int inOfs, int inLen, byte[] out, + int outOfs) throws IllegalBlockSizeException { + try { + if (inLen < 0) { +@@ -118,7 +97,7 @@ final class GCTR { + if (lastBlockSize != 0) { + // do the last partial block + byte[] encryptedCntr = new byte[AES_BLOCK_SIZE]; +- aes.encryptBlock(counter, 0, encryptedCntr, 0); ++ embeddedCipher.encryptBlock(counter, 0, encryptedCntr, 0); + for (int n = 0; n < lastBlockSize; n++) { + out[outOfs + completeBlkLen + n] = + (byte) ((in[inOfs + completeBlkLen + n] ^ +@@ -131,28 +110,4 @@ final class GCTR { + } + return inLen; + } +- +- /** +- * Resets the content of this object to when it's first constructed. +- */ +- void reset() { +- System.arraycopy(icb, 0, counter, 0, icb.length); +- counterSave = null; +- } +- +- /** +- * Save the current content of this object. +- */ +- void save() { +- this.counterSave = this.counter.clone(); +- } +- +- /** +- * Restores the content of this object to the previous saved one. 
+- */ +- void restore() { +- if (this.counterSave != null) { +- this.counter = this.counterSave; +- } +- } + } +diff --git a/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java b/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java +index dc42e6bbf..78f0723d7 100644 +--- a/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java ++++ b/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java +@@ -122,10 +122,10 @@ final class GHASH { + + } + +- /* subkeyH and state are stored in long[] for GHASH intrinsic use */ ++ /* subkeyHtbl and state are stored in long[] for GHASH intrinsic use */ + +- // hash subkey H; should not change after the object has been constructed +- private final long[] subkeyH; ++ // hashtable subkeyHtbl; holds 2*9 powers of subkeyH computed using carry-less multiplication ++ private long[] subkeyHtbl; + + // buffer for storing hash + private final long[] state; +@@ -147,9 +147,9 @@ final class GHASH { + throw new ProviderException("Internal error"); + } + state = new long[2]; +- this.subkeyH = new long[2]; +- this.subkeyH[0] = getLong(subkeyH, 0); +- this.subkeyH[1] = getLong(subkeyH, 8); ++ subkeyHtbl = new long[2*9]; ++ subkeyHtbl[0] = getLong(subkeyH, 0); ++ subkeyHtbl[1] = getLong(subkeyH, 8); + } + + /** +@@ -192,8 +192,8 @@ final class GHASH { + if (inLen == 0) { + return; + } +- ghashRangeCheck(in, inOfs, inLen, state, subkeyH); +- processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyH); ++ ghashRangeCheck(in, inOfs, inLen, state, subkeyHtbl); ++ processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyHtbl); + } + + private static void ghashRangeCheck(byte[] in, int inOfs, int inLen, long[] st, long[] subH) { +@@ -217,8 +217,8 @@ final class GHASH { + throw new RuntimeException("internal state has invalid length: " + + st.length); + } +- if (subH.length != 2) { +- throw new RuntimeException("internal subkeyH has invalid length: " + ++ if (subH.length != 18) { ++ throw new RuntimeException("internal subkeyHtbl has invalid length: " + + subH.length); + } + } +diff --git a/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java b/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java +index ab93e3097..dd2618455 100644 +--- a/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java ++++ b/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java +@@ -439,6 +439,8 @@ public final class SSLSocketImpl + if (!conContext.isNegotiated) { + readHandshakeRecord(); + } ++ } catch (InterruptedIOException iioe) { ++ handleException(iioe); + } catch (IOException ioe) { + throw conContext.fatal(Alert.HANDSHAKE_FAILURE, + "Couldn't kickstart handshaking", ioe); +@@ -1309,12 +1311,11 @@ public final class SSLSocketImpl + } + } catch (SSLException ssle) { + throw ssle; ++ } catch (InterruptedIOException iioe) { ++ // don't change exception in case of timeouts or interrupts ++ throw iioe; + } catch (IOException ioe) { +- if (!(ioe instanceof SSLException)) { +- throw new SSLException("readHandshakeRecord", ioe); +- } else { +- throw ioe; +- } ++ throw new SSLException("readHandshakeRecord", ioe); + } + } + +@@ -1375,6 +1376,9 @@ public final class SSLSocketImpl + } + } catch (SSLException ssle) { + throw ssle; ++ } catch (InterruptedIOException iioe) { ++ // don't change exception in case of timeouts or interrupts ++ throw iioe; + } catch (IOException ioe) { + if (!(ioe instanceof SSLException)) { + throw new SSLException("readApplicationRecord", ioe); +diff --git a/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java 
b/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java +index 401822759..ab5712acc 100644 +--- a/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java ++++ b/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java +@@ -26,6 +26,7 @@ + package sun.security.ssl; + + import java.io.EOFException; ++import java.io.InterruptedIOException; + import java.io.IOException; + import java.io.InputStream; + import java.io.OutputStream; +@@ -47,37 +48,31 @@ import sun.security.ssl.SSLCipher.SSLReadCipher; + final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + private InputStream is = null; + private OutputStream os = null; +- private final byte[] temporary = new byte[1024]; ++ private final byte[] header = new byte[headerSize]; ++ private int headerOff = 0; ++ // Cache for incomplete record body. ++ private ByteBuffer recordBody = ByteBuffer.allocate(1024); + + private boolean formatVerified = false; // SSLv2 ruled out? + + // Cache for incomplete handshake messages. + private ByteBuffer handshakeBuffer = null; + +- private boolean hasHeader = false; // Had read the record header +- + SSLSocketInputRecord(HandshakeHash handshakeHash) { + super(handshakeHash, SSLReadCipher.nullTlsReadCipher()); + } + + @Override + int bytesInCompletePacket() throws IOException { +- if (!hasHeader) { +- // read exactly one record +- try { +- int really = read(is, temporary, 0, headerSize); +- if (really < 0) { +- // EOF: peer shut down incorrectly +- return -1; +- } +- } catch (EOFException eofe) { +- // The caller will handle EOF. +- return -1; +- } +- hasHeader = true; ++ // read header ++ try { ++ readHeader(); ++ } catch (EOFException eofe) { ++ // The caller will handle EOF. ++ return -1; + } + +- byte byteZero = temporary[0]; ++ byte byteZero = header[0]; + int len = 0; + + /* +@@ -93,9 +88,9 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + * Last sanity check that it's not a wild record + */ + if (!ProtocolVersion.isNegotiable( +- temporary[1], temporary[2], false)) { ++ header[1], header[2], false)) { + throw new SSLException("Unrecognized record version " + +- ProtocolVersion.nameOf(temporary[1], temporary[2]) + ++ ProtocolVersion.nameOf(header[1], header[2]) + + " , plaintext connection?"); + } + +@@ -109,8 +104,8 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + /* + * One of the SSLv3/TLS message types. + */ +- len = ((temporary[3] & 0xFF) << 8) + +- (temporary[4] & 0xFF) + headerSize; ++ len = ((header[3] & 0xFF) << 8) + ++ (header[4] & 0xFF) + headerSize; + } else { + /* + * Must be SSLv2 or something unknown. +@@ -121,11 +116,11 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + */ + boolean isShort = ((byteZero & 0x80) != 0); + +- if (isShort && ((temporary[2] == 1) || (temporary[2] == 4))) { ++ if (isShort && ((header[2] == 1) || (header[2] == 4))) { + if (!ProtocolVersion.isNegotiable( +- temporary[3], temporary[4], false)) { ++ header[3], header[4], false)) { + throw new SSLException("Unrecognized record version " + +- ProtocolVersion.nameOf(temporary[3], temporary[4]) + ++ ProtocolVersion.nameOf(header[3], header[4]) + + " , plaintext connection?"); + } + +@@ -138,9 +133,9 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + // + // int mask = (isShort ? 0x7F : 0x3F); + // len = ((byteZero & mask) << 8) + +- // (temporary[1] & 0xFF) + (isShort ? 2 : 3); ++ // (header[1] & 0xFF) + (isShort ? 
2 : 3); + // +- len = ((byteZero & 0x7F) << 8) + (temporary[1] & 0xFF) + 2; ++ len = ((byteZero & 0x7F) << 8) + (header[1] & 0xFF) + 2; + } else { + // Gobblygook! + throw new SSLException( +@@ -160,34 +155,41 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + return null; + } + +- if (!hasHeader) { +- // read exactly one record +- int really = read(is, temporary, 0, headerSize); +- if (really < 0) { +- throw new EOFException("SSL peer shut down incorrectly"); +- } +- hasHeader = true; +- } ++ // read header ++ readHeader(); + +- Plaintext plaintext = null; +- if (!formatVerified) { +- formatVerified = true; ++ Plaintext[] plaintext = null; ++ boolean cleanInBuffer = true; ++ try { ++ if (!formatVerified) { ++ formatVerified = true; + +- /* +- * The first record must either be a handshake record or an +- * alert message. If it's not, it is either invalid or an +- * SSLv2 message. +- */ +- if ((temporary[0] != ContentType.HANDSHAKE.id) && +- (temporary[0] != ContentType.ALERT.id)) { +- hasHeader = false; +- return handleUnknownRecord(temporary); ++ /* ++ * The first record must either be a handshake record or an ++ * alert message. If it's not, it is either invalid or an ++ * SSLv2 message. ++ */ ++ if ((header[0] != ContentType.HANDSHAKE.id) && ++ (header[0] != ContentType.ALERT.id)) { ++ plaintext = handleUnknownRecord(); ++ } + } +- } + +- // The record header should has consumed. +- hasHeader = false; +- return decodeInputRecord(temporary); ++ // The record header should have been consumed. ++ if (plaintext == null) { ++ plaintext = decodeInputRecord(); ++ } ++ } catch (InterruptedIOException e) { ++ // do not clean header and recordBody in case of socket timeout ++ cleanInBuffer = false; ++ throw e; ++ } finally { ++ if (cleanInBuffer) { ++ headerOff = 0; ++ recordBody.clear(); ++ } ++ } ++ return plaintext; + } + + @Override +@@ -200,9 +202,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + this.os = outputStream; + } + +- // Note that destination may be null +- private Plaintext[] decodeInputRecord( +- byte[] header) throws IOException, BadPaddingException { ++ private Plaintext[] decodeInputRecord() throws IOException, BadPaddingException { + byte contentType = header[0]; // pos: 0 + byte majorVersion = header[1]; // pos: 1 + byte minorVersion = header[2]; // pos: 2 +@@ -227,30 +227,27 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + } + + // +- // Read a complete record.
++ // Read a complete record and store it in recordBody. ++ // recordBody caches the incoming record so that it can be ++ // restored if a read operation times out. + // +- ByteBuffer destination = ByteBuffer.allocate(headerSize + contentLen); +- int dstPos = destination.position(); +- destination.put(temporary, 0, headerSize); +- while (contentLen > 0) { +- int howmuch = Math.min(temporary.length, contentLen); +- int really = read(is, temporary, 0, howmuch); +- if (really < 0) { +- throw new EOFException("SSL peer shut down incorrectly"); ++ if (recordBody.position() == 0) { ++ if (recordBody.capacity() < contentLen) { ++ recordBody = ByteBuffer.allocate(contentLen); + } +- +- destination.put(temporary, 0, howmuch); +- contentLen -= howmuch; ++ recordBody.limit(contentLen); ++ } else { ++ contentLen = recordBody.remaining(); + } +- destination.flip(); +- destination.position(dstPos + headerSize); ++ readFully(contentLen); ++ recordBody.flip(); + + if (SSLLogger.isOn && SSLLogger.isOn("record")) { + SSLLogger.fine( + "READ: " + + ProtocolVersion.nameOf(majorVersion, minorVersion) + + " " + ContentType.nameOf(contentType) + ", length = " + +- destination.remaining()); ++ recordBody.remaining()); + } + + // +@@ -259,7 +256,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + ByteBuffer fragment; + try { + Plaintext plaintext = +- readCipher.decrypt(contentType, destination, null); ++ readCipher.decrypt(contentType, recordBody, null); + fragment = plaintext.fragment; + contentType = plaintext.contentType; + } catch (BadPaddingException bpe) { +@@ -368,8 +365,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + }; + } + +- private Plaintext[] handleUnknownRecord( +- byte[] header) throws IOException, BadPaddingException { ++ private Plaintext[] handleUnknownRecord() throws IOException, BadPaddingException { + byte firstByte = header[0]; + byte thirdByte = header[2]; + +@@ -411,32 +407,29 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + } + + int msgLen = ((header[0] & 0x7F) << 8) | (header[1] & 0xFF); +- +- ByteBuffer destination = ByteBuffer.allocate(headerSize + msgLen); +- destination.put(temporary, 0, headerSize); +- msgLen -= 3; // had read 3 bytes of content as header +- while (msgLen > 0) { +- int howmuch = Math.min(temporary.length, msgLen); +- int really = read(is, temporary, 0, howmuch); +- if (really < 0) { +- throw new EOFException("SSL peer shut down incorrectly"); ++ if (recordBody.position() == 0) { ++ if (recordBody.capacity() < (headerSize + msgLen)) { ++ recordBody = ByteBuffer.allocate(headerSize + msgLen); + } +- +- destination.put(temporary, 0, howmuch); +- msgLen -= howmuch; ++ recordBody.limit(headerSize + msgLen); ++ recordBody.put(header, 0, headerSize); ++ } else { ++ msgLen = recordBody.remaining(); + } +- destination.flip(); ++ msgLen -= 3; // 3 bytes of content were already read as the header ++ readFully(msgLen); ++ recordBody.flip(); + + /* + * If we can map this into a V3 ClientHello, read and + * hash the rest of the V2 handshake, turn it into a + * V3 ClientHello message, and pass it up.
+ */ +- destination.position(2); // exclude the header +- handshakeHash.receive(destination); +- destination.position(0); ++ recordBody.position(2); // exclude the header ++ handshakeHash.receive(recordBody); ++ recordBody.position(0); + +- ByteBuffer converted = convertToClientHello(destination); ++ ByteBuffer converted = convertToClientHello(recordBody); + + if (SSLLogger.isOn && SSLLogger.isOn("packet")) { + SSLLogger.fine( +@@ -456,28 +449,42 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + } + } + +- // Read the exact bytes of data, otherwise, return -1. +- private static int read(InputStream is, +- byte[] buffer, int offset, int len) throws IOException { +- int n = 0; +- while (n < len) { +- int readLen = is.read(buffer, offset + n, len - n); +- if (readLen < 0) { +- if (SSLLogger.isOn && SSLLogger.isOn("packet")) { +- SSLLogger.fine("Raw read: EOF"); +- } +- return -1; ++ // Read exactly len bytes of data, otherwise, throw IOException. ++ private int readFully(int len) throws IOException { ++ int end = len + recordBody.position(); ++ int off = recordBody.position(); ++ try { ++ while (off < end) { ++ off += read(is, recordBody.array(), off, end - off); + } ++ } finally { ++ recordBody.position(off); ++ } ++ return len; ++ } ++ ++ // Read the SSL record header, otherwise, throw IOException. ++ private int readHeader() throws IOException { ++ while (headerOff < headerSize) { ++ headerOff += read(is, header, headerOff, headerSize - headerOff); ++ } ++ return headerSize; ++ } + ++ private static int read(InputStream is, byte[] buf, int off, int len) throws IOException { ++ int readLen = is.read(buf, off, len); ++ if (readLen < 0) { + if (SSLLogger.isOn && SSLLogger.isOn("packet")) { +- ByteBuffer bb = ByteBuffer.wrap(buffer, offset + n, readLen); +- SSLLogger.fine("Raw read", bb); ++ SSLLogger.fine("Raw read: EOF"); + } +- +- n += readLen; ++ throw new EOFException("SSL peer shut down incorrectly"); + } + +- return n; ++ if (SSLLogger.isOn && SSLLogger.isOn("packet")) { ++ ByteBuffer bb = ByteBuffer.wrap(buf, off, readLen); ++ SSLLogger.fine("Raw read", bb); ++ } ++ return readLen; + } + + // Try to use up the input stream without impact the performance too much. +diff --git a/jdk/src/share/classes/sun/security/ssl/SSLTransport.java b/jdk/src/share/classes/sun/security/ssl/SSLTransport.java +index b3d03b370..78e13ea2c 100644 +--- a/jdk/src/share/classes/sun/security/ssl/SSLTransport.java ++++ b/jdk/src/share/classes/sun/security/ssl/SSLTransport.java +@@ -27,6 +27,7 @@ package sun.security.ssl; + + import java.io.EOFException; + import java.io.IOException; ++import java.io.InterruptedIOException; + import java.nio.ByteBuffer; + import javax.crypto.AEADBadTagException; + import javax.crypto.BadPaddingException; +@@ -134,6 +135,9 @@ interface SSLTransport { + } catch (EOFException eofe) { + // rethrow EOFException, the call will handle it if neede. + throw eofe; ++ } catch (InterruptedIOException iioe) { ++ // don't close the Socket in case of timeouts or interrupts. ++ throw iioe; + } catch (IOException ioe) { + throw context.fatal(Alert.UNEXPECTED_MESSAGE, ioe); + } +diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java +new file mode 100644 +index 000000000..258672f59 +--- /dev/null ++++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java +@@ -0,0 +1,128 @@ ++/* ++ * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. ++ */ ++package org.openjdk.bench.javax.crypto.full; ++ ++import org.openjdk.jmh.annotations.Benchmark; ++import org.openjdk.jmh.annotations.Param; ++import org.openjdk.jmh.annotations.Setup; ++ ++import javax.crypto.Cipher; ++import javax.crypto.spec.GCMParameterSpec; ++import javax.crypto.spec.SecretKeySpec; ++ ++/** ++ * This performance test runs AES/GCM encryption and decryption using byte[] ++ * as input and output buffers for single and multi-part testing. ++ * ++ * This test rotates the IV and creates a new GCMParameterSpec for each encrypt ++ * benchmark operation. ++ */ ++ ++public class AESGCMBench extends CryptoBase { ++ ++ @Param({"128"}) ++ private int keyLength; ++ ++ @Param({"1024", "1500", "4096", "16384"}) ++ private int dataSize; ++ ++ byte[] encryptedData; ++ byte[] in, out; ++ private Cipher encryptCipher; ++ private Cipher decryptCipher; ++ SecretKeySpec ks; ++ GCMParameterSpec gcm_spec; ++ byte[] iv; ++ ++ private static final int IV_BUFFER_SIZE = 32; ++ private static final int IV_MODULO = IV_BUFFER_SIZE - 16; ++ int iv_index = 0; ++ int updateLen = 0; ++ ++ private int next_iv_index() { ++ int r = iv_index; ++ iv_index = (iv_index + 1) % IV_MODULO; ++ return r; ++ } ++ ++ @Setup ++ public void setup() throws Exception { ++ setupProvider(); ++ ++ // Setup key material ++ byte[] keystring = fillSecureRandom(new byte[keyLength / 8]); ++ ks = new SecretKeySpec(keystring, "AES"); ++ iv = fillSecureRandom(new byte[IV_BUFFER_SIZE]); ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ ++ // Setup Cipher classes ++ encryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ decryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters().
++ getParameterSpec(GCMParameterSpec.class)); ++ ++ // Setup input/output buffers ++ in = fillRandom(new byte[dataSize]); ++ encryptedData = new byte[encryptCipher.getOutputSize(in.length)]; ++ out = new byte[encryptedData.length]; ++ encryptCipher.doFinal(in, 0, in.length, encryptedData, 0); ++ updateLen = in.length / 2; ++ ++ } ++ ++ @Benchmark ++ public void encrypt() throws Exception { ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ encryptCipher.doFinal(in, 0, in.length, out, 0); ++ } ++ ++ @Benchmark ++ public void encryptMultiPart() throws Exception { ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ int outOfs = encryptCipher.update(in, 0, updateLen, out, 0); ++ encryptCipher.doFinal(in, updateLen, in.length - updateLen, ++ out, outOfs); ++ } ++ ++ @Benchmark ++ public void decrypt() throws Exception { ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters(). ++ getParameterSpec(GCMParameterSpec.class)); ++ decryptCipher.doFinal(encryptedData, 0, encryptedData.length, out, 0); ++ } ++ ++ @Benchmark ++ public void decryptMultiPart() throws Exception { ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters(). ++ getParameterSpec(GCMParameterSpec.class)); ++ decryptCipher.update(encryptedData, 0, updateLen, out, 0); ++ decryptCipher.doFinal(encryptedData, updateLen, ++ encryptedData.length - updateLen, out, 0); ++ } ++} +\ No newline at end of file +diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java +new file mode 100644 +index 000000000..cb6d20c51 +--- /dev/null ++++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java +@@ -0,0 +1,163 @@ ++/* ++ * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. Oracle designates this ++ * particular file as subject to the "Classpath" exception as provided ++ * by Oracle in the LICENSE file that accompanied this code. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. 
++ */ ++package org.openjdk.bench.javax.crypto.full; ++ ++import org.openjdk.jmh.annotations.Benchmark; ++import org.openjdk.jmh.annotations.Param; ++import org.openjdk.jmh.annotations.Setup; ++ ++import javax.crypto.Cipher; ++import javax.crypto.spec.GCMParameterSpec; ++import javax.crypto.spec.SecretKeySpec; ++import java.nio.ByteBuffer; ++ ++/** ++ * This performance test runs AES/GCM encryption and decryption using heap and ++ * direct ByteBuffers as input and output buffers for single and multi-part ++ * operations. ++ * ++ * This test rotates the IV and creates a new GCMParameterSpec for each encrypt ++ * benchmark operation. ++ */ ++ ++public class AESGCMByteBuffer extends CryptoBase { ++ ++ @Param({"128"}) ++ private int keyLength; ++ ++ @Param({"1024", "1500", "4096", "16384"}) ++ private int dataSize; ++ ++ @Param({"direct", "heap"}) ++ private String dataMethod; ++ ++ byte[] data; ++ ByteBuffer encryptedData; ++ ByteBuffer in, out; ++ private Cipher encryptCipher; ++ private Cipher decryptCipher; ++ SecretKeySpec ks; ++ GCMParameterSpec gcm_spec; ++ byte[] iv; ++ ++ private static final int IV_BUFFER_SIZE = 32; ++ private static final int IV_MODULO = IV_BUFFER_SIZE - 16; ++ int iv_index = 0; ++ int updateLen = 0; ++ ++ private int next_iv_index() { ++ int r = iv_index; ++ iv_index = (iv_index + 1) % IV_MODULO; ++ return r; ++ } ++ ++ @Setup ++ public void setup() throws Exception { ++ setupProvider(); ++ ++ // Setup key material ++ byte[] keystring = fillSecureRandom(new byte[keyLength / 8]); ++ ks = new SecretKeySpec(keystring, "AES"); ++ iv = fillSecureRandom(new byte[IV_BUFFER_SIZE]); ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ ++ // Setup Cipher classes ++ encryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ decryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters(). ++ getParameterSpec(GCMParameterSpec.class)); ++ ++ // Setup input/output buffers ++ data = fillRandom(new byte[dataSize]); ++ if (dataMethod.equalsIgnoreCase("direct")) { ++ in = ByteBuffer.allocateDirect(data.length); ++ in.put(data); ++ in.flip(); ++ encryptedData = ByteBuffer.allocateDirect( ++ encryptCipher.getOutputSize(data.length)); ++ out = ByteBuffer.allocateDirect(encryptedData.capacity()); ++ } else if (dataMethod.equalsIgnoreCase("heap")) { ++ in = ByteBuffer.wrap(data); ++ encryptedData = ByteBuffer.allocate( ++ encryptCipher.getOutputSize(data.length)); ++ out = ByteBuffer.allocate(encryptedData.capacity()); ++ } ++ ++ encryptCipher.doFinal(in, encryptedData); ++ encryptedData.flip(); ++ in.flip(); ++ updateLen = in.remaining() / 2; ++ } ++ ++ @Benchmark ++ public void encrypt() throws Exception { ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ encryptCipher.doFinal(in, out); ++ out.flip(); ++ in.flip(); ++ } ++ ++ @Benchmark ++ public void encryptMultiPart() throws Exception { ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ in.limit(updateLen); ++ encryptCipher.update(in, out); ++ in.limit(in.capacity()); ++ encryptCipher.doFinal(in, out); ++ out.flip(); ++ in.flip(); ++ } ++ ++ @Benchmark ++ public void decrypt() throws Exception { ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters().
++ getParameterSpec(GCMParameterSpec.class)); ++ decryptCipher.doFinal(encryptedData, out); ++ encryptedData.flip(); ++ out.flip(); ++ } ++ ++ @Benchmark ++ public void decryptMultiPart() throws Exception { ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters(). ++ getParameterSpec(GCMParameterSpec.class)); ++ ++ int len = encryptedData.remaining(); ++ encryptedData.limit(updateLen); ++ decryptCipher.update(encryptedData, out); ++ encryptedData.limit(len); ++ ++ decryptCipher.doFinal(encryptedData, out); ++ encryptedData.flip(); ++ out.flip(); ++ } ++ ++} +\ No newline at end of file +diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java +new file mode 100644 +index 000000000..4af12703b +--- /dev/null ++++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java +@@ -0,0 +1,102 @@ ++/* ++ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. 
++ */ ++package org.openjdk.bench.javax.crypto.full; ++ ++import org.openjdk.jmh.annotations.BenchmarkMode; ++import org.openjdk.jmh.annotations.Fork; ++import org.openjdk.jmh.annotations.Measurement; ++import org.openjdk.jmh.annotations.Mode; ++import org.openjdk.jmh.annotations.OutputTimeUnit; ++import org.openjdk.jmh.annotations.Param; ++import org.openjdk.jmh.annotations.Scope; ++import org.openjdk.jmh.annotations.Setup; ++import org.openjdk.jmh.annotations.State; ++import org.openjdk.jmh.annotations.Warmup; ++ ++import javax.crypto.BadPaddingException; ++import javax.crypto.Cipher; ++import javax.crypto.IllegalBlockSizeException; ++import javax.crypto.NoSuchPaddingException; ++import java.security.NoSuchAlgorithmException; ++import java.security.Provider; ++import java.security.SecureRandom; ++import java.security.Security; ++import java.util.Random; ++import java.util.concurrent.TimeUnit; ++ ++ ++@Fork(jvmArgsAppend = {"-XX:+AlwaysPreTouch"}, value = 5) ++@Warmup(iterations = 3, time = 3) ++@Measurement(iterations = 8, time = 2) ++@OutputTimeUnit(TimeUnit.SECONDS) ++@State(Scope.Thread) ++@BenchmarkMode(Mode.Throughput) ++public class CryptoBase { ++ ++ @Param({""}) ++ private String provider; ++ ++ public Provider prov = null; ++ ++ @Setup ++ public void setupProvider() { ++ if (provider != null && !provider.isEmpty()) { ++ prov = Security.getProvider(provider); ++ if (prov == null) { ++ throw new RuntimeException("Can't find provider \"" + provider + "\""); ++ } ++ } ++ } ++ ++ public static Cipher makeCipher(Provider prov, String algorithm) throws NoSuchPaddingException, NoSuchAlgorithmException { ++ return (prov == null) ? Cipher.getInstance(algorithm) : Cipher.getInstance(algorithm, prov); ++ } ++ ++ public static byte[][] fillRandom(byte[][] data) { ++ Random rnd = new Random(); ++ for (byte[] d : data) { ++ rnd.nextBytes(d); ++ } ++ return data; ++ } ++ ++ public static byte[] fillRandom(byte[] data) { ++ Random rnd = new Random(); ++ rnd.nextBytes(data); ++ return data; ++ } ++ ++ public static byte[] fillSecureRandom(byte[] data) { ++ SecureRandom rnd = new SecureRandom(); ++ rnd.nextBytes(data); ++ return data; ++ } ++ ++ public static byte[][] fillEncrypted(byte[][] data, Cipher encryptCipher) throws BadPaddingException, IllegalBlockSizeException { ++ byte[][] encryptedData = new byte[data.length][]; ++ for (int i = 0; i < encryptedData.length; i++) { ++ encryptedData[i] = encryptCipher.doFinal(data[i]); ++ } ++ return encryptedData; ++ } ++} +\ No newline at end of file +diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java +new file mode 100644 +index 000000000..a21b0c87f +--- /dev/null ++++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java +@@ -0,0 +1,36 @@ ++/* ++ * Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code).
++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. ++ */ ++package org.openjdk.bench.javax.crypto.small; ++ ++import org.openjdk.jmh.annotations.Param; ++ ++public class AESGCMBench extends ++ org.openjdk.bench.javax.crypto.full.AESGCMBench { ++ ++ @Param({"128"}) ++ private int keyLength; ++ ++ @Param({"1024"}) ++ private int dataSize; ++ ++} +\ No newline at end of file +diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java +new file mode 100644 +index 000000000..2e389d300 +--- /dev/null ++++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java +@@ -0,0 +1,36 @@ ++/* ++ * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. ++ */ ++package org.openjdk.bench.javax.crypto.small; ++ ++import org.openjdk.jmh.annotations.Param; ++ ++public class AESGCMByteBuffer extends ++ org.openjdk.bench.javax.crypto.full.AESGCMByteBuffer { ++ ++ @Param({"128"}) ++ private int keyLength; ++ ++ @Param({"1024"}) ++ private int dataSize; ++ ++} +\ No newline at end of file +diff --git a/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java b/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java +index 3eb1d7b89..7678cc71f 100644 +--- a/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java ++++ b/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java +@@ -26,8 +26,7 @@ + + /* + * @test +- * @bug 4836493 +- * @ignore need further evaluation ++ * @bug 4836493 8239798 + * @summary Socket timeouts for SSLSockets causes data corruption. 
+ * @run main/othervm ClientTimeout + */ +diff --git a/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java b/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java +index 3e626a257..5578ea725 100644 +--- a/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java ++++ b/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java +@@ -36,7 +36,7 @@ + + import javax.net.ssl.*; + import java.io.*; +-import java.net.InetAddress; ++import java.net.*; + + public class SSLExceptionForIOIssue implements SSLContextTemplate { + +@@ -139,7 +139,7 @@ public class SSLExceptionForIOIssue implements SSLContextTemplate { + } catch (SSLProtocolException | SSLHandshakeException sslhe) { + clientException = sslhe; + System.err.println("unexpected client exception: " + sslhe); +- } catch (SSLException ssle) { ++ } catch (SSLException | SocketTimeoutException ssle) { + // the expected exception, ignore it + System.err.println("expected client exception: " + ssle); + } catch (Exception e) { +-- +2.17.1 + diff --git a/openjdk-1.8.0.spec b/openjdk-1.8.0.spec index a8643154f3b397beb3b9b85e550b762a0cc988ad..c5b3d0136c8361f780f0bba8f5d49df2e79c07eb 100644 --- a/openjdk-1.8.0.spec +++ b/openjdk-1.8.0.spec @@ -916,7 +916,7 @@ Provides: java-%{javaver}-%{origin}-accessibility%{?1} = %{epoch}:%{version}-%{r Name: java-%{javaver}-%{origin} Version: %{javaver}.%{updatever}.%{buildver} -Release: 3 +Release: 4 # java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons # and this change was brought into RHEL-4. java-1.5.0-ibm packages # also included the epoch in their virtual provides. This created a @@ -1139,6 +1139,7 @@ Patch249: Improve_AlgorithmConstraints_checkAlgorithm_performance.patch Patch250: modify_coding_style_and_describe_error.patch Patch251: fix_wrap_memcpy_undefined_gcc10_3.patch Patch252: 8290705_fix_StringConcat_validate_mem_flow_asserts_with_unexpected_userStoreI.patch +Patch253: 8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch ############################################# # @@ -1618,6 +1619,7 @@ pushd %{top_level_dir_name} %patch250 -p1 %patch251 -p1 %patch252 -p1 +%patch253 -p1 popd # System library fixes @@ -2242,6 +2244,10 @@ cjc.mainProgram(arg) %endif %changelog + +* Thu Sep 15 2022 kuenking111 - 1:1.8.0.342-b07.4 +- add 8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch + * Fri Aug 5 2022 kuenking111 - 1:1.8.0.342-b07.3 - add 8290705_fix_StringConcat_validate_mem_flow_asserts_with_unexpected_userStoreI.patch - modified version.txt
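For context on the GCTR change at the top of this patch: HotSpot's AES-CTR stub intrinsifies CounterMode.crypt()/implCrypt(), so GCTR only benefits from the new stubs once it extends CounterMode and routes update() through the inherited encrypt() loop. The sketch below is a minimal, self-contained illustration of that counter-mode loop, not the JDK implementation: the class name CtrSketch is made up, and the System.arraycopy placeholder stands in for the AES block encryption (SymmetricCipher.encryptBlock in the real code).

public class CtrSketch {
    static final int BLOCK = 16; // AES block size in bytes

    // Big-endian increment of the low 32 bits of the counter block,
    // mirroring what GaloisCounterMode.increment32 does for GCTR.
    static void increment32(byte[] counter) {
        for (int i = counter.length - 1; i >= counter.length - 4; i--) {
            if (++counter[i] != 0) {
                break; // stop once a byte does not wrap around to zero
            }
        }
    }

    // Conceptual CTR loop: derive a keystream block from the counter, XOR it
    // with the input, then bump the counter. In the real code the arraycopy
    // below is an AES encryption of the counter block, and this whole loop is
    // what the CounterMode.implCrypt intrinsic replaces with stub code.
    static int crypt(byte[] in, int inOfs, int len,
                     byte[] out, int outOfs, byte[] counter) {
        byte[] keystream = new byte[BLOCK];
        for (int done = 0; done < len; done += BLOCK) {
            System.arraycopy(counter, 0, keystream, 0, BLOCK); // placeholder, not AES
            int n = Math.min(BLOCK, len - done);
            for (int j = 0; j < n; j++) {
                out[outOfs + done + j] = (byte) (in[inOfs + done + j] ^ keystream[j]);
            }
            increment32(counter);
        }
        return len;
    }
}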