diff --git a/8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch b/8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch new file mode 100644 index 0000000000000000000000000000000000000000..81acb5c4ec2e49c9990e1cf2ed85d1ecad7211ac --- /dev/null +++ b/8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch @@ -0,0 +1,3938 @@ +From 02b097417275acaad294d71a852c2def2222be25 Mon Sep 17 00:00:00 2001 +From: kuenking111 +Date: Sat, 3 Sep 2022 14:17:50 +0000 +Subject: [PATCH 1/6] 8143925-enhancing-CounterMode.crypt-for-AESCrypt + +--- + .../src/cpu/aarch64/vm/assembler_aarch64.hpp | 35 +- + .../cpu/aarch64/vm/macroAssembler_aarch64.hpp | 17 + + .../aarch64/vm/macroAssembler_aarch64_aes.cpp | 685 ++++++++++++++++++ + .../cpu/aarch64/vm/stubGenerator_aarch64.cpp | 324 ++++++++- + .../cpu/aarch64/vm/stubRoutines_aarch64.hpp | 2 +- + .../src/cpu/aarch64/vm/vm_version_aarch64.cpp | 13 +- + hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp | 5 + + hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp | 5 + + hotspot/src/cpu/x86/vm/assembler_x86.cpp | 74 +- + hotspot/src/cpu/x86/vm/assembler_x86.hpp | 12 + + .../src/cpu/x86/vm/stubGenerator_x86_32.cpp | 344 +++++++++ + .../src/cpu/x86/vm/stubGenerator_x86_64.cpp | 340 ++++++++- + hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp | 1 + + hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp | 5 + + .../src/cpu/x86/vm/stubRoutines_x86_32.hpp | 2 +- + .../src/cpu/x86/vm/stubRoutines_x86_64.hpp | 2 +- + hotspot/src/cpu/x86/vm/vm_version_x86.cpp | 36 + + hotspot/src/share/vm/classfile/vmSymbols.hpp | 4 + + hotspot/src/share/vm/opto/escape.cpp | 1 + + hotspot/src/share/vm/opto/library_call.cpp | 174 +++++ + hotspot/src/share/vm/opto/runtime.cpp | 29 + + hotspot/src/share/vm/opto/runtime.hpp | 1 + + hotspot/src/share/vm/runtime/globals.hpp | 3 + + hotspot/src/share/vm/runtime/stubRoutines.cpp | 1 + + hotspot/src/share/vm/runtime/stubRoutines.hpp | 2 + + hotspot/src/share/vm/runtime/vmStructs.cpp | 1 + + .../test/compiler/7184394/TestAESBase.java | 4 +- + .../test/compiler/7184394/TestAESMain.java | 7 + + .../com/sun/crypto/provider/CounterMode.java | 11 +- + .../classes/com/sun/crypto/provider/GCTR.java | 89 +-- + .../com/sun/crypto/provider/GHASH.java | 20 +- + .../sun/security/ssl/SSLSocketImpl.java | 14 +- + .../security/ssl/SSLSocketInputRecord.java | 215 +++--- + .../sun/security/ssl/SSLTransport.java | 4 + + .../bench/javax/crypto/full/AESGCMBench.java | 128 ++++ + .../javax/crypto/full/AESGCMByteBuffer.java | 163 +++++ + .../bench/javax/crypto/full/CryptoBase.java | 102 +++ + .../bench/javax/crypto/small/AESGCMBench.java | 36 + + .../javax/crypto/small/AESGCMByteBuffer.java | 36 + + .../ssl/SSLSocketImpl/ClientTimeout.java | 3 +- + .../SSLSocketImpl/SSLExceptionForIOIssue.java | 4 +- + 41 files changed, 2738 insertions(+), 216 deletions(-) + create mode 100644 hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp + create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java + create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java + create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java + create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java + create mode 100644 jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java + +diff --git a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp +index b0fa9b5fc..9202e61f8 100644 +--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp ++++ 
b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
+@@ -146,6 +146,21 @@ REGISTER_DECLARATION(Register, esp, r20);
+ 
+ #define assert_cond(ARG1) assert(ARG1, #ARG1)
+ 
++// In many places we've added C-style casts to silence compiler
++// warnings, for example when truncating a size_t to an int when we
++// know the size_t is a small number. Such casts are risky because
++// they effectively disable useful compiler warnings. We can make our
++// lives safer with this function, which ensures that any cast is
++// reversible without loss of information. It doesn't check
++// everything: it isn't intended to make sure that pointer types are
++// compatible, for example.
++template <class T2, class T1>
++T2 checked_cast(T1 thing) {
++  T2 result = static_cast<T2>(thing);
++  assert(static_cast<T1>(result) == thing, "must be");
++  return result;
++}
++
+ namespace asm_util {
+   uint32_t encode_logical_immediate(bool is32, uint64_t imm);
+ };
+@@ -193,7 +208,7 @@ public:
+   static inline uint32_t extract(uint32_t val, int msb, int lsb) {
+     int nbits = msb - lsb + 1;
+     assert_cond(msb >= lsb);
+-    uint32_t mask = (1U << nbits) - 1;
++    uint32_t mask = checked_cast<uint32_t>(right_n_bits(nbits));
+     uint32_t result = val >> lsb;
+     result &= mask;
+     return result;
+@@ -208,7 +223,7 @@ public:
+     int nbits = msb - lsb + 1;
+     guarantee(val < (1U << nbits), "Field too big for insn");
+     assert_cond(msb >= lsb);
+-    unsigned mask = (1U << nbits) - 1;
++    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
+     val <<= lsb;
+     mask <<= lsb;
+     unsigned target = *(unsigned *)a;
+@@ -222,7 +237,7 @@ public:
+     long chk = val >> (nbits - 1);
+     guarantee (chk == -1 || chk == 0, "Field too big for insn");
+     unsigned uval = val;
+-    unsigned mask = (1U << nbits) - 1;
++    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
+     uval &= mask;
+     uval <<= lsb;
+     mask <<= lsb;
+@@ -234,9 +249,9 @@ public:
+ 
+   void f(unsigned val, int msb, int lsb) {
+     int nbits = msb - lsb + 1;
+-    guarantee(val < (1U << nbits), "Field too big for insn");
++    guarantee(val < (1ULL << nbits), "Field too big for insn");
+     assert_cond(msb >= lsb);
+-    unsigned mask = (1U << nbits) - 1;
++    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
+     val <<= lsb;
+     mask <<= lsb;
+     insn |= val;
+@@ -255,7 +270,7 @@ public:
+     long chk = val >> (nbits - 1);
+     guarantee (chk == -1 || chk == 0, "Field too big for insn");
+     unsigned uval = val;
+-    unsigned mask = (1U << nbits) - 1;
++    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits));
+     uval &= mask;
+     f(uval, lsb + nbits - 1, lsb);
+   }
+@@ -280,7 +295,7 @@ public:
+ 
+   unsigned get(int msb = 31, int lsb = 0) {
+     int nbits = msb - lsb + 1;
+-    unsigned mask = ((1U << nbits) - 1) << lsb;
++    unsigned mask = checked_cast<unsigned>(right_n_bits(nbits)) << lsb;
+     assert_cond((bits & mask) == mask);
+     return (insn & mask) >> lsb;
+   }
+@@ -1991,21 +2006,21 @@ public:
+     starti;
+     f(0,31), f((int)T & 1, 30);
+     f(op1, 29, 21), f(0, 20, 16), f(op2, 15, 12);
+-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
++    f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0);
+   }
+   void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
+              int imm, int op1, int op2) {
+     starti;
+     f(0,31), f((int)T & 1, 30);
+     f(op1 | 0b100, 29, 21), f(0b11111, 20, 16), f(op2, 15, 12);
+-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
++    f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0);
+   }
+   void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
+              Register Xm, int op1, int op2) {
+     starti;
+     f(0,31), f((int)T & 1, 30);
+     f(op1 | 0b100, 29, 21), rf(Xm, 16), f(op2, 15, 12);
+-    f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 
0); ++ f((int)T >> 1, 11, 10), srf(Xn, 5), rf(Vt, 0); + } + + void ld_st(FloatRegister Vt, SIMD_Arrangement T, Address a, int op1, int op2) { +diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp +index 0ca694038..d334f1b69 100644 +--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp ++++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp +@@ -1240,6 +1240,23 @@ public: + void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, + Register zlen, Register tmp1, Register tmp2, Register tmp3, + Register tmp4, Register tmp5, Register tmp6, Register tmp7); ++ void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, ++ FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, ++ FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3); ++ void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, ++ FloatRegister p, FloatRegister z, FloatRegister t1); ++ void ghash_processBlocks_wide(address p, Register state, Register subkeyH, ++ Register data, Register blocks, int unrolls); ++ void ghash_modmul (FloatRegister result, ++ FloatRegister result_lo, FloatRegister result_hi, FloatRegister b, ++ FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p, ++ FloatRegister t1, FloatRegister t2, FloatRegister t3); ++ ++ void aesenc_loadkeys(Register key, Register keylen); ++ void aesecb_encrypt(Register from, Register to, Register keylen, ++ FloatRegister data = v0, int unrolls = 1); ++ void aesecb_decrypt(Register from, Register to, Register key, Register keylen); ++ void aes_round(FloatRegister input, FloatRegister subkey); + // ISB may be needed because of a safepoint + void maybe_isb() { isb(); } + +diff --git a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp +new file mode 100644 +index 000000000..1db79c97a +--- /dev/null ++++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64_aes.cpp +@@ -0,0 +1,685 @@ ++/* ++ * Copyright (c) 2003, 2021, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. 
++ * ++ */ ++ ++#include "precompiled.hpp" ++ ++#include "asm/assembler.hpp" ++#include "asm/assembler.inline.hpp" ++#include "macroAssembler_aarch64.hpp" ++#include "memory/resourceArea.hpp" ++#include "runtime/stubRoutines.hpp" ++ ++void MacroAssembler::aesecb_decrypt(Register from, Register to, Register key, Register keylen) { ++ Label L_doLast; ++ ++ ld1(v0, T16B, from); // get 16 bytes of input ++ ++ ld1(v5, T16B, post(key, 16)); ++ rev32(v5, T16B, v5); ++ ++ ld1(v1, v2, v3, v4, T16B, post(key, 64)); ++ rev32(v1, T16B, v1); ++ rev32(v2, T16B, v2); ++ rev32(v3, T16B, v3); ++ rev32(v4, T16B, v4); ++ aesd(v0, v1); ++ aesimc(v0, v0); ++ aesd(v0, v2); ++ aesimc(v0, v0); ++ aesd(v0, v3); ++ aesimc(v0, v0); ++ aesd(v0, v4); ++ aesimc(v0, v0); ++ ++ ld1(v1, v2, v3, v4, T16B, post(key, 64)); ++ rev32(v1, T16B, v1); ++ rev32(v2, T16B, v2); ++ rev32(v3, T16B, v3); ++ rev32(v4, T16B, v4); ++ aesd(v0, v1); ++ aesimc(v0, v0); ++ aesd(v0, v2); ++ aesimc(v0, v0); ++ aesd(v0, v3); ++ aesimc(v0, v0); ++ aesd(v0, v4); ++ aesimc(v0, v0); ++ ++ ld1(v1, v2, T16B, post(key, 32)); ++ rev32(v1, T16B, v1); ++ rev32(v2, T16B, v2); ++ ++ cmpw(keylen, 44); ++ br(Assembler::EQ, L_doLast); ++ ++ aesd(v0, v1); ++ aesimc(v0, v0); ++ aesd(v0, v2); ++ aesimc(v0, v0); ++ ++ ld1(v1, v2, T16B, post(key, 32)); ++ rev32(v1, T16B, v1); ++ rev32(v2, T16B, v2); ++ ++ cmpw(keylen, 52); ++ br(Assembler::EQ, L_doLast); ++ ++ aesd(v0, v1); ++ aesimc(v0, v0); ++ aesd(v0, v2); ++ aesimc(v0, v0); ++ ++ ld1(v1, v2, T16B, post(key, 32)); ++ rev32(v1, T16B, v1); ++ rev32(v2, T16B, v2); ++ ++ bind(L_doLast); ++ ++ aesd(v0, v1); ++ aesimc(v0, v0); ++ aesd(v0, v2); ++ ++ eor(v0, T16B, v0, v5); ++ ++ st1(v0, T16B, to); ++ ++ // Preserve the address of the start of the key ++ sub(key, key, keylen, LSL, exact_log2(sizeof (jint))); ++} ++ ++// Load expanded key into v17..v31 ++void MacroAssembler::aesenc_loadkeys(Register key, Register keylen) { ++ Label L_loadkeys_44, L_loadkeys_52; ++ cmpw(keylen, 52); ++ br(Assembler::LO, L_loadkeys_44); ++ br(Assembler::EQ, L_loadkeys_52); ++ ++ ld1(v17, v18, T16B, post(key, 32)); ++ rev32(v17, T16B, v17); ++ rev32(v18, T16B, v18); ++ bind(L_loadkeys_52); ++ ld1(v19, v20, T16B, post(key, 32)); ++ rev32(v19, T16B, v19); ++ rev32(v20, T16B, v20); ++ bind(L_loadkeys_44); ++ ld1(v21, v22, v23, v24, T16B, post(key, 64)); ++ rev32(v21, T16B, v21); ++ rev32(v22, T16B, v22); ++ rev32(v23, T16B, v23); ++ rev32(v24, T16B, v24); ++ ld1(v25, v26, v27, v28, T16B, post(key, 64)); ++ rev32(v25, T16B, v25); ++ rev32(v26, T16B, v26); ++ rev32(v27, T16B, v27); ++ rev32(v28, T16B, v28); ++ ld1(v29, v30, v31, T16B, post(key, 48)); ++ rev32(v29, T16B, v29); ++ rev32(v30, T16B, v30); ++ rev32(v31, T16B, v31); ++ ++ // Preserve the address of the start of the key ++ sub(key, key, keylen, LSL, exact_log2(sizeof (jint))); ++} ++ ++// NeoverseTM N1Software Optimization Guide: ++// Adjacent AESE/AESMC instruction pairs and adjacent AESD/AESIMC ++// instruction pairs will exhibit the performance characteristics ++// described in Section 4.6. ++void MacroAssembler::aes_round(FloatRegister input, FloatRegister subkey) { ++ aese(input, subkey); aesmc(input, input); ++} ++ ++// KernelGenerator ++// ++// The abstract base class of an unrolled function generator. ++// Subclasses override generate(), length(), and next() to generate ++// unrolled and interleaved functions. 
++// ++// The core idea is that a subclass defines a method which generates ++// the base case of a function and a method to generate a clone of it, ++// shifted to a different set of registers. KernelGenerator will then ++// generate several interleaved copies of the function, with each one ++// using a different set of registers. ++ ++// The subclass must implement three methods: length(), which is the ++// number of instruction bundles in the intrinsic, generate(int n) ++// which emits the nth instruction bundle in the intrinsic, and next() ++// which takes an instance of the generator and returns a version of it, ++// shifted to a new set of registers. ++ ++class KernelGenerator: public MacroAssembler { ++protected: ++ const int _unrolls; ++public: ++ KernelGenerator(Assembler *as, int unrolls) ++ : MacroAssembler(as->code()), _unrolls(unrolls) { } ++ virtual void generate(int index) = 0; ++ virtual int length() = 0; ++ virtual KernelGenerator *next() = 0; ++ int unrolls() { return _unrolls; } ++ void unroll(); ++}; ++ ++void KernelGenerator::unroll() { ++ ResourceMark rm; ++ KernelGenerator **generators ++ = NEW_RESOURCE_ARRAY(KernelGenerator *, unrolls()); ++ ++ generators[0] = this; ++ for (int i = 1; i < unrolls(); i++) { ++ generators[i] = generators[i-1]->next(); ++ } ++ ++ for (int j = 0; j < length(); j++) { ++ for (int i = 0; i < unrolls(); i++) { ++ generators[i]->generate(j); ++ } ++ } ++} ++ ++// An unrolled and interleaved generator for AES encryption. ++class AESKernelGenerator: public KernelGenerator { ++ Register _from, _to; ++ const Register _keylen; ++ FloatRegister _data; ++ const FloatRegister _subkeys; ++ bool _once; ++ Label _rounds_44, _rounds_52; ++ ++public: ++ AESKernelGenerator(Assembler *as, int unrolls, ++ Register from, Register to, Register keylen, FloatRegister data, ++ FloatRegister subkeys, bool once = true) ++ : KernelGenerator(as, unrolls), ++ _from(from), _to(to), _keylen(keylen), _data(data), ++ _subkeys(subkeys), _once(once) { ++ } ++ ++ virtual void generate(int index) { ++ switch (index) { ++ case 0: ++ if (_from != noreg) { ++ ld1(_data, T16B, _from); // get 16 bytes of input ++ } ++ break; ++ case 1: ++ if (_once) { ++ cmpw(_keylen, 52); ++ br(Assembler::LO, _rounds_44); ++ br(Assembler::EQ, _rounds_52); ++ } ++ break; ++ case 2: aes_round(_data, _subkeys + 0); break; ++ case 3: aes_round(_data, _subkeys + 1); break; ++ case 4: ++ if (_once) bind(_rounds_52); ++ break; ++ case 5: aes_round(_data, _subkeys + 2); break; ++ case 6: aes_round(_data, _subkeys + 3); break; ++ case 7: ++ if (_once) bind(_rounds_44); ++ break; ++ case 8: aes_round(_data, _subkeys + 4); break; ++ case 9: aes_round(_data, _subkeys + 5); break; ++ case 10: aes_round(_data, _subkeys + 6); break; ++ case 11: aes_round(_data, _subkeys + 7); break; ++ case 12: aes_round(_data, _subkeys + 8); break; ++ case 13: aes_round(_data, _subkeys + 9); break; ++ case 14: aes_round(_data, _subkeys + 10); break; ++ case 15: aes_round(_data, _subkeys + 11); break; ++ case 16: aes_round(_data, _subkeys + 12); break; ++ case 17: aese(_data, _subkeys + 13); break; ++ case 18: eor(_data, T16B, _data, _subkeys + 14); break; ++ case 19: ++ if (_to != noreg) { ++ st1(_data, T16B, _to); ++ } ++ break; ++ default: ShouldNotReachHere(); ++ } ++ } ++ ++ virtual KernelGenerator *next() { ++ return new AESKernelGenerator(this, _unrolls, ++ _from, _to, _keylen, ++ _data + 1, _subkeys, /*once*/false); ++ } ++ ++ virtual int length() { return 20; } ++}; ++ ++// Uses expanded key in v17..v31 ++// Returns 
encrypted values in inputs.
++// If to != noreg, store value at to; likewise from
++// Preserves key, keylen
++// Increments from, to
++// Input data in v0, v1, ...
++// unrolls controls the number of times to unroll the generated function
++void MacroAssembler::aesecb_encrypt(Register from, Register to, Register keylen,
++                                    FloatRegister data, int unrolls) {
++  AESKernelGenerator(this, unrolls, from, to, keylen, data, v17).unroll();
++}
++
++// ghash_multiply and ghash_reduce are the non-unrolled versions of
++// the GHASH function generators.
++void MacroAssembler::ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
++                                    FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
++                                    FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3) {
++  // Karatsuba multiplication performs a 128*128 -> 256-bit
++  // multiplication in three 128-bit multiplications and a few
++  // additions.
++  //
++  // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
++  // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
++  //
++  // Inputs:
++  //
++  // A0 in a.d[0]     (subkey)
++  // A1 in a.d[1]
++  // (A1+A0) in a1_xor_a0.d[0]
++  //
++  // B0 in b.d[0]     (state)
++  // B1 in b.d[1]
++
++  ext(tmp1, T16B, b, b, 0x08);
++  pmull2(result_hi, T1Q, b, a, T2D);      // A1*B1
++  eor(tmp1, T16B, tmp1, b);               // (B1+B0)
++  pmull(result_lo, T1Q, b, a, T1D);       // A0*B0
++  pmull(tmp2, T1Q, tmp1, a1_xor_a0, T1D); // (A1+A0)(B1+B0)
++
++  ext(tmp1, T16B, result_lo, result_hi, 0x08);
++  eor(tmp3, T16B, result_hi, result_lo);  // A1*B1+A0*B0
++  eor(tmp2, T16B, tmp2, tmp1);
++  eor(tmp2, T16B, tmp2, tmp3);
++
++  // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
++  ins(result_hi, D, tmp2, 0, 1);
++  ins(result_lo, D, tmp2, 1, 0);
++}
++
++void MacroAssembler::ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
++                                  FloatRegister p, FloatRegister vzr, FloatRegister t1) {
++  const FloatRegister t0 = result;
++
++  // The GCM field polynomial f is z^128 + p(z), where p =
++  // z^7+z^2+z+1.
++  //
++  //    z^128 === -p(z)  (mod (z^128 + p(z)))
++  //
++  // so, given that the product we're reducing is
++  //    a == lo + hi * z^128
++  // substituting,
++  //      === lo - hi * p(z)  (mod (z^128 + p(z)))
++  //
++  // we reduce by multiplying hi by p(z) and subtracting the result
++  // from (i.e. XORing it with) lo.  Because p has no nonzero high
++  // bits we can do this with two 64-bit multiplications, lo*p and
++  // hi*p.
++
++  pmull2(t0, T1Q, hi, p, T2D);
++  ext(t1, T16B, t0, vzr, 8);
++  eor(hi, T16B, hi, t1);
++  ext(t1, T16B, vzr, t0, 8);
++  eor(lo, T16B, lo, t1);
++  pmull(t0, T1Q, hi, p, T1D);
++  eor(result, T16B, lo, t0);
++}
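
Since the two comments above carry the real content of ghash_multiply and ghash_reduce, it is worth noting that the Karatsuba identity they quote can be checked mechanically. Below is a stand-alone sketch (plain C++, not part of the patch; clmul64 is a software stand-in for PMULL) verifying that the three-multiply decomposition matches the four-multiply schoolbook product, with "+" meaning XOR throughout:

    #include <cassert>
    #include <cstdint>

    // Software stand-in for PMULL: carry-less 64x64 -> 128-bit multiply.
    static void clmul64(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi) {
      lo = hi = 0;
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          lo ^= a << i;
          if (i > 0) hi ^= a >> (64 - i);
        }
      }
    }

    int main() {
      uint64_t A0 = 0x0123456789abcdefULL, A1 = 0xfedcba9876543210ULL;
      uint64_t B0 = 0xdeadbeefcafebabeULL, B1 = 0x0f1e2d3c4b5a6978ULL;

      uint64_t C0, C1, D0, D1, E0, E1, F0, F1, G0, G1;
      clmul64(A1, B1, C0, C1);            // C = A1*B1
      clmul64(A0, B0, D0, D1);            // D = A0*B0
      clmul64(A0 ^ A1, B0 ^ B1, E0, E1);  // E = (A0+A1)(B0+B1), the third multiply
      clmul64(A1, B0, F0, F1);            // cross terms of the schoolbook product
      clmul64(A0, B1, G0, G1);

      // Schoolbook result, in 64-bit words from most to least significant:
      uint64_t w3 = C1, w2 = C0 ^ F1 ^ G1, w1 = D1 ^ F0 ^ G0, w0 = D0;

      // Karatsuba form from the comment: C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
      assert(w3 == C1);
      assert(w2 == (C0 ^ C1 ^ D1 ^ E1));
      assert(w1 == (D1 ^ C0 ^ D0 ^ E0));
      assert(w0 == D0);
      return 0;
    }
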
++
++class GHASHMultiplyGenerator: public KernelGenerator {
++  FloatRegister _result_lo, _result_hi, _b,
++    _a, _vzr, _a1_xor_a0, _p,
++    _tmp1, _tmp2, _tmp3;
++
++public:
++  GHASHMultiplyGenerator(Assembler *as, int unrolls,
++                         FloatRegister result_lo, FloatRegister result_hi,
++                         /* offsetted registers */
++                         FloatRegister b,
++                         /* non-offsetted (shared) registers */
++                         FloatRegister a, FloatRegister a1_xor_a0, FloatRegister p, FloatRegister vzr,
++                         /* offsetted (temp) registers */
++                         FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3)
++    : KernelGenerator(as, unrolls),
++      _result_lo(result_lo), _result_hi(result_hi), _b(b),
++      _a(a), _vzr(vzr), _a1_xor_a0(a1_xor_a0), _p(p),
++      _tmp1(tmp1), _tmp2(tmp2), _tmp3(tmp3) { }
++
++  static const int register_stride = 7;
++
++  virtual void generate(int index) {
++    // Karatsuba multiplication performs a 128*128 -> 256-bit
++    // multiplication in three 128-bit multiplications and a few
++    // additions.
++    //
++    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
++    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
++    //
++    // Inputs:
++    //
++    // A0 in a.d[0]     (subkey)
++    // A1 in a.d[1]
++    // (A1+A0) in a1_xor_a0.d[0]
++    //
++    // B0 in b.d[0]     (state)
++    // B1 in b.d[1]
++
++    switch (index) {
++      case 0: ext(_tmp1, T16B, _b, _b, 0x08); break;
++      case 1: pmull2(_result_hi, T1Q, _b, _a, T2D);  // A1*B1
++        break;
++      case 2: eor(_tmp1, T16B, _tmp1, _b);           // (B1+B0)
++        break;
++      case 3: pmull(_result_lo, T1Q, _b, _a, T1D);   // A0*B0
++        break;
++      case 4: pmull(_tmp2, T1Q, _tmp1, _a1_xor_a0, T1D); // (A1+A0)(B1+B0)
++        break;
++
++      case 5: ext(_tmp1, T16B, _result_lo, _result_hi, 0x08); break;
++      case 6: eor(_tmp3, T16B, _result_hi, _result_lo); // A1*B1+A0*B0
++        break;
++      case 7: eor(_tmp2, T16B, _tmp2, _tmp1); break;
++      case 8: eor(_tmp2, T16B, _tmp2, _tmp3); break;
++
++      // Register pair <_result_hi:_result_lo> holds the _result of carry-less multiplication
++      case 9: ins(_result_hi, D, _tmp2, 0, 1); break;
++      case 10: ins(_result_lo, D, _tmp2, 1, 0); break;
++      default: ShouldNotReachHere();
++    }
++  }
++
++  virtual KernelGenerator *next() {
++    GHASHMultiplyGenerator *result
++      = new GHASHMultiplyGenerator(this, _unrolls, _result_lo, _result_hi,
++                                   _b, _a, _a1_xor_a0, _p, _vzr,
++                                   _tmp1, _tmp2, _tmp3);
++    result->_result_lo += register_stride;
++    result->_result_hi += register_stride;
++    result->_b += register_stride;
++    result->_tmp1 += register_stride;
++    result->_tmp2 += register_stride;
++    result->_tmp3 += register_stride;
++    return result;
++  }
++
++  virtual int length() { return 11; }
++};
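
The register-offsetting done by next() above is easiest to see in isolation. Below is a toy model of the schedule KernelGenerator::unroll() emits (stand-alone C++ with illustrative names, no HotSpot types): bundle j of every clone is generated before bundle j+1 of any clone, so dependent instructions are pushed apart and the multi-cycle latencies of PMULL and AESE can overlap:

    #include <cstdio>

    // Toy model of KernelGenerator::unroll().  Each clone owns a disjoint
    // register set; emitting bundle j of every clone before bundle j+1 of
    // any clone puts independent instructions back to back in the stream.
    struct ToyKernel {
      int clone;
      int length() const { return 3; }
      void generate(int bundle) const {
        std::printf("bundle %d of clone %d\n", bundle, clone);
      }
    };

    int main() {
      const int unrolls = 4;
      ToyKernel clones[unrolls];
      for (int i = 0; i < unrolls; i++) clones[i].clone = i;
      for (int j = 0; j < clones[0].length(); j++)   // same loop nest as unroll()
        for (int i = 0; i < unrolls; i++)
          clones[i].generate(j);
      return 0;
    }
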
++
++// Reduce the 128-bit product in hi:lo by the GCM field polynomial.
++// The FloatRegister argument called data is optional: if it is a
++// valid register, we interleave LD1 instructions with the
++// reduction. This is to reduce latency next time around the loop.
++class GHASHReduceGenerator: public KernelGenerator {
++  FloatRegister _result, _lo, _hi, _p, _vzr, _data, _t1;
++  int _once;
++public:
++  GHASHReduceGenerator(Assembler *as, int unrolls,
++                       /* offsetted registers */
++                       FloatRegister result, FloatRegister lo, FloatRegister hi,
++                       /* non-offsetted (shared) registers */
++                       FloatRegister p, FloatRegister vzr, FloatRegister data,
++                       /* offsetted (temp) registers */
++                       FloatRegister t1)
++    : KernelGenerator(as, unrolls),
++      _result(result), _lo(lo), _hi(hi),
++      _p(p), _vzr(vzr), _data(data), _t1(t1), _once(true) { }
++
++  static const int register_stride = 7;
++
++  virtual void generate(int index) {
++    const FloatRegister t0 = _result;
++
++    switch (index) {
++      // The GCM field polynomial f is z^128 + p(z), where p =
++      // z^7+z^2+z+1.
++      //
++      //    z^128 === -p(z)  (mod (z^128 + p(z)))
++      //
++      // so, given that the product we're reducing is
++      //    a == lo + hi * z^128
++      // substituting,
++      //      === lo - hi * p(z)  (mod (z^128 + p(z)))
++      //
++      // we reduce by multiplying hi by p(z) and subtracting the _result
++      // from (i.e. XORing it with) lo.  Because p has no nonzero high
++      // bits we can do this with two 64-bit multiplications, lo*p and
++      // hi*p.
++
++      case 0: pmull2(t0, T1Q, _hi, _p, T2D); break;
++      case 1: ext(_t1, T16B, t0, _vzr, 8); break;
++      case 2: eor(_hi, T16B, _hi, _t1); break;
++      case 3: ext(_t1, T16B, _vzr, t0, 8); break;
++      case 4: eor(_lo, T16B, _lo, _t1); break;
++      case 5: pmull(t0, T1Q, _hi, _p, T1D); break;
++      case 6: eor(_result, T16B, _lo, t0); break;
++      default: ShouldNotReachHere();
++    }
++
++    // Sprinkle load instructions into the generated instructions
++    if (_data->is_valid() && _once) {
++      assert(length() >= unrolls(), "not enough room for interleaved loads");
++      if (index < unrolls()) {
++        ld1((_data + index*register_stride), T16B, post(r2, 0x10));
++      }
++    }
++  }
++
++  virtual KernelGenerator *next() {
++    GHASHReduceGenerator *result
++      = new GHASHReduceGenerator(this, _unrolls,
++                                 _result, _lo, _hi, _p, _vzr, _data, _t1);
++    result->_result += register_stride;
++    result->_hi += register_stride;
++    result->_lo += register_stride;
++    result->_t1 += register_stride;
++    result->_once = false;
++    return result;
++  }
++
++  int length() { return 7; }
++};
++
++// Perform a GHASH multiply/reduce on a single FloatRegister.
++void MacroAssembler::ghash_modmul(FloatRegister result,
++                                  FloatRegister result_lo, FloatRegister result_hi, FloatRegister b,
++                                  FloatRegister a, FloatRegister vzr, FloatRegister a1_xor_a0, FloatRegister p,
++                                  FloatRegister t1, FloatRegister t2, FloatRegister t3) {
++  ghash_multiply(result_lo, result_hi, a, b, a1_xor_a0, t1, t2, t3);
++  ghash_reduce(result, result_lo, result_hi, p, vzr, t1);
++}
++
++// Interleaved GHASH processing.
++//
++// Clobbers all vector registers.
++//
++void MacroAssembler::ghash_processBlocks_wide(address field_polynomial, Register state,
++                                              Register subkeyH,
++                                              Register data, Register blocks, int unrolls) {
++  int register_stride = 7;
++
++  // Bafflingly, GCM uses little-endian for the byte order, but
++  // big-endian for the bit order.  For example, the polynomial 1 is
++  // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
++  //
++  // So, we must either reverse the bytes in each word and do
++  // everything big-endian or reverse the bits in each byte and do
++  // it little-endian.  On AArch64 it's more idiomatic to reverse
++  // the bits in each byte (we have an instruction, RBIT, to do
++  // that) and keep the data in little-endian bit order through the
++  // calculation, bit-reversing the inputs and outputs.
++
++  assert(unrolls * register_stride < 32, "out of registers");
++
++  FloatRegister a1_xor_a0 = v28;
++  FloatRegister Hprime = v29;
++  FloatRegister vzr = v30;
++  FloatRegister p = v31;
++  eor(vzr, T16B, vzr, vzr); // zero register
++
++  ldrq(p, field_polynomial); // The field polynomial
++
++  ldrq(v0, Address(state));
++  ldrq(Hprime, Address(subkeyH));
++
++  rev64(v0, T16B, v0);      // Bit-reverse words in state and subkeyH
++  rbit(v0, T16B, v0);
++  rev64(Hprime, T16B, Hprime);
++  rbit(Hprime, T16B, Hprime);
++
++  // Powers of H -> Hprime
++
++  Label already_calculated, done;
++  {
++    // The first time around we'll have to calculate H**2, H**3, etc.
++    // Look at the largest power of H in the subkeyH array to see if
++    // it's already been calculated.
++    ldp(rscratch1, rscratch2, Address(subkeyH, 16 * (unrolls - 1)));
++    orr(rscratch1, rscratch1, rscratch2);
++    cbnz(rscratch1, already_calculated);
++
++    orr(v6, T16B, Hprime, Hprime); // Start with H in v6 and Hprime
++    for (int i = 1; i < unrolls; i++) {
++      ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
++      eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
++      ghash_modmul(/*result*/v6, /*result_lo*/v5, /*result_hi*/v4, /*b*/v6,
++                   Hprime, vzr, a1_xor_a0, p,
++                   /*temps*/v1, v3, v2);
++      rev64(v1, T16B, v6);
++      rbit(v1, T16B, v1);
++      strq(v1, Address(subkeyH, 16 * i));
++    }
++    b(done);
++  }
++  {
++    bind(already_calculated);
++
++    // Load the largest power of H we need into v6.
++    ldrq(v6, Address(subkeyH, 16 * (unrolls - 1)));
++    rev64(v6, T16B, v6);
++    rbit(v6, T16B, v6);
++  }
++  bind(done);
++
++  orr(Hprime, T16B, v6, v6); // Move H ** unrolls into Hprime
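
A note on why the powers H**2 .. H**unrolls computed above enable the wide path at all: the serial GHASH recurrence Y = (Y ^ X[i]) * H unrolls into independent multiplies against distinct powers of H whose results are merged with a single XOR. The identity is pure field algebra, so it can be sanity-checked in a tiny field; the sketch below (illustrative C++, not patch code) uses GF(2^8) with the AES polynomial 0x11B as a stand-in for GHASH's GF(2^128):

    #include <cassert>
    #include <cstdint>

    // GF(2^8) multiply with the AES polynomial 0x11B -- a small stand-in
    // field; the identity being checked holds in GHASH's GF(2^128) too.
    static uint8_t gmul(uint8_t a, uint8_t b) {
      uint8_t p = 0;
      for (int i = 0; i < 8; i++) {
        if (b & 1) p ^= a;
        bool carry = (a & 0x80) != 0;
        a <<= 1;
        if (carry) a ^= 0x1B;    // reduce by x^8 + x^4 + x^3 + x + 1
        b >>= 1;
      }
      return p;
    }

    int main() {
      uint8_t Y = 0x53, H = 0xCA, X[4] = {0x11, 0x22, 0x33, 0x44};

      // Serial GHASH: Y = (Y ^ X[i]) * H, one block at a time.
      uint8_t serial = Y;
      for (int i = 0; i < 4; i++) serial = gmul(serial ^ X[i], H);

      // Parallel form used by the wide stub: one multiply per clone,
      // against H**4 .. H**1, then XORs to merge the partial states.
      uint8_t H2 = gmul(H, H), H3 = gmul(H2, H), H4 = gmul(H3, H);
      uint8_t parallel = gmul(Y ^ X[0], H4) ^ gmul(X[1], H3)
                       ^ gmul(X[2], H2) ^ gmul(X[3], H);

      assert(serial == parallel);
      return 0;
    }
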
++
++  // Hprime contains (H ** 1, H ** 2, ... H ** unrolls)
++  // v0 contains the initial state. Clear the others.
++  for (int i = 1; i < unrolls; i++) {
++    int ofs = register_stride * i;
++    eor(ofs+v0, T16B, ofs+v0, ofs+v0); // zero each state register
++  }
++
++  ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0
++  eor(a1_xor_a0, T16B, a1_xor_a0, Hprime);    // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
++
++  // Load #unrolls blocks of data
++  for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) {
++    ld1(v2+ofs, T16B, post(data, 0x10));
++  }
++
++  // Register assignments, replicated across 4 clones, v0 ... v23
++  //
++  // v0: input / output: current state, result of multiply/reduce
++  // v1: temp
++  // v2: input: one block of data (the ciphertext)
++  //     also used as a temp once the data has been consumed
++  // v3: temp
++  // v4: output: high part of product
++  // v5: output: low part ...
++  // v6: unused
++  //
++  // Not replicated:
++  //
++  // v28: High part of H xor low part of H'
++  // v29: H' (hash subkey)
++  // v30: zero
++  // v31: Reduction polynomial of the Galois field
++
++  // Inner loop.
++  // Do the whole load/add/multiply/reduce over all our data except
++  // the last few rows.
++  {
++    Label L_ghash_loop;
++    bind(L_ghash_loop);
++
++    // Prefetching doesn't help here. In fact, on Neoverse N1 it's worse.
++ // prfm(Address(data, 128), PLDL1KEEP); ++ ++ // Xor data into current state ++ for (int ofs = 0; ofs < unrolls * register_stride; ofs += register_stride) { ++ rbit((v2+ofs), T16B, (v2+ofs)); ++ eor((v2+ofs), T16B, v0+ofs, (v2+ofs)); // bit-swapped data ^ bit-swapped state ++ } ++ ++ // Generate fully-unrolled multiply-reduce in two stages. ++ ++ (new GHASHMultiplyGenerator(this, unrolls, ++ /*result_lo*/v5, /*result_hi*/v4, /*data*/v2, ++ Hprime, a1_xor_a0, p, vzr, ++ /*temps*/v1, v3, /* reuse b*/v2))->unroll(); ++ ++ // NB: GHASHReduceGenerator also loads the next #unrolls blocks of ++ // data into v0, v0+ofs, the current state. ++ (new GHASHReduceGenerator (this, unrolls, ++ /*result*/v0, /*lo*/v5, /*hi*/v4, p, vzr, ++ /*data*/v2, /*temp*/v3))->unroll(); ++ ++ sub(blocks, blocks, unrolls); ++ cmp(blocks, (unsigned char)(unrolls * 2)); ++ br(GE, L_ghash_loop); ++ } ++ ++ // Merge the #unrolls states. Note that the data for the next ++ // iteration has already been loaded into v4, v4+ofs, etc... ++ ++ // First, we multiply/reduce each clone by the appropriate power of H. ++ for (int i = 0; i < unrolls; i++) { ++ int ofs = register_stride * i; ++ ldrq(Hprime, Address(subkeyH, 16 * (unrolls - i - 1))); ++ ++ rbit(v2+ofs, T16B, v2+ofs); ++ eor(v2+ofs, T16B, ofs+v0, v2+ofs); // bit-swapped data ^ bit-swapped state ++ ++ rev64(Hprime, T16B, Hprime); ++ rbit(Hprime, T16B, Hprime); ++ ext(a1_xor_a0, T16B, Hprime, Hprime, 0x08); // long-swap subkeyH into a1_xor_a0 ++ eor(a1_xor_a0, T16B, a1_xor_a0, Hprime); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) ++ ghash_modmul(/*result*/v0+ofs, /*result_lo*/v5+ofs, /*result_hi*/v4+ofs, /*b*/v2+ofs, ++ Hprime, vzr, a1_xor_a0, p, ++ /*temps*/v1+ofs, v3+ofs, /* reuse b*/v2+ofs); ++ } ++ ++ // Then we sum the results. ++ for (int i = 0; i < unrolls - 1; i++) { ++ int ofs = register_stride * i; ++ eor(v0, T16B, v0, v0 + register_stride + ofs); ++ } ++ ++ sub(blocks, blocks, (unsigned char)unrolls); ++ ++ // And finally bit-reverse the state back to big endian. ++ rev64(v0, T16B, v0); ++ rbit(v0, T16B, v0); ++ st1(v0, T16B, state); ++} +\ No newline at end of file +diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +index 2e2e8ae78..c024dec55 100644 +--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp ++++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +@@ -2804,6 +2804,266 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ // CTR AES crypt. ++ // Arguments: ++ // ++ // Inputs: ++ // c_rarg0 - source byte array address ++ // c_rarg1 - destination byte array address ++ // c_rarg2 - K (key) in little endian int array ++ // c_rarg3 - counter vector byte array address ++ // c_rarg4 - input length ++ // c_rarg5 - saved encryptedCounter start ++ // c_rarg6 - saved used length ++ // ++ // Output: ++ // r0 - input length ++ // ++ address generate_counterMode_AESCrypt() { ++ const Register in = c_rarg0; ++ const Register out = c_rarg1; ++ const Register key = c_rarg2; ++ const Register counter = c_rarg3; ++ const Register saved_len = c_rarg4, len = r10; ++ const Register saved_encrypted_ctr = c_rarg5; ++ const Register used_ptr = c_rarg6, used = r12; ++ ++ const Register offset = r7; ++ const Register keylen = r11; ++ ++ const unsigned char block_size = 16; ++ const int bulk_width = 4; ++ // NB: bulk_width can be 4 or 8. 
8 gives slightly faster ++ // performance with larger data sizes, but it also means that the ++ // fast path isn't used until you have at least 8 blocks, and up ++ // to 127 bytes of data will be executed on the slow path. For ++ // that reason, and also so as not to blow away too much icache, 4 ++ // blocks seems like a sensible compromise. ++ ++ // Algorithm: ++ // ++ // if (len == 0) { ++ // goto DONE; ++ // } ++ // int result = len; ++ // do { ++ // if (used >= blockSize) { ++ // if (len >= bulk_width * blockSize) { ++ // CTR_large_block(); ++ // if (len == 0) ++ // goto DONE; ++ // } ++ // for (;;) { ++ // 16ByteVector v0 = counter; ++ // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); ++ // used = 0; ++ // if (len < blockSize) ++ // break; /* goto NEXT */ ++ // 16ByteVector v1 = load16Bytes(in, offset); ++ // v1 = v1 ^ encryptedCounter; ++ // store16Bytes(out, offset); ++ // used = blockSize; ++ // offset += blockSize; ++ // len -= blockSize; ++ // if (len == 0) ++ // goto DONE; ++ // } ++ // } ++ // NEXT: ++ // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); ++ // len--; ++ // } while (len != 0); ++ // DONE: ++ // return result; ++ // ++ // CTR_large_block() ++ // Wide bulk encryption of whole blocks. ++ ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); ++ const address start = __ pc(); ++ __ enter(); ++ ++ Label DONE, CTR_large_block, large_block_return; ++ __ ldrw(used, Address(used_ptr)); ++ __ cbzw(saved_len, DONE); ++ ++ __ mov(len, saved_len); ++ __ mov(offset, 0); ++ ++ // Compute #rounds for AES based on the length of the key array ++ __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); ++ ++ __ aesenc_loadkeys(key, keylen); ++ ++ { ++ Label L_CTR_loop, NEXT; ++ ++ __ bind(L_CTR_loop); ++ ++ __ cmp(used, block_size); ++ __ br(__ LO, NEXT); ++ ++ // Maybe we have a lot of data ++ __ subsw(rscratch1, len, bulk_width * block_size); ++ __ br(__ HS, CTR_large_block); ++ __ BIND(large_block_return); ++ __ cbzw(len, DONE); ++ ++ // Setup the counter ++ __ movi(v4, __ T4S, 0); ++ __ movi(v5, __ T4S, 1); ++ __ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 } ++ ++ __ ld1(v0, __ T16B, counter); // Load the counter into v0 ++ __ rev32(v16, __ T16B, v0); ++ __ addv(v16, __ T4S, v16, v4); ++ __ rev32(v16, __ T16B, v16); ++ __ st1(v16, __ T16B, counter); // Save the incremented counter back ++ ++ { ++ // We have fewer than bulk_width blocks of data left. Encrypt ++ // them one by one until there is less than a full block ++ // remaining, being careful to save both the encrypted counter ++ // and the counter. ++ ++ Label inner_loop; ++ __ bind(inner_loop); ++ // Counter to encrypt is in v0 ++ __ aesecb_encrypt(noreg, noreg, keylen); ++ __ st1(v0, __ T16B, saved_encrypted_ctr); ++ ++ // Do we have a remaining full block? 
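
For reference, the REV32 / ADDV / REV32 sequences in this stub amount to a big-endian increment of the last 32-bit word of the counter block; the lane-wise add of { 0, 0, 0, 1 } only touches lane 3 and cannot carry into the other lanes. A scalar model (illustrative sketch only; __builtin_bswap32 is a GCC/Clang builtin, not HotSpot code):

    #include <cstdint>
    #include <cstring>

    // Scalar model of REV32 / ADDV.4S {0,0,0,1} / REV32: the counter block
    // keeps a big-endian 32-bit counter in its last four bytes, and only
    // that word is bumped -- the lane-wise vector add cannot carry into
    // bytes 0..11.
    static void ctr32_increment(uint8_t block[16]) {
      uint32_t c;
      std::memcpy(&c, block + 12, sizeof(c));
      c = __builtin_bswap32(c) + 1;   // big-endian -> host order, then add
      c = __builtin_bswap32(c);       // back to big-endian
      std::memcpy(block + 12, &c, sizeof(c));
    }
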
++ ++ __ mov(used, 0); ++ __ cmp(len, block_size); ++ __ br(__ LO, NEXT); ++ ++ // Yes, we have a full block ++ __ ldrq(v1, Address(in, offset)); ++ __ eor(v1, __ T16B, v1, v0); ++ __ strq(v1, Address(out, offset)); ++ __ mov(used, block_size); ++ __ add(offset, offset, block_size); ++ ++ __ subw(len, len, block_size); ++ __ cbzw(len, DONE); ++ ++ // Increment the counter, store it back ++ __ orr(v0, __ T16B, v16, v16); ++ __ rev32(v16, __ T16B, v16); ++ __ addv(v16, __ T4S, v16, v4); ++ __ rev32(v16, __ T16B, v16); ++ __ st1(v16, __ T16B, counter); // Save the incremented counter back ++ ++ __ b(inner_loop); ++ } ++ ++ __ BIND(NEXT); ++ ++ // Encrypt a single byte, and loop. ++ // We expect this to be a rare event. ++ __ ldrb(rscratch1, Address(in, offset)); ++ __ ldrb(rscratch2, Address(saved_encrypted_ctr, used)); ++ __ eor(rscratch1, rscratch1, rscratch2); ++ __ strb(rscratch1, Address(out, offset)); ++ __ add(offset, offset, 1); ++ __ add(used, used, 1); ++ __ subw(len, len,1); ++ __ cbnzw(len, L_CTR_loop); ++ } ++ ++ __ bind(DONE); ++ __ strw(used, Address(used_ptr)); ++ __ mov(r0, saved_len); ++ ++ __ leave(); // required for proper stackwalking of RuntimeStub frame ++ __ ret(lr); ++ ++ // Bulk encryption ++ ++ __ BIND (CTR_large_block); ++ assert(bulk_width == 4 || bulk_width == 8, "must be"); ++ ++ if (bulk_width == 8) { ++ __ sub(sp, sp, 4 * 16); ++ __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); ++ } ++ __ sub(sp, sp, 4 * 16); ++ __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); ++ RegSet saved_regs = (RegSet::of(in, out, offset) ++ + RegSet::of(saved_encrypted_ctr, used_ptr, len)); ++ __ push(saved_regs, sp); ++ __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption ++ __ add(in, in, offset); ++ __ add(out, out, offset); ++ ++ // Keys should already be loaded into the correct registers ++ ++ __ ld1(v0, __ T16B, counter); // v0 contains the first counter ++ __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter ++ ++ // AES/CTR loop ++ { ++ Label L_CTR_loop; ++ __ BIND(L_CTR_loop); ++ ++ // Setup the counters ++ __ movi(v8, __ T4S, 0); ++ __ movi(v9, __ T4S, 1); ++ __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 } ++ ++ for (FloatRegister f = v0; f < v0 + bulk_width; f++) { ++ __ rev32(f, __ T16B, v16); ++ __ addv(v16, __ T4S, v16, v8); ++ } ++ ++ __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16)); ++ ++ // Encrypt the counters ++ __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width); ++ ++ if (bulk_width == 8) { ++ __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16)); ++ } ++ ++ // XOR the encrypted counters with the inputs ++ for (int i = 0; i < bulk_width; i++) { ++ __ eor(v0 + i, __ T16B, v0 + i, v8 + i); ++ } ++ ++ // Write the encrypted data ++ __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16)); ++ if (bulk_width == 8) { ++ __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16)); ++ } ++ ++ __ subw(len, len, 16 * bulk_width); ++ __ cbnzw(len, L_CTR_loop); ++ } ++ ++ // Save the counter back where it goes ++ __ rev32(v16, __ T16B, v16); ++ __ st1(v16, __ T16B, counter); ++ ++ __ pop(saved_regs, sp); ++ ++ __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); ++ if (bulk_width == 8) { ++ __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); ++ } ++ ++ __ andr(rscratch1, len, -16 * bulk_width); ++ __ sub(len, len, rscratch1); ++ __ add(offset, offset, rscratch1); ++ __ mov(used, 16); ++ __ strw(used, Address(used_ptr)); ++ __ b(large_block_return); ++ ++ return start; ++ } ++ ++ + // Arguments: + // + // Inputs: 
+@@ -3677,6 +3937,56 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ address generate_ghash_processBlocks_wide() { ++ address small = generate_ghash_processBlocks(); ++ ++ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide"); ++ __ align(wordSize * 2); ++ address p = __ pc(); ++ __ emit_int64(0x87); // The low-order bits of the field ++ // polynomial (i.e. p = z^7+z^2+z+1) ++ // repeated in the low and high parts of a ++ // 128-bit vector ++ __ emit_int64(0x87); ++ ++ __ align(CodeEntryAlignment); ++ address start = __ pc(); ++ ++ Register state = c_rarg0; ++ Register subkeyH = c_rarg1; ++ Register data = c_rarg2; ++ Register blocks = c_rarg3; ++ ++ const int unroll = 4; ++ ++ __ cmp(blocks, (unsigned char)(unroll * 2)); ++ __ br(__ LT, small); ++ ++ if (unroll > 1) { ++ // Save state before entering routine ++ __ sub(sp, sp, 4 * 16); ++ __ st1(v12, v13, v14, v15, __ T16B, Address(sp)); ++ __ sub(sp, sp, 4 * 16); ++ __ st1(v8, v9, v10, v11, __ T16B, Address(sp)); ++ } ++ ++ __ ghash_processBlocks_wide(p, state, subkeyH, data, blocks, unroll); ++ ++ if (unroll > 1) { ++ // And restore state ++ __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16)); ++ __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16)); ++ } ++ ++ __ cmp(blocks, 0u); ++ __ br(__ GT, small); ++ ++ __ ret(lr); ++ ++ return start; ++ } ++ ++ + // Continuation point for throwing of implicit exceptions that are + // not handled in the current activation. Fabricates an exception + // oop and initiates normal exception dispatching in this +@@ -4687,6 +4997,15 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_montgomerySquare = g.generate_multiply(); + } + ++ // generate GHASH intrinsics code ++ if (UseGHASHIntrinsics) { ++ if (UseAESCTRIntrinsics) { ++ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide(); ++ } else { ++ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); ++ } ++ } ++ + if (UseAESIntrinsics) { + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); +@@ -4694,9 +5013,8 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); + } + +- // generate GHASH intrinsics code +- if (UseGHASHIntrinsics) { +- StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); ++ if (UseAESCTRIntrinsics) { ++ StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt(); + } + + if (UseSHA1Intrinsics) { +diff --git a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp +index d1c312ab3..05619ce7f 100644 +--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp ++++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp +@@ -37,7 +37,7 @@ static bool returns_to_call_stub(address return_pc) { + + enum platform_dependent_constants { + code_size1 = 19000, // simply increase if too small (assembler will crash if too small) +- code_size2 = 22000 // simply increase if too small (assembler will crash if too small) ++ code_size2 = 32000 // simply increase if too small (assembler will crash if too small) + }; + + class aarch64 { +diff --git a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp +index 9808337a0..de636fb83 100644 +--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp ++++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp 
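
To spell out the dispatch that generate_ghash_processBlocks_wide wires up above: fewer than 2 * unroll blocks branch straight to the one-block stub; otherwise the wide loop runs until fewer than 2 * unroll blocks remain, the merge pass consumes one more batch, and any leftovers (at most unroll - 1 blocks) are handed to the small stub. A C-like sketch of that control flow, with illustrative function names rather than HotSpot symbols:

    #include <cstdint>

    // Stand-ins for the two generated stubs (names are illustrative).
    static void ghash_small(uint8_t* state, const uint8_t* subkeyH,
                            const uint8_t* data, int blocks) { /* 1 block/iter */ }
    static void ghash_wide4(uint8_t* state, const uint8_t* subkeyH,
                            const uint8_t* data)              { /* 4-way pass  */ }

    void ghash_dispatch(uint8_t* state, uint8_t* subkeyH,
                        const uint8_t* data, int blocks) {
      const int unroll = 4;
      if (blocks < 2 * unroll) {          // cmp(blocks, unroll * 2); br(LT, small)
        ghash_small(state, subkeyH, data, blocks);
        return;
      }
      do {                                // L_ghash_loop: 4 blocks per pass
        ghash_wide4(state, subkeyH, data);
        data += 16 * unroll;
        blocks -= unroll;
      } while (blocks >= 2 * unroll);
      ghash_wide4(state, subkeyH, data);  // merge pass eats one more batch
      data += 16 * unroll;
      blocks -= unroll;
      if (blocks > 0)                     // cmp(blocks, 0); br(GT, small)
        ghash_small(state, subkeyH, data, blocks);
    }
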
+@@ -233,12 +233,21 @@ void VM_Version::get_processor_features() { + warning("UseAESIntrinsics enabled, but UseAES not, enabling"); + UseAES = true; + } ++ if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } + } else { + if (UseAES) { +- warning("UseAES specified, but not supported on this CPU"); ++ warning("AES instructions are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAES, false); + } + if (UseAESIntrinsics) { +- warning("UseAESIntrinsics specified, but not supported on this CPU"); ++ warning("AES intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESIntrinsics, false); ++ } ++ if (UseAESCTRIntrinsics) { ++ warning("AES/CTR intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); + } + } + +diff --git a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp +index b5ce1cfa9..fea8b1f87 100644 +--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp ++++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp +@@ -194,6 +194,11 @@ void VM_Version::initialize() { + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } + ++ if (UseAESCTRIntrinsics) { ++ warning("AES/CTR intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } ++ + if (UseGHASHIntrinsics) { + warning("GHASH intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); +diff --git a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp +index bd893e138..08d7a7311 100644 +--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp ++++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp +@@ -319,6 +319,11 @@ void VM_Version::initialize() { + } + } + ++ if (UseAESCTRIntrinsics) { ++ warning("AES/CTR intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } ++ + // GHASH/GCM intrinsics + if (has_vis3() && (UseVIS > 2)) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { +diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.cpp b/hotspot/src/cpu/x86/vm/assembler_x86.cpp +index 1759ecdfd..ddc1acfd8 100644 +--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp ++++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp +@@ -2373,20 +2373,52 @@ void Assembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { + + void Assembler::pextrd(Register dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); +- int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, false); ++ int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, false); + emit_int8(0x16); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8); + } + ++void Assembler::pextrd(Address dst, XMMRegister src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, false); ++ emit_int8(0x16); ++ emit_operand(src, dst); ++ emit_int8(imm8); ++} ++ + void Assembler::pextrq(Register dst, XMMRegister src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); +- int encode = simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true); ++ int encode = simd_prefix_and_encode(src, xnoreg, as_XMMRegister(dst->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, true); + emit_int8(0x16); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(imm8); + } + ++void Assembler::pextrq(Address dst, XMMRegister src, int imm8) 
{ ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A, true); ++ emit_int8(0x16); ++ emit_operand(src, dst); ++ emit_int8(imm8); ++} ++ ++void Assembler::pextrw(Address dst, XMMRegister src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A); ++ emit_int8((unsigned char)0x15); ++ emit_operand(src, dst); ++ emit_int8(imm8); ++} ++ ++void Assembler::pextrb(Address dst, XMMRegister src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(src, xnoreg, dst, VEX_SIMD_66, VEX_OPCODE_0F_3A); ++ emit_int8(0x14); ++ emit_operand(src, dst); ++ emit_int8(imm8); ++} ++ + void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, false); +@@ -2395,6 +2427,14 @@ void Assembler::pinsrd(XMMRegister dst, Register src, int imm8) { + emit_int8(imm8); + } + ++void Assembler::pinsrd(XMMRegister dst, Address src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, false); ++ emit_int8(0x22); ++ emit_operand(dst,src); ++ emit_int8(imm8); ++} ++ + void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { + assert(VM_Version::supports_sse4_1(), ""); + int encode = simd_prefix_and_encode(dst, dst, as_XMMRegister(src->encoding()), VEX_SIMD_66, VEX_OPCODE_0F_3A, true); +@@ -2403,6 +2443,30 @@ void Assembler::pinsrq(XMMRegister dst, Register src, int imm8) { + emit_int8(imm8); + } + ++void Assembler::pinsrq(XMMRegister dst, Address src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, true); ++ emit_int8(0x22); ++ emit_operand(dst, src); ++ emit_int8(imm8); ++} ++ ++void Assembler::pinsrw(XMMRegister dst, Address src, int imm8) { ++ assert(VM_Version::supports_sse2(), ""); ++ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F); ++ emit_int8((unsigned char)0xC4); ++ emit_operand(dst, src); ++ emit_int8(imm8); ++} ++ ++void Assembler::pinsrb(XMMRegister dst, Address src, int imm8) { ++ assert(VM_Version::supports_sse4_1(), ""); ++ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A); ++ emit_int8(0x20); ++ emit_operand(dst, src); ++ emit_int8(imm8); ++} ++ + void Assembler::pmovzxbw(XMMRegister dst, Address src) { + assert(VM_Version::supports_sse4_1(), ""); + InstructionMark im(this); +@@ -3075,6 +3139,12 @@ void Assembler::xorl(Register dst, Register src) { + emit_arith(0x33, 0xC0, dst, src); + } + ++void Assembler::xorb(Register dst, Address src) { ++ InstructionMark im(this); ++ prefix(src, dst); ++ emit_int8(0x32); ++ emit_operand(dst, src); ++} + + // AVX 3-operands scalar float-point arithmetic instructions + +diff --git a/hotspot/src/cpu/x86/vm/assembler_x86.hpp b/hotspot/src/cpu/x86/vm/assembler_x86.hpp +index 5ea01311e..c2e70bc2a 100644 +--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp ++++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp +@@ -1479,10 +1479,20 @@ private: + // SSE 4.1 extract + void pextrd(Register dst, XMMRegister src, int imm8); + void pextrq(Register dst, XMMRegister src, int imm8); ++ void pextrd(Address dst, XMMRegister src, int imm8); ++ void pextrq(Address dst, XMMRegister src, int imm8); ++ void pextrb(Address dst, XMMRegister src, int imm8); ++ // SSE 2 extract ++ void pextrw(Address dst, XMMRegister src, int imm8); + + // SSE 4.1 
insert + void pinsrd(XMMRegister dst, Register src, int imm8); + void pinsrq(XMMRegister dst, Register src, int imm8); ++ void pinsrd(XMMRegister dst, Address src, int imm8); ++ void pinsrq(XMMRegister dst, Address src, int imm8); ++ void pinsrb(XMMRegister dst, Address src, int imm8); ++ // SSE 2 insert ++ void pinsrw(XMMRegister dst, Address src, int imm8); + + // SSE4.1 packed move + void pmovzxbw(XMMRegister dst, XMMRegister src); +@@ -1687,6 +1697,8 @@ private: + void xorl(Register dst, Address src); + void xorl(Register dst, Register src); + ++ void xorb(Register dst, Address src); ++ + void xorq(Register dst, Address src); + void xorq(Register dst, Register src); + +diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp +index 2e5599807..f555f3326 100644 +--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp ++++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp +@@ -2153,6 +2153,17 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ address generate_counter_shuffle_mask() { ++ __ align(16); ++ StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask"); ++ address start = __ pc(); ++ __ emit_data(0x0c0d0e0f, relocInfo::none, 0); ++ __ emit_data(0x08090a0b, relocInfo::none, 0); ++ __ emit_data(0x04050607, relocInfo::none, 0); ++ __ emit_data(0x00010203, relocInfo::none, 0); ++ return start; ++ } ++ + // Utility routine for loading a 128-bit key word in little endian format + // can optionally specify that the shuffle mask is already in an xmmregister + void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { +@@ -2178,6 +2189,31 @@ class StubGenerator: public StubCodeGenerator { + __ aesdec(xmmdst, xmmtmp); + } + ++ // Utility routine for increase 128bit counter (iv in CTR mode) ++ // XMM_128bit, D3, D2, D1, D0 ++ void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) { ++ __ pextrd(reg, xmmdst, 0x0); ++ __ addl(reg, inc_delta); ++ __ pinsrd(xmmdst, reg, 0x0); ++ __ jcc(Assembler::carryClear, next_block); // jump if no carry ++ ++ __ pextrd(reg, xmmdst, 0x01); // Carry-> D1 ++ __ addl(reg, 0x01); ++ __ pinsrd(xmmdst, reg, 0x01); ++ __ jcc(Assembler::carryClear, next_block); // jump if no carry ++ ++ __ pextrd(reg, xmmdst, 0x02); // Carry-> D2 ++ __ addl(reg, 0x01); ++ __ pinsrd(xmmdst, reg, 0x02); ++ __ jcc(Assembler::carryClear, next_block); // jump if no carry ++ ++ __ pextrd(reg, xmmdst, 0x03); // Carry -> D3 ++ __ addl(reg, 0x01); ++ __ pinsrd(xmmdst, reg, 0x03); ++ ++ __ BIND(next_block); // next instruction ++ } ++ + + // Arguments: + // +@@ -2719,6 +2755,309 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ ++ // CTR AES crypt. 
++ // In 32-bit stub, parallelize 4 blocks at a time ++ // Arguments: ++ // ++ // Inputs: ++ // c_rarg0 - source byte array address ++ // c_rarg1 - destination byte array address ++ // c_rarg2 - K (key) in little endian int array ++ // c_rarg3 - counter vector byte array address ++ // c_rarg4 - input length ++ // ++ // Output: ++ // rax - input length ++ // ++ address generate_counterMode_AESCrypt_Parallel() { ++ assert(UseAES, "need AES instructions and misaligned SSE support"); ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); ++ address start = __ pc(); ++ const Register from = rsi; // source array address ++ const Register to = rdx; // destination array address ++ const Register key = rcx; // key array address ++ const Register counter = rdi; // counter byte array initialized from initvector array address ++ ++ // and left with the results of the last encryption block ++ const Register len_reg = rbx; ++ const Register pos = rax; ++ ++ __ enter(); // required for proper stackwalking of RuntimeStub frame ++ handleSOERegisters(true /*saving*/); // save rbx, rsi, rdi ++ ++ // load registers from incoming parameters ++ const Address from_param(rbp, 8+0); ++ const Address to_param (rbp, 8+4); ++ const Address key_param (rbp, 8+8); ++ const Address rvec_param (rbp, 8+12); ++ const Address len_param (rbp, 8+16); ++ const Address saved_counter_param(rbp, 8 + 20); ++ const Address used_addr_param(rbp, 8 + 24); ++ ++ __ movptr(from , from_param); ++ __ movptr(to , to_param); ++ //__ movptr(key, key_param); ++ //__ movptr(counter, rvec_param); ++ __ movptr(len_reg , len_param); ++ //__ movptr(pos, 0); ++ ++ // Use the partially used encrpyted counter from last invocation ++ Label L_exit_preLoop, L_preLoop_start; ++ ++ // Use the registers 'counter' and 'key' here in this preloop ++ // to hold of last 2 params 'used' and 'saved_encCounter_start' ++ Register used = counter; ++ Register saved_encCounter_start = key; ++ Register used_addr = saved_encCounter_start; ++ ++ __ movptr(used_addr, used_addr_param); ++ __ movptr(used, Address(used_addr, 0)); ++ __ movptr(saved_encCounter_start, saved_counter_param); ++ ++ __ BIND(L_preLoop_start); ++ __ cmpptr(used, 16); ++ __ jcc(Assembler::aboveEqual, L_exit_preLoop); ++ __ cmpptr(len_reg, 0); ++ __ jcc(Assembler::lessEqual, L_exit_preLoop); ++ __ movb(rax, Address(saved_encCounter_start, used)); ++ __ xorb(rax, Address(from, 0)); ++ __ movb(Address(to, 0), rax); ++ __ addptr(from, 1); ++ __ addptr(to, 1); ++ __ addptr(used, 1); ++ __ subptr(len_reg, 1); ++ ++ __ jmp(L_preLoop_start); ++ ++ __ BIND(L_exit_preLoop); ++ __ movptr(used_addr, used_addr_param); ++ __ movptr(used_addr, used_addr_param); ++ __ movl(Address(used_addr, 0), used); ++ ++ // load the parameters 'key' and 'counter' ++ __ movptr(key, key_param); ++ __ movptr(counter, rvec_param); ++ ++ // xmm register assignments for the loops below ++ const XMMRegister xmm_curr_counter = xmm0; ++ const XMMRegister xmm_counter_shuf_mask = xmm1; // need to be reloaded ++ const XMMRegister xmm_key_shuf_mask = xmm2; // need to be reloaded ++ const XMMRegister xmm_key = xmm3; ++ const XMMRegister xmm_result0 = xmm4; ++ const XMMRegister xmm_result1 = xmm5; ++ const XMMRegister xmm_result2 = xmm6; ++ const XMMRegister xmm_result3 = xmm7; ++ const XMMRegister xmm_from0 = xmm1; //reuse XMM register ++ const XMMRegister xmm_from1 = xmm2; ++ const XMMRegister xmm_from2 = xmm3; ++ const XMMRegister xmm_from3 = xmm4; ++ ++ //for key_128, key_192, key_256 ++ const 
int rounds[3] = {10, 12, 14}; ++ Label L_singleBlockLoopTop[3]; ++ Label L_multiBlock_loopTop[3]; ++ Label L_key192_top, L_key256_top; ++ Label L_incCounter[3][4]; // 3: different key length, 4: 4 blocks at a time ++ Label L_incCounter_single[3]; //for single block, key128, key192, key256 ++ Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; ++ Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; ++ ++ Label L_exit; ++ const int PARALLEL_FACTOR = 4; //because of the limited register number ++ ++ // initialize counter with initial counter ++ __ movdqu(xmm_curr_counter, Address(counter, 0x00)); ++ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); ++ __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled for increase ++ ++ // key length could be only {11, 13, 15} * 4 = {44, 52, 60} ++ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); ++ __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); ++ __ cmpl(rax, 52); ++ __ jcc(Assembler::equal, L_key192_top); ++ __ cmpl(rax, 60); ++ __ jcc(Assembler::equal, L_key256_top); ++ ++ //key128 begins here ++ __ movptr(pos, 0); // init pos before L_multiBlock_loopTop ++ ++#define CTR_DoFour(opc, src_reg) \ ++ __ opc(xmm_result0, src_reg); \ ++ __ opc(xmm_result1, src_reg); \ ++ __ opc(xmm_result2, src_reg); \ ++ __ opc(xmm_result3, src_reg); ++ ++ // k == 0 : generate code for key_128 ++ // k == 1 : generate code for key_192 ++ // k == 2 : generate code for key_256 ++ for (int k = 0; k < 3; ++k) { ++ //multi blocks starts here ++ __ align(OptoLoopAlignment); ++ __ BIND(L_multiBlock_loopTop[k]); ++ __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left ++ __ jcc(Assembler::less, L_singleBlockLoopTop[k]); ++ ++ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); ++ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); ++ ++ //load, then increase counters ++ CTR_DoFour(movdqa, xmm_curr_counter); ++ __ push(rbx); ++ inc_counter(rbx, xmm_result1, 0x01, L_incCounter[k][0]); ++ inc_counter(rbx, xmm_result2, 0x02, L_incCounter[k][1]); ++ inc_counter(rbx, xmm_result3, 0x03, L_incCounter[k][2]); ++ inc_counter(rbx, xmm_curr_counter, 0x04, L_incCounter[k][3]); ++ __ pop (rbx); ++ ++ load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); // load Round 0 key. 
interleaving for better performance ++ ++ CTR_DoFour(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle the counters back for PXOR ++ CTR_DoFour(pxor, xmm_key); //PXOR with Round 0 key ++ ++ for (int i = 1; i < rounds[k]; ++i) { ++ load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask); ++ CTR_DoFour(aesenc, xmm_key); ++ } ++ load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask); ++ CTR_DoFour(aesenclast, xmm_key); ++ ++ // get next PARALLEL_FACTOR blocks into xmm_from registers ++ __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); ++ __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); ++ __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); ++ ++ // PXOR with input text ++ __ pxor(xmm_result0, xmm_from0); //result0 is xmm4 ++ __ pxor(xmm_result1, xmm_from1); ++ __ pxor(xmm_result2, xmm_from2); ++ ++ // store PARALLEL_FACTOR results into the next 64 bytes of output ++ __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); ++ __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); ++ __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); ++ ++ // do it here after xmm_result0 is saved, because xmm_from3 reuses the same register as xmm_result0. ++ __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); ++ __ pxor(xmm_result3, xmm_from3); ++ __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); ++ ++ __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text ++ __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length ++ __ jmp(L_multiBlock_loopTop[k]); ++ ++ // singleBlock starts here ++ __ align(OptoLoopAlignment); ++ __ BIND(L_singleBlockLoopTop[k]); ++ __ cmpptr(len_reg, 0); ++ __ jcc(Assembler::equal, L_exit); ++ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); ++ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); ++ __ movdqa(xmm_result0, xmm_curr_counter); ++ load_key(xmm_key, key, 0x00, xmm_key_shuf_mask); ++ __ push(rbx); //rbx is used for increasing counter ++ inc_counter(rbx, xmm_curr_counter, 0x01, L_incCounter_single[k]); ++ __ pop (rbx); ++ __ pshufb(xmm_result0, xmm_counter_shuf_mask); ++ __ pxor(xmm_result0, xmm_key); ++ for (int i = 1; i < rounds[k]; i++) { ++ load_key(xmm_key, key, (0x10 * i), xmm_key_shuf_mask); ++ __ aesenc(xmm_result0, xmm_key); ++ } ++ load_key(xmm_key, key, (0x10 * rounds[k]), xmm_key_shuf_mask); ++ __ aesenclast(xmm_result0, xmm_key); ++ __ cmpptr(len_reg, AESBlockSize); ++ __ jcc(Assembler::less, L_processTail_insr[k]); ++ __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); ++ __ pxor(xmm_result0, xmm_from0); ++ __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); ++ __ addptr(pos, AESBlockSize); ++ __ subptr(len_reg, AESBlockSize); ++ __ jmp(L_singleBlockLoopTop[k]); ++ ++ __ BIND(L_processTail_insr[k]); ++ __ addptr(pos, len_reg); ++ __ testptr(len_reg, 8); ++ __ jcc(Assembler::zero, L_processTail_4_insr[k]); ++ __ subptr(pos,8); ++ __ pinsrd(xmm_from0, Address(from, pos), 0); ++ __ pinsrd(xmm_from0, Address(from, pos, Address::times_1, 4), 1); ++ __ BIND(L_processTail_4_insr[k]); ++ __ testptr(len_reg, 4); ++ __ jcc(Assembler::zero, L_processTail_2_insr[k]); ++ __ subptr(pos,4); ++ __ pslldq(xmm_from0, 4); ++ __ pinsrd(xmm_from0, Address(from, pos), 0); ++ __ 
BIND(L_processTail_2_insr[k]); ++ __ testptr(len_reg, 2); ++ __ jcc(Assembler::zero, L_processTail_1_insr[k]); ++ __ subptr(pos, 2); ++ __ pslldq(xmm_from0, 2); ++ __ pinsrw(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_1_insr[k]); ++ __ testptr(len_reg, 1); ++ __ jcc(Assembler::zero, L_processTail_exit_insr[k]); ++ __ subptr(pos, 1); ++ __ pslldq(xmm_from0, 1); ++ __ pinsrb(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_exit_insr[k]); ++ ++ __ movptr(saved_encCounter_start, saved_counter_param); ++ __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); ++ __ pxor(xmm_result0, xmm_from0); ++ ++ __ testptr(len_reg, 8); ++ __ jcc(Assembler::zero, L_processTail_4_extr[k]); ++ __ pextrd(Address(to, pos), xmm_result0, 0); ++ __ pextrd(Address(to, pos, Address::times_1, 4), xmm_result0, 1); ++ __ psrldq(xmm_result0, 8); ++ __ addptr(pos, 8); ++ __ BIND(L_processTail_4_extr[k]); ++ __ testptr(len_reg, 4); ++ __ jcc(Assembler::zero, L_processTail_2_extr[k]); ++ __ pextrd(Address(to, pos), xmm_result0, 0); ++ __ psrldq(xmm_result0, 4); ++ __ addptr(pos, 4); ++ __ BIND(L_processTail_2_extr[k]); ++ __ testptr(len_reg, 2); ++ __ jcc(Assembler::zero, L_processTail_1_extr[k]); ++ __ pextrb(Address(to, pos), xmm_result0, 0); ++ __ pextrb(Address(to, pos, Address::times_1, 1), xmm_result0, 1); ++ __ psrldq(xmm_result0, 2); ++ __ addptr(pos, 2); ++ __ BIND(L_processTail_1_extr[k]); ++ __ testptr(len_reg, 1); ++ __ jcc(Assembler::zero, L_processTail_exit_extr[k]); ++ __ pextrb(Address(to, pos), xmm_result0, 0); ++ ++ __ BIND(L_processTail_exit_extr[k]); ++ __ movptr(used_addr, used_addr_param); ++ __ movl(Address(used_addr, 0), len_reg); ++ __ jmp(L_exit); ++ } ++ ++ __ BIND(L_exit); ++ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); ++ __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. 
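++ // Note: xmm_curr_counter is kept byte-reversed for the entire stub so that inc_counter can bump it with plain integer adds; the pshufb above restores the counter's original byte order before it is written back to the Java counter array below.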
++ __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back ++ handleSOERegisters(false /*restoring*/); ++ __ movptr(rax, len_param); // return length ++ __ leave(); // required for proper stackwalking of RuntimeStub frame ++ __ ret(0); ++ ++ __ BIND (L_key192_top); ++ __ movptr(pos, 0); // init pos before L_multiBlock_loopTop ++ __ jmp(L_multiBlock_loopTop[1]); //key192 ++ ++ __ BIND (L_key256_top); ++ __ movptr(pos, 0); // init pos before L_multiBlock_loopTop ++ __ jmp(L_multiBlock_loopTop[2]); //key256 ++ ++ return start; ++ } ++ ++ + // byte swap x86 long + address generate_ghash_long_swap_mask() { + __ align(CodeEntryAlignment); +@@ -3181,6 +3520,11 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); + } + ++ if (UseAESCTRIntrinsics) { ++ StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask(); ++ StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); ++ } ++ + // Generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); +diff --git a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp +index c5811b28b..254f63392 100644 +--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp ++++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp +@@ -3010,6 +3010,15 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ address generate_counter_shuffle_mask() { ++ __ align(16); ++ StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask"); ++ address start = __ pc(); ++ __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); ++ __ emit_data64(0x0001020304050607, relocInfo::none); ++ return start; ++ } ++ + // Utility routine for loading a 128-bit key word in little endian format + // can optionally specify that the shuffle mask is already in an xmmregister + void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { +@@ -3021,6 +3030,18 @@ class StubGenerator: public StubCodeGenerator { + } + } + ++ // Utility routine for increasing the 128-bit counter (the iv in CTR mode) ++ void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) { ++ __ pextrq(reg, xmmdst, 0x0); ++ __ addq(reg, inc_delta); ++ __ pinsrq(xmmdst, reg, 0x0); ++ __ jcc(Assembler::carryClear, next_block); // jump if no carry ++ __ pextrq(reg, xmmdst, 0x01); // Carry ++ __ addq(reg, 0x01); ++ __ pinsrq(xmmdst, reg, 0x01); //Carry end ++ __ BIND(next_block); // next instruction ++ } ++ + // Arguments: + // + // Inputs: +@@ -3639,6 +3660,320 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time ++ // to hide instruction latency ++ // ++ // Arguments: ++ // ++ // Inputs: ++ // c_rarg0 - source byte array address ++ // c_rarg1 - destination byte array address ++ // c_rarg2 - K (key) in little endian int array ++ // c_rarg3 - counter vector byte array address ++ // Linux ++ // c_rarg4 - input length ++ // c_rarg5 - saved encryptedCounter start ++ // rbp + 6 * wordSize - saved used length ++ // Windows ++ // rbp + 6 * wordSize - input length ++ // rbp + 7 * wordSize - saved encryptedCounter start ++ // rbp + 8 * wordSize - saved used length ++ // ++ // Output: ++ // rax - input length ++ // ++ address generate_counterMode_AESCrypt_Parallel() { ++ assert(UseAES, "need AES instructions
and misaligned SSE support"); ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); ++ address start = __ pc(); ++ const Register from = c_rarg0; // source array address ++ const Register to = c_rarg1; // destination array address ++ const Register key = c_rarg2; // key array address ++ const Register counter = c_rarg3; // counter byte array initialized from counter array address ++ // and left with the results of the last encryption block ++#ifndef _WIN64 ++ const Register len_reg = c_rarg4; ++ const Register saved_encCounter_start = c_rarg5; ++ const Register used_addr = r10; ++ const Address used_mem(rbp, 2 * wordSize); ++ const Register used = r11; ++#else ++ const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 ++ const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter is on stack on Win64 ++ const Address used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64 ++ const Register len_reg = r10; // pick the first volatile windows register ++ const Register saved_encCounter_start = r11; ++ const Register used_addr = r13; ++ const Register used = r14; ++#endif ++ const Register pos = rax; ++ ++ const int PARALLEL_FACTOR = 6; ++ const XMMRegister xmm_counter_shuf_mask = xmm0; ++ const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front ++ const XMMRegister xmm_curr_counter = xmm2; ++ ++ const XMMRegister xmm_key_tmp0 = xmm3; ++ const XMMRegister xmm_key_tmp1 = xmm4; ++ ++ // registers holding the six results in the parallelized loop ++ const XMMRegister xmm_result0 = xmm5; ++ const XMMRegister xmm_result1 = xmm6; ++ const XMMRegister xmm_result2 = xmm7; ++ const XMMRegister xmm_result3 = xmm8; ++ const XMMRegister xmm_result4 = xmm9; ++ const XMMRegister xmm_result5 = xmm10; ++ ++ const XMMRegister xmm_from0 = xmm11; ++ const XMMRegister xmm_from1 = xmm12; ++ const XMMRegister xmm_from2 = xmm13; ++ const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64. ++ const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. 
Because xmm_key_tmp0~1 are not needed while the input text is loaded ++ const XMMRegister xmm_from5 = xmm4; ++ ++ //for key_128, key_192, key_256 ++ const int rounds[3] = {10, 12, 14}; ++ Label L_exit_preLoop, L_preLoop_start; ++ Label L_multiBlock_loopTop[3]; ++ Label L_singleBlockLoopTop[3]; ++ Label L__incCounter[3][6]; //for 6 blocks ++ Label L__incCounter_single[3]; //for single block, key128, key192, key256 ++ Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; ++ Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; ++ ++ Label L_exit; ++ ++ __ enter(); // required for proper stackwalking of RuntimeStub frame ++ ++#ifdef _WIN64 ++ // save the xmm registers which must be preserved 6-14 ++ const int XMM_REG_NUM_KEY_LAST = 14; ++ __ subptr(rsp, -rsp_after_call_off * wordSize); ++ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { ++ __ movdqu(xmm_save(i), as_XMMRegister(i)); ++ } ++ ++ const Address r13_save(rbp, rdi_off * wordSize); ++ const Address r14_save(rbp, rsi_off * wordSize); ++ ++ __ movptr(r13_save, r13); ++ __ movptr(r14_save, r14); ++ ++ // on win64, fill len_reg from stack position ++ __ movl(len_reg, len_mem); ++ __ movptr(saved_encCounter_start, saved_encCounter_mem); ++ __ movptr(used_addr, used_mem); ++ __ movl(used, Address(used_addr, 0)); ++#else ++ __ push(len_reg); // Save ++ __ movptr(used_addr, used_mem); ++ __ movl(used, Address(used_addr, 0)); ++#endif ++ ++ __ push(rbx); // Save RBX ++ __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter ++ __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr())); ++ __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled ++ __ movptr(pos, 0); ++ ++ // Use the partially used encrypted counter from the last invocation ++ __ BIND(L_preLoop_start); ++ __ cmpptr(used, 16); ++ __ jcc(Assembler::aboveEqual, L_exit_preLoop); ++ __ cmpptr(len_reg, 0); ++ __ jcc(Assembler::lessEqual, L_exit_preLoop); ++ __ movb(rbx, Address(saved_encCounter_start, used)); ++ __ xorb(rbx, Address(from, pos)); ++ __ movb(Address(to, pos), rbx); ++ __ addptr(pos, 1); ++ __ addptr(used, 1); ++ __ subptr(len_reg, 1); ++ ++ __ jmp(L_preLoop_start); ++ ++ __ BIND(L_exit_preLoop); ++ __ movl(Address(used_addr, 0), used); ++ ++ // key length could be only {11, 13, 15} * 4 = {44, 52, 60} ++ __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); ++ __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); ++ __ cmpl(rbx, 52); ++ __ jcc(Assembler::equal, L_multiBlock_loopTop[1]); ++ __ cmpl(rbx, 60); ++ __ jcc(Assembler::equal, L_multiBlock_loopTop[2]); ++ ++#define CTR_DoSix(opc, src_reg) \ ++ __ opc(xmm_result0, src_reg); \ ++ __ opc(xmm_result1, src_reg); \ ++ __ opc(xmm_result2, src_reg); \ ++ __ opc(xmm_result3, src_reg); \ ++ __ opc(xmm_result4, src_reg); \ ++ __ opc(xmm_result5, src_reg); ++ ++ // k == 0 : generate code for key_128 ++ // k == 1 : generate code for key_192 ++ // k == 2 : generate code for key_256 ++ for (int k = 0; k < 3; ++k) { ++ //multi blocks starts here ++ __ align(OptoLoopAlignment); ++ __ BIND(L_multiBlock_loopTop[k]); ++ __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left ++ __ jcc(Assembler::less, L_singleBlockLoopTop[k]); ++ load_key(xmm_key_tmp0, key, 
0x00, xmm_key_shuf_mask); ++ ++ //load, then increase counters ++ CTR_DoSix(movdqa, xmm_curr_counter); ++ inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]); ++ inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]); ++ inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]); ++ inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]); ++ inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]); ++ inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]); ++ CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after the increment, shuffle the counters back for PXOR ++ CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key ++ ++ //load two ROUND_KEYs at a time ++ for (int i = 1; i < rounds[k]; ) { ++ load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask); ++ load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask); ++ CTR_DoSix(aesenc, xmm_key_tmp1); ++ i++; ++ if (i != rounds[k]) { ++ CTR_DoSix(aesenc, xmm_key_tmp0); ++ } else { ++ CTR_DoSix(aesenclast, xmm_key_tmp0); ++ } ++ i++; ++ } ++ ++ // get next PARALLEL_FACTOR blocks into the xmm_from registers ++ __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); ++ __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); ++ __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); ++ __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); ++ __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize)); ++ __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize)); ++ ++ __ pxor(xmm_result0, xmm_from0); ++ __ pxor(xmm_result1, xmm_from1); ++ __ pxor(xmm_result2, xmm_from2); ++ __ pxor(xmm_result3, xmm_from3); ++ __ pxor(xmm_result4, xmm_from4); ++ __ pxor(xmm_result5, xmm_from5); ++ ++ // store 6 results into the next 96 bytes of output ++ __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); ++ __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); ++ __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); ++ __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); ++ __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4); ++ __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5); ++ ++ __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text ++ __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length ++ __ jmp(L_multiBlock_loopTop[k]); ++ ++ // singleBlock starts here ++ __ align(OptoLoopAlignment); ++ __ BIND(L_singleBlockLoopTop[k]); ++ __ cmpptr(len_reg, 0); ++ __ jcc(Assembler::lessEqual, L_exit); ++ load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); ++ __ movdqa(xmm_result0, xmm_curr_counter); ++ inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]); ++ __ pshufb(xmm_result0, xmm_counter_shuf_mask); ++ __ pxor(xmm_result0, xmm_key_tmp0); ++ for (int i = 1; i < rounds[k]; i++) { ++ load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask); ++ __ aesenc(xmm_result0, xmm_key_tmp0); ++ } ++ load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask); ++ __ aesenclast(xmm_result0, xmm_key_tmp0); ++ __ cmpptr(len_reg, AESBlockSize); ++ __ jcc(Assembler::less, L_processTail_insr[k]); ++ __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); ++ __ pxor(xmm_result0, xmm_from0); ++ __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); ++ __ addptr(pos, 
AESBlockSize); ++ __ subptr(len_reg, AESBlockSize); ++ __ jmp(L_singleBlockLoopTop[k]); ++ __ BIND(L_processTail_insr[k]); ++ __ addptr(pos, len_reg); ++ __ testptr(len_reg, 8); ++ __ jcc(Assembler::zero, L_processTail_4_insr[k]); ++ __ subptr(pos,8); ++ __ pinsrq(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_4_insr[k]); ++ __ testptr(len_reg, 4); ++ __ jcc(Assembler::zero, L_processTail_2_insr[k]); ++ __ subptr(pos,4); ++ __ pslldq(xmm_from0, 4); ++ __ pinsrd(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_2_insr[k]); ++ __ testptr(len_reg, 2); ++ __ jcc(Assembler::zero, L_processTail_1_insr[k]); ++ __ subptr(pos, 2); ++ __ pslldq(xmm_from0, 2); ++ __ pinsrw(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_1_insr[k]); ++ __ testptr(len_reg, 1); ++ __ jcc(Assembler::zero, L_processTail_exit_insr[k]); ++ __ subptr(pos, 1); ++ __ pslldq(xmm_from0, 1); ++ __ pinsrb(xmm_from0, Address(from, pos), 0); ++ __ BIND(L_processTail_exit_insr[k]); ++ ++ __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); ++ __ pxor(xmm_result0, xmm_from0); ++ ++ __ testptr(len_reg, 8); ++ __ jcc(Assembler::zero, L_processTail_4_extr[k]); ++ __ pextrq(Address(to, pos), xmm_result0, 0); ++ __ psrldq(xmm_result0, 8); ++ __ addptr(pos, 8); ++ __ BIND(L_processTail_4_extr[k]); ++ __ testptr(len_reg, 4); ++ __ jcc(Assembler::zero, L_processTail_2_extr[k]); ++ __ pextrd(Address(to, pos), xmm_result0, 0); ++ __ psrldq(xmm_result0, 4); ++ __ addptr(pos, 4); ++ __ BIND(L_processTail_2_extr[k]); ++ __ testptr(len_reg, 2); ++ __ jcc(Assembler::zero, L_processTail_1_extr[k]); ++ __ pextrw(Address(to, pos), xmm_result0, 0); ++ __ psrldq(xmm_result0, 2); ++ __ addptr(pos, 2); ++ __ BIND(L_processTail_1_extr[k]); ++ __ testptr(len_reg, 1); ++ __ jcc(Assembler::zero, L_processTail_exit_extr[k]); ++ __ pextrb(Address(to, pos), xmm_result0, 0); ++ ++ __ BIND(L_processTail_exit_extr[k]); ++ __ movl(Address(used_addr, 0), len_reg); ++ __ jmp(L_exit); ++ ++ } ++ ++ __ BIND(L_exit); ++ __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. ++ __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back ++ __ pop(rbx); // pop the saved RBX. 
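++ // (The Win64 ABI treats xmm6-xmm15 and r13/r14 as callee-saved, hence the restore block below; on Linux the input length was pushed at entry and is popped straight into rax as the return value.)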
++#ifdef _WIN64 ++ // restore regs belonging to calling function ++ for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { ++ __ movdqu(as_XMMRegister(i), xmm_save(i)); ++ } ++ __ movl(rax, len_mem); ++ __ movptr(r13, r13_save); ++ __ movptr(r14, r14_save); ++#else ++ __ pop(rax); // return 'len' ++#endif ++ __ leave(); // required for proper stackwalking of RuntimeStub frame ++ __ ret(0); ++ return start; ++ } + + // byte swap x86 long + address generate_ghash_long_swap_mask() { +@@ -4239,12 +4574,15 @@ class StubGenerator: public StubCodeGenerator { + // don't bother generating these AES intrinsic stubs unless global flag is set + if (UseAESIntrinsics) { + StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask(); // needed by the others +- + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); + } ++ if (UseAESCTRIntrinsics){ ++ StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask(); ++ StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel(); ++ } + + // Generate GHASH intrinsics code + if (UseGHASHIntrinsics) { +diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp +index 9b0d8fc75..617879377 100644 +--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp ++++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp +@@ -33,6 +33,7 @@ + + address StubRoutines::x86::_verify_mxcsr_entry = NULL; + address StubRoutines::x86::_key_shuffle_mask_addr = NULL; ++address StubRoutines::x86::_counter_shuffle_mask_addr = NULL; + address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL; + address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL; + +diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp +index bb160486c..70b5a34ac 100644 +--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp ++++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp +@@ -33,6 +33,10 @@ + static address _verify_mxcsr_entry; + // shuffle mask for fixing up 128-bit words consisting of big-endian 32-bit integers + static address _key_shuffle_mask_addr; ++ ++ //shuffle mask for big-endian 128-bit integers ++ static address _counter_shuffle_mask_addr; ++ + // masks and table for CRC32 + static uint64_t _crc_by128_masks[]; + static juint _crc_table[]; +@@ -43,6 +47,7 @@ + public: + static address verify_mxcsr_entry() { return _verify_mxcsr_entry; } + static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; } ++ static address counter_shuffle_mask_addr() { return _counter_shuffle_mask_addr; } + static address crc_by128_masks_addr() { return (address)_crc_by128_masks; } + static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; } + static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; } +diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp +index bca5d493c..538f83e69 100644 +--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp ++++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_32.hpp +@@ -31,7 +31,7 @@ + + enum platform_dependent_constants { + code_size1 = 9000, // simply increase if too small (assembler will crash if too small) +- code_size2 = 22000 // simply increase if too 
small (assembler will crash if too small) ++ code_size2 = 25800 // simply increase if too small (assembler will crash if too small) + }; + + class x86 { +diff --git a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp +index b048fd74e..f963cd2f8 100644 +--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp ++++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp +@@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _ + + enum platform_dependent_constants { + code_size1 = 19000, // simply increase if too small (assembler will crash if too small) +- code_size2 = 24000 // simply increase if too small (assembler will crash if too small) ++ code_size2 = 27000 // simply increase if too small (assembler will crash if too small) + }; + + class x86 { +diff --git a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp +index 46b3e32ea..ce3037d76 100644 +--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp ++++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp +@@ -573,6 +573,28 @@ void VM_Version::get_processor_features() { + } + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } ++ ++ // --AES-CTR begins-- ++ if (!UseAESIntrinsics) { ++ if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ warning("AES-CTR intrinsics require UseAESIntrinsics flag to be enabled. Intrinsics will be disabled."); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } ++ } else { ++ if (supports_sse4_1() && UseSSE >= 4) { ++ if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, true); ++ } ++ } else { ++ // The AES-CTR intrinsic stubs require AES instruction support (of course) ++ // but also require SSE4.1 mode or higher for the instructions they use. ++ if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ warning("X86 AES-CTR intrinsics require SSE4.1 instructions or higher. Intrinsics will be disabled."); ++ } ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } ++ } ++ // --AES-CTR ends-- + } + } else if (UseAES || UseAESIntrinsics) { + if (UseAES && !FLAG_IS_DEFAULT(UseAES)) { +@@ -583,6 +605,10 @@ void VM_Version::get_processor_features() { + warning("AES intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseAESIntrinsics, false); + } ++ if (UseAESCTRIntrinsics && !FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ warning("AES-CTR intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } + } + + // Use CLMUL instructions if available. 
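For context, the Java loop that this flag machinery ultimately gates is CounterMode.implCrypt(), which appears later in this patch. A simplified sketch of that loop (field names as in the CounterMode.java diff below; blockSize is 16 for AES, and increment() stands in for the big-endian counter bump that the stubs' inc_counter performs in assembly):

    // Sketch of the scalar Java fallback the counterMode_AESCrypt stub replaces.
    private int implCrypt(byte[] in, int inOff, int len, byte[] out, int outOff) {
        int result = len;
        while (len-- > 0) {
            // refresh the encrypted counter once all 16 of its bytes are consumed
            if (used >= blockSize) {
                embeddedCipher.encryptBlock(counter, 0, encryptedCounter, 0);
                increment(counter);
                used = 0;
            }
            // CTR keystream: XOR one input byte with one encrypted-counter byte
            out[outOff++] = (byte) (in[inOff++] ^ encryptedCounter[used++]);
        }
        return result;
    }

The stubs above vectorize exactly this byte-at-a-time loop, which is why they must keep the used count and the saved encryptedCounter block in sync with the Java fields between invocations.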
+@@ -606,6 +632,16 @@ void VM_Version::get_processor_features() { + FLAG_SET_DEFAULT(UseCRC32Intrinsics, false); + } + ++ if (UseAESIntrinsics) { ++ if (FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) { ++ UseAESCTRIntrinsics = true; ++ } ++ } else if (UseAESCTRIntrinsics) { ++ if (!FLAG_IS_DEFAULT(UseAESCTRIntrinsics)) ++ warning("AES/CTR intrinsics are not available on this CPU"); ++ FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); ++ } ++ + // GHASH/GCM intrinsics + if (UseCLMUL && (UseSSE > 2)) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { +diff --git a/hotspot/src/share/vm/classfile/vmSymbols.hpp b/hotspot/src/share/vm/classfile/vmSymbols.hpp +index 942d172a1..4ca2a3ad4 100644 +--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp ++++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp +@@ -846,6 +846,10 @@ + do_name( decrypt_name, "implDecrypt") \ + do_signature(byteArray_int_int_byteArray_int_signature, "([BII[BI)I") \ + \ ++ do_class(com_sun_crypto_provider_counterMode, "com/sun/crypto/provider/CounterMode") \ ++ do_intrinsic(_counterMode_AESCrypt, com_sun_crypto_provider_counterMode, crypt_name, byteArray_int_int_byteArray_int_signature, F_R) \ ++ do_name( crypt_name, "implCrypt") \ ++ \ + /* support for sun.security.provider.SHA */ \ + do_class(sun_security_provider_sha, "sun/security/provider/SHA") \ + do_intrinsic(_sha_implCompress, sun_security_provider_sha, implCompress_name, implCompress_signature, F_R) \ +diff --git a/hotspot/src/share/vm/opto/escape.cpp b/hotspot/src/share/vm/opto/escape.cpp +index 6f8ffe608..a0e497f08 100644 +--- a/hotspot/src/share/vm/opto/escape.cpp ++++ b/hotspot/src/share/vm/opto/escape.cpp +@@ -952,6 +952,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { + strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 || + strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 || ++ strcmp(call->as_CallLeaf()->_name, "counterMode_AESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 || + strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 || + strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 || +diff --git a/hotspot/src/share/vm/opto/library_call.cpp b/hotspot/src/share/vm/opto/library_call.cpp +index bb721f6f1..2add82dd1 100644 +--- a/hotspot/src/share/vm/opto/library_call.cpp ++++ b/hotspot/src/share/vm/opto/library_call.cpp +@@ -196,6 +196,7 @@ class LibraryCallKit : public GraphKit { + return generate_method_call(method_id, true, false); + } + Node * load_field_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static); ++ Node * field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, bool is_exact, bool is_static, ciInstanceKlass * fromKls); + + Node* make_string_method_node(int opcode, Node* str1_start, Node* cnt1, Node* str2_start, Node* cnt2); + Node* make_string_method_node(int opcode, Node* str1, Node* str2); +@@ -309,7 +310,9 @@ class LibraryCallKit : public GraphKit { + bool inline_reference_get(); + bool inline_aescrypt_Block(vmIntrinsics::ID id); + bool inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id); ++ bool inline_counterMode_AESCrypt(vmIntrinsics::ID id); + Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting); ++ Node* inline_counterMode_AESCrypt_predicate(); + Node* get_key_start_from_aescrypt_object(Node* aescrypt_object); + Node* 
get_original_key_start_from_aescrypt_object(Node* aescrypt_object); + bool inline_ghash_processBlocks(); +@@ -558,6 +561,13 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { + predicates = 1; + break; + ++ case vmIntrinsics::_counterMode_AESCrypt: ++ if (!UseAESCTRIntrinsics) { ++ return NULL; ++ } ++ predicates = 1; ++ break; ++ + case vmIntrinsics::_sha_implCompress: + if (!UseSHA1Intrinsics) return NULL; + break; +@@ -950,6 +960,9 @@ bool LibraryCallKit::try_to_inline(int predicate) { + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + return inline_cipherBlockChaining_AESCrypt(intrinsic_id()); + ++ case vmIntrinsics::_counterMode_AESCrypt: ++ return inline_counterMode_AESCrypt(intrinsic_id()); ++ + case vmIntrinsics::_sha_implCompress: + case vmIntrinsics::_sha2_implCompress: + case vmIntrinsics::_sha5_implCompress: +@@ -1021,6 +1034,8 @@ Node* LibraryCallKit::try_to_predicate(int predicate) { + return inline_cipherBlockChaining_AESCrypt_predicate(false); + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + return inline_cipherBlockChaining_AESCrypt_predicate(true); ++ case vmIntrinsics::_counterMode_AESCrypt: ++ return inline_counterMode_AESCrypt_predicate(); + case vmIntrinsics::_digestBase_implCompressMB: + return inline_digestBase_implCompressMB_predicate(predicate); + +@@ -6581,6 +6596,39 @@ Node * LibraryCallKit::load_field_from_object(Node * fromObj, const char * field + return loadedField; + } + ++Node * LibraryCallKit::field_address_from_object(Node * fromObj, const char * fieldName, const char * fieldTypeString, ++ bool is_exact = true, bool is_static = false, ++ ciInstanceKlass * fromKls = NULL) { ++ if (fromKls == NULL) { ++ const TypeInstPtr* tinst = _gvn.type(fromObj)->isa_instptr(); ++ assert(tinst != NULL, "obj is null"); ++ assert(tinst->klass()->is_loaded(), "obj is not loaded"); ++ assert(!is_exact || tinst->klass_is_exact(), "klass not exact"); ++ fromKls = tinst->klass()->as_instance_klass(); ++ } ++ else { ++ assert(is_static, "only for static field access"); ++ } ++ ciField* field = fromKls->get_field_by_name(ciSymbol::make(fieldName), ++ ciSymbol::make(fieldTypeString), ++ is_static); ++ ++ assert(field != NULL, "undefined field"); ++ assert(!field->is_volatile(), "not defined for volatile fields"); ++ ++ if (is_static) { ++ const TypeInstPtr* tip = TypeInstPtr::make(fromKls->java_mirror()); ++ fromObj = makecon(tip); ++ } ++ ++ // Next code copied from Parse::do_get_xxx(): ++ ++ // Compute address and memory type. 
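++ // (Note: unlike load_field_from_object() above, this helper returns the field's address rather than its value; inline_counterMode_AESCrypt() below uses it so the stub can read and update CounterMode.used in place across calls.)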
++ int offset = field->offset_in_bytes(); ++ Node *adr = basic_plus_adr(fromObj, fromObj, offset); ++ ++ return adr; ++} + + //------------------------------inline_aescrypt_Block----------------------- + bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) { +@@ -6747,6 +6795,90 @@ bool LibraryCallKit::inline_cipherBlockChaining_AESCrypt(vmIntrinsics::ID id) { + return true; + } + ++//------------------------------inline_counterMode_AESCrypt----------------------- ++bool LibraryCallKit::inline_counterMode_AESCrypt(vmIntrinsics::ID id) { ++ assert(UseAES, "need AES instruction support"); ++ if (!UseAESCTRIntrinsics) return false; ++ ++ address stubAddr = NULL; ++ const char *stubName = NULL; ++ if (id == vmIntrinsics::_counterMode_AESCrypt) { ++ stubAddr = StubRoutines::counterMode_AESCrypt(); ++ stubName = "counterMode_AESCrypt"; ++ } ++ if (stubAddr == NULL) return false; ++ ++ Node* counterMode_object = argument(0); ++ Node* src = argument(1); ++ Node* src_offset = argument(2); ++ Node* len = argument(3); ++ Node* dest = argument(4); ++ Node* dest_offset = argument(5); ++ ++ // (1) src and dest are arrays. ++ const Type* src_type = src->Value(&_gvn); ++ const Type* dest_type = dest->Value(&_gvn); ++ const TypeAryPtr* top_src = src_type->isa_aryptr(); ++ const TypeAryPtr* top_dest = dest_type->isa_aryptr(); ++ assert(top_src != NULL && top_src->klass() != NULL && ++ top_dest != NULL && top_dest->klass() != NULL, "args are strange"); ++ ++ // checks are the responsibility of the caller ++ Node* src_start = src; ++ Node* dest_start = dest; ++ if (src_offset != NULL || dest_offset != NULL) { ++ assert(src_offset != NULL && dest_offset != NULL, ""); ++ src_start = array_element_address(src, src_offset, T_BYTE); ++ dest_start = array_element_address(dest, dest_offset, T_BYTE); ++ } ++ ++ // if we are in this set of code, we "know" the embeddedCipher is an AESCrypt object ++ // (because of the predicated logic executed earlier). ++ // so we cast it here safely. 
++ // this requires a newer class file that has this array as littleEndian ints, otherwise we revert to java ++ Node* embeddedCipherObj = load_field_from_object(counterMode_object, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false); ++ if (embeddedCipherObj == NULL) return false; ++ // cast it to what we know it will be at runtime ++ const TypeInstPtr* tinst = _gvn.type(counterMode_object)->isa_instptr(); ++ assert(tinst != NULL, "CTR obj is null"); ++ assert(tinst->klass()->is_loaded(), "CTR obj is not loaded"); ++ ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt")); ++ assert(klass_AESCrypt->is_loaded(), "predicate checks that this class is loaded"); ++ ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass(); ++ const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_AESCrypt); ++ const TypeOopPtr* xtype = aklass->as_instance_type(); ++ Node* aescrypt_object = new (C) CheckCastPPNode(control(), embeddedCipherObj, xtype); ++ aescrypt_object = _gvn.transform(aescrypt_object); ++ // we need to get the start of the aescrypt_object's expanded key array ++ Node* k_start = get_key_start_from_aescrypt_object(aescrypt_object); ++ if (k_start == NULL) return false; ++ // similarly, get the start address of the counter vector ++ Node* obj_counter = load_field_from_object(counterMode_object, "counter", "[B", /*is_exact*/ false); ++ if (obj_counter == NULL) return false; ++ Node* cnt_start = array_element_address(obj_counter, intcon(0), T_BYTE); ++ ++ Node* saved_encCounter = load_field_from_object(counterMode_object, "encryptedCounter", "[B", /*is_exact*/ false); ++ if (saved_encCounter == NULL) return false; ++ Node* saved_encCounter_start = array_element_address(saved_encCounter, intcon(0), T_BYTE); ++ Node* used = field_address_from_object(counterMode_object, "used", "I", /*is_exact*/ false); ++ ++ Node* ctrCrypt; ++ if (Matcher::pass_original_key_for_aes()) { ++ // no SPARC version for AES/CTR intrinsics now. ++ return false; ++ } ++ // Call the stub, passing src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start and used ++ ctrCrypt = make_runtime_call(RC_LEAF|RC_NO_FP, ++ OptoRuntime::counterMode_aescrypt_Type(), ++ stubAddr, stubName, TypePtr::BOTTOM, ++ src_start, dest_start, k_start, cnt_start, len, saved_encCounter_start, used); ++ ++ // return cipher length (int) ++ Node* retvalue = _gvn.transform(new (C) ProjNode(ctrCrypt, TypeFunc::Parms)); ++ set_result(retvalue); ++ return true; ++} ++ + //------------------------------get_key_start_from_aescrypt_object----------------------- + Node * LibraryCallKit::get_key_start_from_aescrypt_object(Node *aescrypt_object) { + #ifdef PPC64 +@@ -6841,6 +6973,48 @@ Node* LibraryCallKit::inline_cipherBlockChaining_AESCrypt_predicate(bool decrypt + return _gvn.transform(region); + } + ++//----------------------------inline_counterMode_AESCrypt_predicate---------------------------- ++// Return node representing slow path of predicate check. ++// the pseudo code we want to emulate with this predicate is: ++// for encryption: ++// if (embeddedCipherObj instanceof AESCrypt) do_intrinsic, else do_javapath ++// for decryption: ++// if ((embeddedCipherObj instanceof AESCrypt) && (cipher!=plain)) do_intrinsic, else do_javapath ++// note cipher==plain is more conservative than the original java code but that's OK ++// ++ ++Node* LibraryCallKit::inline_counterMode_AESCrypt_predicate() { ++ // The receiver was checked for NULL already. 
++ Node* objCTR = argument(0); ++ ++ // Load the embeddedCipher field of the CounterMode object. ++ Node* embeddedCipherObj = load_field_from_object(objCTR, "embeddedCipher", "Lcom/sun/crypto/provider/SymmetricCipher;", /*is_exact*/ false); ++ ++ // get AESCrypt klass for instanceOf check ++ // AESCrypt might not be loaded yet if some other SymmetricCipher got us to this compile point ++ // will have same classloader as the CounterMode object ++ const TypeInstPtr* tinst = _gvn.type(objCTR)->isa_instptr(); ++ assert(tinst != NULL, "CTRobj is null"); ++ assert(tinst->klass()->is_loaded(), "CTRobj is not loaded"); ++ ++ // we want to do an instanceof comparison against the AESCrypt class ++ ciKlass* klass_AESCrypt = tinst->klass()->as_instance_klass()->find_klass(ciSymbol::make("com/sun/crypto/provider/AESCrypt")); ++ if (!klass_AESCrypt->is_loaded()) { ++ // if AESCrypt is not even loaded, we never take the intrinsic fast path ++ Node* ctrl = control(); ++ set_control(top()); // no regular fast path ++ return ctrl; ++ } ++ ++ ciInstanceKlass* instklass_AESCrypt = klass_AESCrypt->as_instance_klass(); ++ Node* instof = gen_instanceof(embeddedCipherObj, makecon(TypeKlassPtr::make(instklass_AESCrypt))); ++ Node* cmp_instof = _gvn.transform(new (C) CmpINode(instof, intcon(1))); ++ Node* bool_instof = _gvn.transform(new (C) BoolNode(cmp_instof, BoolTest::ne)); ++ Node* instof_false = generate_guard(bool_instof, NULL, PROB_MIN); ++ ++ return instof_false; // even if it is NULL ++} ++ + //------------------------------inline_ghash_processBlocks + bool LibraryCallKit::inline_ghash_processBlocks() { + address stubAddr; +diff --git a/hotspot/src/share/vm/opto/runtime.cpp b/hotspot/src/share/vm/opto/runtime.cpp +index 0a86211ba..1c51be19b 100644 +--- a/hotspot/src/share/vm/opto/runtime.cpp ++++ b/hotspot/src/share/vm/opto/runtime.cpp +@@ -1021,6 +1021,35 @@ const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { + return TypeFunc::make(domain, range); + } + ++//for counterMode calls of aescrypt encrypt/decrypt, six pointers and a length, returning int ++const TypeFunc* OptoRuntime::counterMode_aescrypt_Type() { ++ // create input type (domain) ++ int num_args = 7; ++ if (Matcher::pass_original_key_for_aes()) { ++ num_args = 8; ++ } ++ int argcnt = num_args; ++ const Type** fields = TypeTuple::fields(argcnt); ++ int argp = TypeFunc::Parms; ++ fields[argp++] = TypePtr::NOTNULL; // src ++ fields[argp++] = TypePtr::NOTNULL; // dest ++ fields[argp++] = TypePtr::NOTNULL; // k array ++ fields[argp++] = TypePtr::NOTNULL; // counter array ++ fields[argp++] = TypeInt::INT; // src len ++ fields[argp++] = TypePtr::NOTNULL; // saved_encCounter ++ fields[argp++] = TypePtr::NOTNULL; // saved used addr ++ if (Matcher::pass_original_key_for_aes()) { ++ fields[argp++] = TypePtr::NOTNULL; // original k array ++ } ++ assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); ++ const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields); ++ // returning cipher len (int) ++ fields = TypeTuple::fields(1); ++ fields[TypeFunc::Parms + 0] = TypeInt::INT; ++ const TypeTuple* range = TypeTuple::make(TypeFunc::Parms + 1, fields); ++ return TypeFunc::make(domain, range); ++} ++ + /* + * void implCompress(byte[] buf, int ofs) + */ +diff --git a/hotspot/src/share/vm/opto/runtime.hpp b/hotspot/src/share/vm/opto/runtime.hpp +index 47133d58c..f27e7d507 100644 +--- a/hotspot/src/share/vm/opto/runtime.hpp ++++ b/hotspot/src/share/vm/opto/runtime.hpp +@@ -299,6 +299,7 @@ private: + + static const 
TypeFunc* aescrypt_block_Type(); + static const TypeFunc* cipherBlockChaining_aescrypt_Type(); ++ static const TypeFunc* counterMode_aescrypt_Type(); + + static const TypeFunc* sha_implCompress_Type(); + static const TypeFunc* digestBase_implCompressMB_Type(); +diff --git a/hotspot/src/share/vm/runtime/globals.hpp b/hotspot/src/share/vm/runtime/globals.hpp +index 65dfcf69b..91e52f033 100644 +--- a/hotspot/src/share/vm/runtime/globals.hpp ++++ b/hotspot/src/share/vm/runtime/globals.hpp +@@ -734,6 +734,9 @@ class CommandLineFlags { + product(bool, UseAESIntrinsics, false, \ + "Use intrinsics for AES versions of crypto") \ + \ ++ product(bool, UseAESCTRIntrinsics, false, \ ++ "Use intrinsics for the paralleled version of AES/CTR crypto") \ ++ \ + product(bool, UseSHA1Intrinsics, false, \ + "Use intrinsics for SHA-1 crypto hash function") \ + \ +diff --git a/hotspot/src/share/vm/runtime/stubRoutines.cpp b/hotspot/src/share/vm/runtime/stubRoutines.cpp +index f2106d13a..d66237137 100644 +--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp ++++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp +@@ -124,6 +124,7 @@ address StubRoutines::_aescrypt_encryptBlock = NULL; + address StubRoutines::_aescrypt_decryptBlock = NULL; + address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL; + address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL; ++address StubRoutines::_counterMode_AESCrypt = NULL; + address StubRoutines::_ghash_processBlocks = NULL; + + address StubRoutines::_sha1_implCompress = NULL; +diff --git a/hotspot/src/share/vm/runtime/stubRoutines.hpp b/hotspot/src/share/vm/runtime/stubRoutines.hpp +index 16075d9f4..9fb589540 100644 +--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp ++++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp +@@ -202,6 +202,7 @@ class StubRoutines: AllStatic { + static address _aescrypt_decryptBlock; + static address _cipherBlockChaining_encryptAESCrypt; + static address _cipherBlockChaining_decryptAESCrypt; ++ static address _counterMode_AESCrypt; + static address _ghash_processBlocks; + + static address _sha1_implCompress; +@@ -370,6 +371,7 @@ class StubRoutines: AllStatic { + static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; } + static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; } + static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; } ++ static address counterMode_AESCrypt() { return _counterMode_AESCrypt; } + static address ghash_processBlocks() { return _ghash_processBlocks; } + + static address sha1_implCompress() { return _sha1_implCompress; } +diff --git a/hotspot/src/share/vm/runtime/vmStructs.cpp b/hotspot/src/share/vm/runtime/vmStructs.cpp +index 3f2bfeb74..842b5840d 100644 +--- a/hotspot/src/share/vm/runtime/vmStructs.cpp ++++ b/hotspot/src/share/vm/runtime/vmStructs.cpp +@@ -815,6 +815,7 @@ typedef TwoOopHashtable SymbolTwoOopHashtable; + static_field(StubRoutines, _aescrypt_decryptBlock, address) \ + static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \ + static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \ ++ static_field(StubRoutines, _counterMode_AESCrypt, address) \ + static_field(StubRoutines, _ghash_processBlocks, address) \ + static_field(StubRoutines, _updateBytesCRC32, address) \ + static_field(StubRoutines, _crc_table_adr, address) \ +diff --git a/hotspot/test/compiler/7184394/TestAESBase.java b/hotspot/test/compiler/7184394/TestAESBase.java +index 
5c3e6881e..afda2a1f7 100644 +--- a/hotspot/test/compiler/7184394/TestAESBase.java ++++ b/hotspot/test/compiler/7184394/TestAESBase.java +@@ -106,8 +106,8 @@ abstract public class TestAESBase { + cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); + dCipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); + +- // CBC init +- if (mode.equals("CBC")) { ++ // CBC or CTR init ++ if (mode.equals("CBC") || mode.equals("CTR")) { + IvParameterSpec initVector = new IvParameterSpec(iv); + cipher.init(Cipher.ENCRYPT_MODE, key, initVector); + algParams = cipher.getParameters(); +diff --git a/hotspot/test/compiler/7184394/TestAESMain.java b/hotspot/test/compiler/7184394/TestAESMain.java +index ddd8eeaef..65949420a 100644 +--- a/hotspot/test/compiler/7184394/TestAESMain.java ++++ b/hotspot/test/compiler/7184394/TestAESMain.java +@@ -48,6 +48,13 @@ + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencOutputOffset=1 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DdecOutputOffset=1 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain ++ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=CTR -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain + * + * @author Tom Deneau + */ +diff --git a/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java b/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java +index aea9336c9..c2bd38a71 100644 +--- a/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java ++++ b/jdk/src/share/classes/com/sun/crypto/provider/CounterMode.java +@@ -39,10 +39,10 @@ import java.security.InvalidKeyException; + * @author Andreas Sterbenz + * @since 1.4.2 + */ +-final class CounterMode extends FeedbackCipher { ++class CounterMode extends FeedbackCipher { + + // current counter value +- private final byte[] counter; ++ final byte[] counter; + + // encrypted bytes of the previous counter value + private final byte[] encryptedCounter; +@@ -137,7 +137,7 @@ final class CounterMode extends FeedbackCipher { + * cipherOffset. 
+ * + * @param in the buffer with the input data to be encrypted +- * @param inOffset the offset in plain ++ * @param inOff the offset in plain + * @param len the length of the input data + * @param out the buffer for the result + * @param outOff the offset in cipher +@@ -176,6 +176,11 @@ final class CounterMode extends FeedbackCipher { + RangeUtil.nullAndBoundsCheck(in, inOff, len); + RangeUtil.nullAndBoundsCheck(out, outOff, len); + ++ return implCrypt(in, inOff, len, out, outOff); ++ } ++ ++ // Implementation of crypt() method. Possibly replaced with a compiler intrinsic. ++ private int implCrypt(byte[] in, int inOff, int len, byte[] out, int outOff) { + int result = len; + while (len-- > 0) { + if (used >= blockSize) { +diff --git a/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java b/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java +index f8a3eaa0a..6a394e448 100644 +--- a/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java ++++ b/jdk/src/share/classes/com/sun/crypto/provider/GCTR.java +@@ -1,5 +1,5 @@ + /* +- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. ++ * Copyright (c) 2013, 2017 Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it +@@ -29,52 +29,43 @@ + + package com.sun.crypto.provider; + +-import java.security.*; +-import javax.crypto.*; ++import javax.crypto.IllegalBlockSizeException; + import static com.sun.crypto.provider.AESConstants.AES_BLOCK_SIZE; + + /** + * This class represents the GCTR function defined in NIST 800-38D +- * under section 6.5. It needs to be constructed w/ an initialized +- * cipher object, and initial counter block(ICB). Given an input X +- * of arbitrary length, it processes and returns an output which has +- * the same length as X. The invariants of this class are: +- * +- * (1) The length of intialCounterBlk (and also of its clones, e.g., +- * fields counter and counterSave) is equal to AES_BLOCK_SIZE. +- * +- * (2) After construction, the field counter never becomes null, it +- * always contains a byte array of length AES_BLOCK_SIZE. ++ * under section 6.5. With a given cipher object and initial counter ++ * block, a counter mode operation is performed. Blocksize is limited ++ * to 16 bytes. + * + * If any invariant is broken, failures can occur because the + * AESCrypt.encryptBlock method can be intrinsified on the HotSpot VM + * (see JDK-8067648 for details). + * ++ * The counter mode operations can be intrinsified and parallelized ++ * by using CounterMode.implCrypt() if the HotSpot VM supports it on the ++ * architecture. ++ * + *

This function is used in the implementation of GCM mode. + * + * @since 1.8 + */ +-final class GCTR { +- +- // these fields should not change after the object has been constructed +- private final SymmetricCipher aes; +- private final byte[] icb; +- +- // the current counter value +- private byte[] counter; ++final class GCTR extends CounterMode { + +- // needed for save/restore calls +- private byte[] counterSave = null; +- +- // NOTE: cipher should already be initialized + GCTR(SymmetricCipher cipher, byte[] initialCounterBlk) { +- this.aes = cipher; ++ super(cipher); + if (initialCounterBlk.length != AES_BLOCK_SIZE) { + throw new RuntimeException("length of initial counter block (" + initialCounterBlk.length + + ") not equal to AES_BLOCK_SIZE (" + AES_BLOCK_SIZE + ")"); + } +- this.icb = initialCounterBlk; +- this.counter = icb.clone(); ++ ++ iv = initialCounterBlk; ++ reset(); ++ } ++ ++ @Override ++ String getFeedback() { ++ return "GCTR"; + } + + // input must be multiples of 128-bit blocks when calling update +@@ -89,23 +80,11 @@ final class GCTR { + throw new RuntimeException("output buffer too small"); + } + +- byte[] encryptedCntr = new byte[AES_BLOCK_SIZE]; +- +- int numOfCompleteBlocks = inLen / AES_BLOCK_SIZE; +- for (int i = 0; i < numOfCompleteBlocks; i++) { +- aes.encryptBlock(counter, 0, encryptedCntr, 0); +- for (int n = 0; n < AES_BLOCK_SIZE; n++) { +- int index = (i * AES_BLOCK_SIZE + n); +- out[outOfs + index] = +- (byte) ((in[inOfs + index] ^ encryptedCntr[n])); +- } +- GaloisCounterMode.increment32(counter); +- } +- return inLen; ++ return encrypt(in, inOfs, inLen, out, outOfs); + } + + // input can be arbitrary size when calling doFinal +- protected int doFinal(byte[] in, int inOfs, int inLen, byte[] out, ++ int doFinal(byte[] in, int inOfs, int inLen, byte[] out, + int outOfs) throws IllegalBlockSizeException { + try { + if (inLen < 0) { +@@ -118,7 +97,7 @@ final class GCTR { + if (lastBlockSize != 0) { + // do the last partial block + byte[] encryptedCntr = new byte[AES_BLOCK_SIZE]; +- aes.encryptBlock(counter, 0, encryptedCntr, 0); ++ embeddedCipher.encryptBlock(counter, 0, encryptedCntr, 0); + for (int n = 0; n < lastBlockSize; n++) { + out[outOfs + completeBlkLen + n] = + (byte) ((in[inOfs + completeBlkLen + n] ^ +@@ -131,28 +110,4 @@ final class GCTR { + } + return inLen; + } +- +- /** +- * Resets the content of this object to when it's first constructed. +- */ +- void reset() { +- System.arraycopy(icb, 0, counter, 0, icb.length); +- counterSave = null; +- } +- +- /** +- * Save the current content of this object. +- */ +- void save() { +- this.counterSave = this.counter.clone(); +- } +- +- /** +- * Restores the content of this object to the previous saved one. 
+- */ +- void restore() { +- if (this.counterSave != null) { +- this.counter = this.counterSave; +- } +- } + } +diff --git a/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java b/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java +index dc42e6bbf..78f0723d7 100644 +--- a/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java ++++ b/jdk/src/share/classes/com/sun/crypto/provider/GHASH.java +@@ -122,10 +122,10 @@ final class GHASH { + + } + +- /* subkeyH and state are stored in long[] for GHASH intrinsic use */ ++ /* subkeyHtbl and state are stored in long[] for GHASH intrinsic use */ + +- // hash subkey H; should not change after the object has been constructed +- private final long[] subkeyH; ++ // hashtable subkeyHtbl; holds 2*9 powers of subkeyH computed using carry-less multiplication ++ private long[] subkeyHtbl; + + // buffer for storing hash + private final long[] state; +@@ -147,9 +147,9 @@ final class GHASH { + throw new ProviderException("Internal error"); + } + state = new long[2]; +- this.subkeyH = new long[2]; +- this.subkeyH[0] = getLong(subkeyH, 0); +- this.subkeyH[1] = getLong(subkeyH, 8); ++ subkeyHtbl = new long[2*9]; ++ subkeyHtbl[0] = getLong(subkeyH, 0); ++ subkeyHtbl[1] = getLong(subkeyH, 8); + } + + /** +@@ -192,8 +192,8 @@ final class GHASH { + if (inLen == 0) { + return; + } +- ghashRangeCheck(in, inOfs, inLen, state, subkeyH); +- processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyH); ++ ghashRangeCheck(in, inOfs, inLen, state, subkeyHtbl); ++ processBlocks(in, inOfs, inLen/AES_BLOCK_SIZE, state, subkeyHtbl); + } + + private static void ghashRangeCheck(byte[] in, int inOfs, int inLen, long[] st, long[] subH) { +@@ -217,8 +217,8 @@ final class GHASH { + throw new RuntimeException("internal state has invalid length: " + + st.length); + } +- if (subH.length != 2) { +- throw new RuntimeException("internal subkeyH has invalid length: " + ++ if (subH.length != 18) { ++ throw new RuntimeException("internal subkeyHtbl has invalid length: " + + subH.length); + } + } +diff --git a/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java b/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java +index ab93e3097..dd2618455 100644 +--- a/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java ++++ b/jdk/src/share/classes/sun/security/ssl/SSLSocketImpl.java +@@ -439,6 +439,8 @@ public final class SSLSocketImpl + if (!conContext.isNegotiated) { + readHandshakeRecord(); + } ++ } catch (InterruptedIOException iioe) { ++ handleException(iioe); + } catch (IOException ioe) { + throw conContext.fatal(Alert.HANDSHAKE_FAILURE, + "Couldn't kickstart handshaking", ioe); +@@ -1309,12 +1311,11 @@ public final class SSLSocketImpl + } + } catch (SSLException ssle) { + throw ssle; ++ } catch (InterruptedIOException iioe) { ++ // don't change exception in case of timeouts or interrupts ++ throw iioe; + } catch (IOException ioe) { +- if (!(ioe instanceof SSLException)) { +- throw new SSLException("readHandshakeRecord", ioe); +- } else { +- throw ioe; +- } ++ throw new SSLException("readHandshakeRecord", ioe); + } + } + +@@ -1375,6 +1376,9 @@ public final class SSLSocketImpl + } + } catch (SSLException ssle) { + throw ssle; ++ } catch (InterruptedIOException iioe) { ++ // don't change exception in case of timeouts or interrupts ++ throw iioe; + } catch (IOException ioe) { + if (!(ioe instanceof SSLException)) { + throw new SSLException("readApplicationRecord", ioe); +diff --git a/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java 
b/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java +index 401822759..ab5712acc 100644 +--- a/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java ++++ b/jdk/src/share/classes/sun/security/ssl/SSLSocketInputRecord.java +@@ -26,6 +26,7 @@ + package sun.security.ssl; + + import java.io.EOFException; ++import java.io.InterruptedIOException; + import java.io.IOException; + import java.io.InputStream; + import java.io.OutputStream; +@@ -47,37 +48,31 @@ import sun.security.ssl.SSLCipher.SSLReadCipher; + final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + private InputStream is = null; + private OutputStream os = null; +- private final byte[] temporary = new byte[1024]; ++ private final byte[] header = new byte[headerSize]; ++ private int headerOff = 0; ++ // Cache for incomplete record body. ++ private ByteBuffer recordBody = ByteBuffer.allocate(1024); + + private boolean formatVerified = false; // SSLv2 ruled out? + + // Cache for incomplete handshake messages. + private ByteBuffer handshakeBuffer = null; + +- private boolean hasHeader = false; // Had read the record header +- + SSLSocketInputRecord(HandshakeHash handshakeHash) { + super(handshakeHash, SSLReadCipher.nullTlsReadCipher()); + } + + @Override + int bytesInCompletePacket() throws IOException { +- if (!hasHeader) { +- // read exactly one record +- try { +- int really = read(is, temporary, 0, headerSize); +- if (really < 0) { +- // EOF: peer shut down incorrectly +- return -1; +- } +- } catch (EOFException eofe) { +- // The caller will handle EOF. +- return -1; +- } +- hasHeader = true; ++ // read header ++ try { ++ readHeader(); ++ } catch (EOFException eofe) { ++ // The caller will handle EOF. ++ return -1; + } + +- byte byteZero = temporary[0]; ++ byte byteZero = header[0]; + int len = 0; + + /* +@@ -93,9 +88,9 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + * Last sanity check that it's not a wild record + */ + if (!ProtocolVersion.isNegotiable( +- temporary[1], temporary[2], false)) { ++ header[1], header[2], false)) { + throw new SSLException("Unrecognized record version " + +- ProtocolVersion.nameOf(temporary[1], temporary[2]) + ++ ProtocolVersion.nameOf(header[1], header[2]) + + " , plaintext connection?"); + } + +@@ -109,8 +104,8 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + /* + * One of the SSLv3/TLS message types. + */ +- len = ((temporary[3] & 0xFF) << 8) + +- (temporary[4] & 0xFF) + headerSize; ++ len = ((header[3] & 0xFF) << 8) + ++ (header[4] & 0xFF) + headerSize; + } else { + /* + * Must be SSLv2 or something unknown. +@@ -121,11 +116,11 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + */ + boolean isShort = ((byteZero & 0x80) != 0); + +- if (isShort && ((temporary[2] == 1) || (temporary[2] == 4))) { ++ if (isShort && ((header[2] == 1) || (header[2] == 4))) { + if (!ProtocolVersion.isNegotiable( +- temporary[3], temporary[4], false)) { ++ header[3], header[4], false)) { + throw new SSLException("Unrecognized record version " + +- ProtocolVersion.nameOf(temporary[3], temporary[4]) + ++ ProtocolVersion.nameOf(header[3], header[4]) + + " , plaintext connection?"); + } + +@@ -138,9 +133,9 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + // + // int mask = (isShort ? 0x7F : 0x3F); + // len = ((byteZero & mask) << 8) + +- // (temporary[1] & 0xFF) + (isShort ? 2 : 3); ++ // (header[1] & 0xFF) + (isShort ? 
2 : 3); + // +- len = ((byteZero & 0x7F) << 8) + (temporary[1] & 0xFF) + 2; ++ len = ((byteZero & 0x7F) << 8) + (header[1] & 0xFF) + 2; + } else { + // Gobblygook! + throw new SSLException( +@@ -160,34 +155,41 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + return null; + } + +- if (!hasHeader) { +- // read exactly one record +- int really = read(is, temporary, 0, headerSize); +- if (really < 0) { +- throw new EOFException("SSL peer shut down incorrectly"); +- } +- hasHeader = true; +- } ++ // read header ++ readHeader(); + +- Plaintext plaintext = null; +- if (!formatVerified) { +- formatVerified = true; ++ Plaintext[] plaintext = null; ++ boolean cleanInBuffer = true; ++ try { ++ if (!formatVerified) { ++ formatVerified = true; + +- /* +- * The first record must either be a handshake record or an +- * alert message. If it's not, it is either invalid or an +- * SSLv2 message. +- */ +- if ((temporary[0] != ContentType.HANDSHAKE.id) && +- (temporary[0] != ContentType.ALERT.id)) { +- hasHeader = false; +- return handleUnknownRecord(temporary); ++ /* ++ * The first record must either be a handshake record or an ++ * alert message. If it's not, it is either invalid or an ++ * SSLv2 message. ++ */ ++ if ((header[0] != ContentType.HANDSHAKE.id) && ++ (header[0] != ContentType.ALERT.id)) { ++ plaintext = handleUnknownRecord(); ++ } + } +- } + +- // The record header should has consumed. +- hasHeader = false; +- return decodeInputRecord(temporary); ++ // The record header should have been consumed. ++ if (plaintext == null) { ++ plaintext = decodeInputRecord(); ++ } ++ } catch (InterruptedIOException e) { ++ // do not clean header and recordBody in case of socket timeout ++ cleanInBuffer = false; ++ throw e; ++ } finally { ++ if (cleanInBuffer) { ++ headerOff = 0; ++ recordBody.clear(); ++ } ++ } ++ return plaintext; + } + + @Override +@@ -200,9 +202,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + this.os = outputStream; + } + +- // Note that destination may be null +- private Plaintext[] decodeInputRecord( +- byte[] header) throws IOException, BadPaddingException { ++ private Plaintext[] decodeInputRecord() throws IOException, BadPaddingException { + byte contentType = header[0]; // pos: 0 + byte majorVersion = header[1]; // pos: 1 + byte minorVersion = header[2]; // pos: 2 +@@ -227,30 +227,27 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + } + + // +- // Read a complete record.
++ // Read a complete record and store it in recordBody. ++ // recordBody caches the incoming record so that it can be ++ // restored if a read operation times out. + // +- ByteBuffer destination = ByteBuffer.allocate(headerSize + contentLen); +- int dstPos = destination.position(); +- destination.put(temporary, 0, headerSize); +- while (contentLen > 0) { +- int howmuch = Math.min(temporary.length, contentLen); +- int really = read(is, temporary, 0, howmuch); +- if (really < 0) { +- throw new EOFException("SSL peer shut down incorrectly"); ++ if (recordBody.position() == 0) { ++ if (recordBody.capacity() < contentLen) { ++ recordBody = ByteBuffer.allocate(contentLen); + } +- +- destination.put(temporary, 0, howmuch); +- contentLen -= howmuch; ++ recordBody.limit(contentLen); ++ } else { ++ contentLen = recordBody.remaining(); + } +- destination.flip(); +- destination.position(dstPos + headerSize); ++ readFully(contentLen); ++ recordBody.flip(); + + if (SSLLogger.isOn && SSLLogger.isOn("record")) { + SSLLogger.fine( + "READ: " + + ProtocolVersion.nameOf(majorVersion, minorVersion) + + " " + ContentType.nameOf(contentType) + ", length = " + +- destination.remaining()); ++ recordBody.remaining()); + } + + // +@@ -259,7 +256,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + ByteBuffer fragment; + try { + Plaintext plaintext = +- readCipher.decrypt(contentType, destination, null); ++ readCipher.decrypt(contentType, recordBody, null); + fragment = plaintext.fragment; + contentType = plaintext.contentType; + } catch (BadPaddingException bpe) { +@@ -368,8 +365,7 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + }; + } + +- private Plaintext[] handleUnknownRecord( +- byte[] header) throws IOException, BadPaddingException { ++ private Plaintext[] handleUnknownRecord() throws IOException, BadPaddingException { + byte firstByte = header[0]; + byte thirdByte = header[2]; + +@@ -411,32 +407,29 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + } + + int msgLen = ((header[0] & 0x7F) << 8) | (header[1] & 0xFF); +- +- ByteBuffer destination = ByteBuffer.allocate(headerSize + msgLen); +- destination.put(temporary, 0, headerSize); +- msgLen -= 3; // had read 3 bytes of content as header +- while (msgLen > 0) { +- int howmuch = Math.min(temporary.length, msgLen); +- int really = read(is, temporary, 0, howmuch); +- if (really < 0) { +- throw new EOFException("SSL peer shut down incorrectly"); ++ if (recordBody.position() == 0) { ++ if (recordBody.capacity() < (headerSize + msgLen)) { ++ recordBody = ByteBuffer.allocate(headerSize + msgLen); + } +- +- destination.put(temporary, 0, howmuch); +- msgLen -= howmuch; ++ recordBody.limit(headerSize + msgLen); ++ recordBody.put(header, 0, headerSize); ++ } else { ++ msgLen = recordBody.remaining(); + } +- destination.flip(); ++ msgLen -= 3; // 3 bytes of content were already read as the header ++ readFully(msgLen); ++ recordBody.flip(); + + /* + * If we can map this into a V3 ClientHello, read and + * hash the rest of the V2 handshake, turn it into a + * V3 ClientHello message, and pass it up.
+ */ +- destination.position(2); // exclude the header +- handshakeHash.receive(destination); +- destination.position(0); ++ recordBody.position(2); // exclude the header ++ handshakeHash.receive(recordBody); ++ recordBody.position(0); + +- ByteBuffer converted = convertToClientHello(destination); ++ ByteBuffer converted = convertToClientHello(recordBody); + + if (SSLLogger.isOn && SSLLogger.isOn("packet")) { + SSLLogger.fine( +@@ -456,28 +449,42 @@ final class SSLSocketInputRecord extends InputRecord implements SSLRecord { + } + } + +- // Read the exact bytes of data, otherwise, return -1. +- private static int read(InputStream is, +- byte[] buffer, int offset, int len) throws IOException { +- int n = 0; +- while (n < len) { +- int readLen = is.read(buffer, offset + n, len - n); +- if (readLen < 0) { +- if (SSLLogger.isOn && SSLLogger.isOn("packet")) { +- SSLLogger.fine("Raw read: EOF"); +- } +- return -1; ++ // Read exactly len bytes of data, otherwise, throw IOException. ++ private int readFully(int len) throws IOException { ++ int end = len + recordBody.position(); ++ int off = recordBody.position(); ++ try { ++ while (off < end) { ++ off += read(is, recordBody.array(), off, end - off); + } ++ } finally { ++ recordBody.position(off); ++ } ++ return len; ++ } ++ ++ // Read the SSL record header, otherwise, throw IOException. ++ private int readHeader() throws IOException { ++ while (headerOff < headerSize) { ++ headerOff += read(is, header, headerOff, headerSize - headerOff); ++ } ++ return headerSize; ++ } + ++ private static int read(InputStream is, byte[] buf, int off, int len) throws IOException { ++ int readLen = is.read(buf, off, len); ++ if (readLen < 0) { + if (SSLLogger.isOn && SSLLogger.isOn("packet")) { +- ByteBuffer bb = ByteBuffer.wrap(buffer, offset + n, readLen); +- SSLLogger.fine("Raw read", bb); ++ SSLLogger.fine("Raw read: EOF"); + } +- +- n += readLen; ++ throw new EOFException("SSL peer shut down incorrectly"); + } + +- return n; ++ if (SSLLogger.isOn && SSLLogger.isOn("packet")) { ++ ByteBuffer bb = ByteBuffer.wrap(buf, off, readLen); ++ SSLLogger.fine("Raw read", bb); ++ } ++ return readLen; + } + + // Try to use up the input stream without impact the performance too much. +diff --git a/jdk/src/share/classes/sun/security/ssl/SSLTransport.java b/jdk/src/share/classes/sun/security/ssl/SSLTransport.java +index b3d03b370..78e13ea2c 100644 +--- a/jdk/src/share/classes/sun/security/ssl/SSLTransport.java ++++ b/jdk/src/share/classes/sun/security/ssl/SSLTransport.java +@@ -27,6 +27,7 @@ package sun.security.ssl; + + import java.io.EOFException; + import java.io.IOException; ++import java.io.InterruptedIOException; + import java.nio.ByteBuffer; + import javax.crypto.AEADBadTagException; + import javax.crypto.BadPaddingException; +@@ -134,6 +135,9 @@ interface SSLTransport { + } catch (EOFException eofe) { + // rethrow EOFException, the call will handle it if neede. + throw eofe; ++ } catch (InterruptedIOException iioe) { ++ // don't close the Socket in case of timeouts or interrupts. ++ throw iioe; + } catch (IOException ioe) { + throw context.fatal(Alert.UNEXPECTED_MESSAGE, ioe); + } +diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java +new file mode 100644 +index 000000000..258672f59 +--- /dev/null ++++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMBench.java +@@ -0,0 +1,128 @@ ++/* ++ * Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. ++ */ ++package org.openjdk.bench.javax.crypto.full; ++ ++import org.openjdk.jmh.annotations.Benchmark; ++import org.openjdk.jmh.annotations.Param; ++import org.openjdk.jmh.annotations.Setup; ++ ++import javax.crypto.Cipher; ++import javax.crypto.spec.GCMParameterSpec; ++import javax.crypto.spec.SecretKeySpec; ++ ++/** ++ * This performance test runs AES/GCM encryption and decryption using byte[] ++ * as input and output buffers for single and multi-part testing. ++ * ++ * This test rotates the IV and creates a new GCMParameterSpec for each encrypt ++ * benchmark operation. ++ */ ++ ++public class AESGCMBench extends CryptoBase { ++ ++ @Param({"128"}) ++ private int keyLength; ++ ++ @Param({"1024", "1500", "4096", "16384"}) ++ private int dataSize; ++ ++ byte[] encryptedData; ++ byte[] in, out; ++ private Cipher encryptCipher; ++ private Cipher decryptCipher; ++ SecretKeySpec ks; ++ GCMParameterSpec gcm_spec; ++ byte[] iv; ++ ++ private static final int IV_BUFFER_SIZE = 32; ++ private static final int IV_MODULO = IV_BUFFER_SIZE - 16; ++ int iv_index = 0; ++ int updateLen = 0; ++ ++ private int next_iv_index() { ++ int r = iv_index; ++ iv_index = (iv_index + 1) % IV_MODULO; ++ return r; ++ } ++ ++ @Setup ++ public void setup() throws Exception { ++ setupProvider(); ++ ++ // Setup key material ++ byte[] keystring = fillSecureRandom(new byte[keyLength / 8]); ++ ks = new SecretKeySpec(keystring, "AES"); ++ iv = fillSecureRandom(new byte[IV_BUFFER_SIZE]); ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ ++ // Setup Cipher classes ++ encryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ decryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters().
++ getParameterSpec(GCMParameterSpec.class)); ++ ++ // Setup input/output buffers ++ in = fillRandom(new byte[dataSize]); ++ encryptedData = new byte[encryptCipher.getOutputSize(in.length)]; ++ out = new byte[encryptedData.length]; ++ encryptCipher.doFinal(in, 0, in.length, encryptedData, 0); ++ updateLen = in.length / 2; ++ ++ } ++ ++ @Benchmark ++ public void encrypt() throws Exception { ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ encryptCipher.doFinal(in, 0, in.length, out, 0); ++ } ++ ++ @Benchmark ++ public void encryptMultiPart() throws Exception { ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ int outOfs = encryptCipher.update(in, 0, updateLen, out, 0); ++ encryptCipher.doFinal(in, updateLen, in.length - updateLen, ++ out, outOfs); ++ } ++ ++ @Benchmark ++ public void decrypt() throws Exception { ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters(). ++ getParameterSpec(GCMParameterSpec.class)); ++ decryptCipher.doFinal(encryptedData, 0, encryptedData.length, out, 0); ++ } ++ ++ @Benchmark ++ public void decryptMultiPart() throws Exception { ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters(). ++ getParameterSpec(GCMParameterSpec.class)); ++ decryptCipher.update(encryptedData, 0, updateLen, out, 0); ++ decryptCipher.doFinal(encryptedData, updateLen, ++ encryptedData.length - updateLen, out, 0); ++ } ++} +\ No newline at end of file +diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java +new file mode 100644 +index 000000000..cb6d20c51 +--- /dev/null ++++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/AESGCMByteBuffer.java +@@ -0,0 +1,163 @@ ++/* ++ * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. Oracle designates this ++ * particular file as subject to the "Classpath" exception as provided ++ * by Oracle in the LICENSE file that accompanied this code. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. 
++ */ ++package org.openjdk.bench.javax.crypto.full; ++ ++import org.openjdk.jmh.annotations.Benchmark; ++import org.openjdk.jmh.annotations.Param; ++import org.openjdk.jmh.annotations.Setup; ++ ++import javax.crypto.Cipher; ++import javax.crypto.spec.GCMParameterSpec; ++import javax.crypto.spec.SecretKeySpec; ++import java.nio.ByteBuffer; ++ ++/** ++ * This performance test runs AES/GCM encryption and decryption using heap and ++ * direct ByteBuffers as input and output buffers for single and multi-part ++ * operations. ++ * ++ * This test rotates the IV and creates a new GCMParameterSpec for each encrypt ++ * benchmark operation. ++ */ ++ ++public class AESGCMByteBuffer extends CryptoBase { ++ ++ @Param({"128"}) ++ private int keyLength; ++ ++ @Param({"1024", "1500", "4096", "16384"}) ++ private int dataSize; ++ ++ @Param({"direct", "heap"}) ++ private String dataMethod; ++ ++ byte[] data; ++ ByteBuffer encryptedData; ++ ByteBuffer in, out; ++ private Cipher encryptCipher; ++ private Cipher decryptCipher; ++ SecretKeySpec ks; ++ GCMParameterSpec gcm_spec; ++ byte[] iv; ++ ++ private static final int IV_BUFFER_SIZE = 32; ++ private static final int IV_MODULO = IV_BUFFER_SIZE - 16; ++ int iv_index = 0; ++ int updateLen = 0; ++ ++ private int next_iv_index() { ++ int r = iv_index; ++ iv_index = (iv_index + 1) % IV_MODULO; ++ return r; ++ } ++ ++ @Setup ++ public void setup() throws Exception { ++ setupProvider(); ++ ++ // Setup key material ++ byte[] keystring = fillSecureRandom(new byte[keyLength / 8]); ++ ks = new SecretKeySpec(keystring, "AES"); ++ iv = fillSecureRandom(new byte[IV_BUFFER_SIZE]); ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ ++ // Setup Cipher classes ++ encryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ decryptCipher = makeCipher(prov, "AES/GCM/NoPadding"); ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters(). ++ getParameterSpec(GCMParameterSpec.class)); ++ ++ // Setup input/output buffers ++ data = fillRandom(new byte[dataSize]); ++ if (dataMethod.equalsIgnoreCase("direct")) { ++ in = ByteBuffer.allocateDirect(data.length); ++ in.put(data); ++ in.flip(); ++ encryptedData = ByteBuffer.allocateDirect( ++ encryptCipher.getOutputSize(data.length)); ++ out = ByteBuffer.allocateDirect(encryptedData.capacity()); ++ } else if (dataMethod.equalsIgnoreCase("heap")) { ++ in = ByteBuffer.wrap(data); ++ encryptedData = ByteBuffer.allocate( ++ encryptCipher.getOutputSize(data.length)); ++ out = ByteBuffer.allocate(encryptedData.capacity()); ++ } ++ ++ encryptCipher.doFinal(in, encryptedData); ++ encryptedData.flip(); ++ in.flip(); ++ updateLen = in.remaining() / 2; ++ } ++ ++ @Benchmark ++ public void encrypt() throws Exception { ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ encryptCipher.doFinal(in, out); ++ out.flip(); ++ in.flip(); ++ } ++ ++ @Benchmark ++ public void encryptMultiPart() throws Exception { ++ gcm_spec = new GCMParameterSpec(96, iv, next_iv_index(), 16); ++ encryptCipher.init(Cipher.ENCRYPT_MODE, ks, gcm_spec); ++ in.limit(updateLen); ++ encryptCipher.update(in, out); ++ in.limit(in.capacity()); ++ encryptCipher.doFinal(in, out); ++ out.flip(); ++ in.flip(); ++ } ++ ++ @Benchmark ++ public void decrypt() throws Exception { ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters().
++ getParameterSpec(GCMParameterSpec.class)); ++ decryptCipher.doFinal(encryptedData, out); ++ encryptedData.flip(); ++ out.flip(); ++ } ++ ++ @Benchmark ++ public void decryptMultiPart() throws Exception { ++ decryptCipher.init(Cipher.DECRYPT_MODE, ks, ++ encryptCipher.getParameters(). ++ getParameterSpec(GCMParameterSpec.class)); ++ ++ int len = encryptedData.remaining(); ++ encryptedData.limit(updateLen); ++ decryptCipher.update(encryptedData, out); ++ encryptedData.limit(len); ++ ++ decryptCipher.doFinal(encryptedData, out); ++ encryptedData.flip(); ++ out.flip(); ++ } ++ ++} +\ No newline at end of file +diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java +new file mode 100644 +index 000000000..4af12703b +--- /dev/null ++++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/full/CryptoBase.java +@@ -0,0 +1,102 @@ ++/* ++ * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. 
++ */ ++package org.openjdk.bench.javax.crypto.full; ++ ++import org.openjdk.jmh.annotations.BenchmarkMode; ++import org.openjdk.jmh.annotations.Fork; ++import org.openjdk.jmh.annotations.Measurement; ++import org.openjdk.jmh.annotations.Mode; ++import org.openjdk.jmh.annotations.OutputTimeUnit; ++import org.openjdk.jmh.annotations.Param; ++import org.openjdk.jmh.annotations.Scope; ++import org.openjdk.jmh.annotations.Setup; ++import org.openjdk.jmh.annotations.State; ++import org.openjdk.jmh.annotations.Warmup; ++ ++import javax.crypto.BadPaddingException; ++import javax.crypto.Cipher; ++import javax.crypto.IllegalBlockSizeException; ++import javax.crypto.NoSuchPaddingException; ++import java.security.NoSuchAlgorithmException; ++import java.security.Provider; ++import java.security.SecureRandom; ++import java.security.Security; ++import java.util.Random; ++import java.util.concurrent.TimeUnit; ++ ++ ++@Fork(jvmArgsAppend = {"-XX:+AlwaysPreTouch"}, value = 5) ++@Warmup(iterations = 3, time = 3) ++@Measurement(iterations = 8, time = 2) ++@OutputTimeUnit(TimeUnit.SECONDS) ++@State(Scope.Thread) ++@BenchmarkMode(Mode.Throughput) ++public class CryptoBase { ++ ++ @Param({""}) ++ private String provider; ++ ++ public Provider prov = null; ++ ++ @Setup ++ public void setupProvider() { ++ if (provider != null && !provider.isEmpty()) { ++ prov = Security.getProvider(provider); ++ if (prov == null) { ++ throw new RuntimeException("Can't find provider \"" + provider + "\""); ++ } ++ } ++ } ++ ++ public static Cipher makeCipher(Provider prov, String algorithm) throws NoSuchPaddingException, NoSuchAlgorithmException { ++ return (prov == null) ? Cipher.getInstance(algorithm) : Cipher.getInstance(algorithm, prov); ++ } ++ ++ public static byte[][] fillRandom(byte[][] data) { ++ Random rnd = new Random(); ++ for (byte[] d : data) { ++ rnd.nextBytes(d); ++ } ++ return data; ++ } ++ ++ public static byte[] fillRandom(byte[] data) { ++ Random rnd = new Random(); ++ rnd.nextBytes(data); ++ return data; ++ } ++ ++ public static byte[] fillSecureRandom(byte[] data) { ++ SecureRandom rnd = new SecureRandom(); ++ rnd.nextBytes(data); ++ return data; ++ } ++ ++ public static byte[][] fillEncrypted(byte[][] data, Cipher encryptCipher) throws BadPaddingException, IllegalBlockSizeException { ++ byte[][] encryptedData = new byte[data.length][]; ++ for (int i = 0; i < encryptedData.length; i++) { ++ encryptedData[i] = encryptCipher.doFinal(data[i]); ++ } ++ return encryptedData; ++ } ++} +\ No newline at end of file +diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java +new file mode 100644 +index 000000000..a21b0c87f +--- /dev/null ++++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMBench.java +@@ -0,0 +1,36 @@ ++/* ++ * Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code).
++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. ++ */ ++package org.openjdk.bench.javax.crypto.small; ++ ++import org.openjdk.jmh.annotations.Param; ++ ++public class AESGCMBench extends ++ org.openjdk.bench.javax.crypto.full.AESGCMBench { ++ ++ @Param({"128"}) ++ private int keyLength; ++ ++ @Param({"1024"}) ++ private int dataSize; ++ ++} +\ No newline at end of file +diff --git a/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java +new file mode 100644 +index 000000000..2e389d300 +--- /dev/null ++++ b/jdk/test/micro/org/openjdk/bench/javax/crypto/small/AESGCMByteBuffer.java +@@ -0,0 +1,36 @@ ++/* ++ * Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. ++ */ ++package org.openjdk.bench.javax.crypto.small; ++ ++import org.openjdk.jmh.annotations.Param; ++ ++public class AESGCMByteBuffer extends ++ org.openjdk.bench.javax.crypto.full.AESGCMByteBuffer { ++ ++ @Param({"128"}) ++ private int keyLength; ++ ++ @Param({"1024"}) ++ private int dataSize; ++ ++} +\ No newline at end of file +diff --git a/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java b/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java +index 3eb1d7b89..7678cc71f 100644 +--- a/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java ++++ b/jdk/test/sun/security/ssl/SSLSocketImpl/ClientTimeout.java +@@ -26,8 +26,7 @@ + + /* + * @test +- * @bug 4836493 +- * @ignore need further evaluation ++ * @bug 4836493 8239798 + * @summary Socket timeouts for SSLSockets causes data corruption. 
+ * @run main/othervm ClientTimeout + */ +diff --git a/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java b/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java +index 3e626a257..5578ea725 100644 +--- a/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java ++++ b/jdk/test/sun/security/ssl/SSLSocketImpl/SSLExceptionForIOIssue.java +@@ -36,7 +36,7 @@ + + import javax.net.ssl.*; + import java.io.*; +-import java.net.InetAddress; ++import java.net.*; + + public class SSLExceptionForIOIssue implements SSLContextTemplate { + +@@ -139,7 +139,7 @@ public class SSLExceptionForIOIssue implements SSLContextTemplate { + } catch (SSLProtocolException | SSLHandshakeException sslhe) { + clientException = sslhe; + System.err.println("unexpected client exception: " + sslhe); +- } catch (SSLException ssle) { ++ } catch (SSLException | SocketTimeoutException ssle) { + // the expected exception, ignore it + System.err.println("expected client exception: " + ssle); + } catch (Exception e) { +-- +2.17.1 + diff --git a/openjdk-1.8.0.spec b/openjdk-1.8.0.spec index a8643154f3b397beb3b9b85e550b762a0cc988ad..c5b3d0136c8361f780f0bba8f5d49df2e79c07eb 100644 --- a/openjdk-1.8.0.spec +++ b/openjdk-1.8.0.spec @@ -916,7 +916,7 @@ Provides: java-%{javaver}-%{origin}-accessibility%{?1} = %{epoch}:%{version}-%{r Name: java-%{javaver}-%{origin} Version: %{javaver}.%{updatever}.%{buildver} -Release: 3 +Release: 4 # java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons # and this change was brought into RHEL-4. java-1.5.0-ibm packages # also included the epoch in their virtual provides. This created a @@ -1139,6 +1139,7 @@ Patch249: Improve_AlgorithmConstraints_checkAlgorithm_performance.patch Patch250: modify_coding_style_and_describe_error.patch Patch251: fix_wrap_memcpy_undefined_gcc10_3.patch Patch252: 8290705_fix_StringConcat_validate_mem_flow_asserts_with_unexpected_userStoreI.patch +Patch253: 8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch ############################################# # @@ -1618,6 +1619,7 @@ pushd %{top_level_dir_name} %patch250 -p1 %patch251 -p1 %patch252 -p1 +%patch253 -p1 popd # System library fixes @@ -2242,6 +2244,10 @@ cjc.mainProgram(arg) %endif %changelog + +* Thu Sep 15 2022 kuenking111 - 1:1.8.0.342-b07.4 +- add 8143925-enhancing-CounterMode.crypt-for-AESCrypt.patch + * Fri Aug 5 2022 kuenking111 - 1:1.8.0.342-b07.3 - add 8290705_fix_StringConcat_validate_mem_flow_asserts_with_unexpected_userStoreI.patch - modified version.txt
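For context on the GCTR change at the top of this patch: HotSpot's AES-CTR stub intrinsifies CounterMode.crypt()/implCrypt(), so GCTR only benefits from the new stubs once it extends CounterMode and routes update() through the inherited encrypt() loop. The sketch below is a minimal, self-contained illustration of that counter-mode loop, not the JDK implementation: the class name CtrSketch is made up, and the System.arraycopy placeholder stands in for the AES block encryption (SymmetricCipher.encryptBlock in the real code).

public class CtrSketch {
    static final int BLOCK = 16; // AES block size in bytes

    // Big-endian increment of the low 32 bits of the counter block,
    // mirroring what GaloisCounterMode.increment32 does for GCTR.
    static void increment32(byte[] counter) {
        for (int i = counter.length - 1; i >= counter.length - 4; i--) {
            if (++counter[i] != 0) {
                break; // stop once a byte does not wrap around to zero
            }
        }
    }

    // Conceptual CTR loop: derive a keystream block from the counter, XOR it
    // with the input, then bump the counter. In the real code the arraycopy
    // below is an AES encryption of the counter block, and this whole loop is
    // what the CounterMode.implCrypt intrinsic replaces with stub code.
    static int crypt(byte[] in, int inOfs, int len,
                     byte[] out, int outOfs, byte[] counter) {
        byte[] keystream = new byte[BLOCK];
        for (int done = 0; done < len; done += BLOCK) {
            System.arraycopy(counter, 0, keystream, 0, BLOCK); // placeholder, not AES
            int n = Math.min(BLOCK, len - done);
            for (int j = 0; j < n; j++) {
                out[outOfs + done + j] = (byte) (in[inOfs + done + j] ^ keystream[j]);
            }
            increment32(counter);
        }
        return len;
    }
}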