From 1abe9b3b43101067b004512d962580e966480eec Mon Sep 17 00:00:00 2001 From: hu_bo_dao Date: Fri, 11 Jun 2021 09:48:31 +0800 Subject: [PATCH 1/6] I3V9SK: jmap support CMS parallel inspection --- openjdk-1.8.0.spec | 8 +- support_CMS_parallel_inspection.patch | 298 ++++++++++++++++++++++++++ 2 files changed, 304 insertions(+), 2 deletions(-) create mode 100755 support_CMS_parallel_inspection.patch diff --git a/openjdk-1.8.0.spec b/openjdk-1.8.0.spec index 5d5277e..83df46b 100644 --- a/openjdk-1.8.0.spec +++ b/openjdk-1.8.0.spec @@ -918,7 +918,7 @@ Provides: java-%{javaver}-%{origin}-accessibility%{?1} = %{epoch}:%{version}-%{r Name: java-%{javaver}-%{origin} Version: %{javaver}.%{updatever}.%{buildver} -Release: 9 +Release: 10 # java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons # and this change was brought into RHEL-4. java-1.5.0-ibm packages # also included the epoch in their virtual provides. This created a @@ -1105,6 +1105,7 @@ Patch191: 8264640.patch Patch192: add_kae_implementation_add_default_conf_file.patch Patch193: improve_algorithmConstraints_checkAlgorithm_performance.patch Patch194: modify_the_default_iteration_time_and_forks_in_the_JMH_of_KAEProvider.patch +Patch195: support_CMS_parallel_inspection.patch ############################################# # @@ -1558,7 +1559,7 @@ pushd %{top_level_dir_name} %patch191 -p1 %patch192 -p1 %patch194 -p1 - +%patch195 -p1 popd # System library fixes @@ -2175,6 +2176,9 @@ require "copy_jdk_configs.lua" %endif %changelog +* Wed Jun 10 2021 hu_bo_dao - 1:1.8.0.292-b10.10 +- add support_CMS_parallel_inspection.patch + * Wed Jun 9 2021 noah - 1:1.8.0.292-b10.9 - add modify_the_default_iteration_time_and_forks_in_the_JMH_of_KAEProvider.patch diff --git a/support_CMS_parallel_inspection.patch b/support_CMS_parallel_inspection.patch new file mode 100755 index 0000000..a5a5b9d --- /dev/null +++ b/support_CMS_parallel_inspection.patch @@ -0,0 +1,298 @@ +commit 6128a6c319f9d10c604bf7d4049ef68b7fd11b27 +Author: hubodao +Date: Tue Jun 8 07:37:02 2021 +0000 + + support CMS Parallel inspection + +diff --git a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp +index 53b75a4ca..3c3deab28 100644 +--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp ++++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp +@@ -2871,6 +2871,47 @@ void ConcurrentMarkSweepGeneration::gc_epilogue(bool full) { + } + } + ++size_t ConcurrentMarkSweepGeneration::num_iterable_blocks() const ++{ ++ return (used_stable() + CMSIterateBlockSize - 1) / CMSIterateBlockSize; ++} ++ ++void ConcurrentMarkSweepGeneration::object_iterate_block(ObjectClosure *cl, size_t block_index) ++{ ++ size_t block_word_size = CMSIterateBlockSize / HeapWordSize; ++ MemRegion span = MemRegion(cmsSpace()->bottom() + block_index * block_word_size, ++ cmsSpace()->bottom() + (block_index + 1) * block_word_size); ++ if (!span.is_empty()) { // Non-null task ++ HeapWord *prev_obj; ++ if (block_index == 0) { ++ prev_obj = span.start(); ++ } else { ++ prev_obj = cmsSpace()->block_start_careful(span.start()); ++ while (prev_obj < span.start()) { ++ size_t sz = cmsSpace()->block_size_no_stall(prev_obj, _collector); ++ if (sz > 0) { ++ prev_obj += sz; ++ } else { ++ break; ++ } ++ } ++ } ++ if (prev_obj < span.end()) { ++ HeapWord *cur, *limit; ++ size_t curSize; ++ for (cur = 
prev_obj, limit = span.end(); cur < limit; cur += curSize) { ++ curSize = cmsSpace()->block_size_no_stall(cur, _collector); ++ if (curSize == 0) { ++ break; ++ } ++ if (cmsSpace()->block_is_obj(cur)) { ++ cl->do_object(oop(cur)); ++ } ++ } ++ } ++ } ++} ++ + void ConcurrentMarkSweepGeneration::gc_epilogue_work(bool full) { + assert(!incremental_collection_failed(), "Should have been cleared"); + cmsSpace()->setPreconsumptionDirtyCardClosure(NULL); +diff --git a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp +index ca3fee21b..7d05410fe 100644 +--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp ++++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.hpp +@@ -1154,9 +1154,10 @@ class ConcurrentMarkSweepGeneration: public CardGeneration { + + // Adaptive size policy + CMSAdaptiveSizePolicy* size_policy(); +- ++ static const size_t CMSIterateBlockSize = 1024 * 1024; + void set_did_compact(bool v) { _did_compact = v; } +- ++ virtual size_t num_iterable_blocks() const; ++ virtual void object_iterate_block(ObjectClosure *cl, size_t block_index); + bool refs_discovery_is_atomic() const { return false; } + bool refs_discovery_is_mt() const { + // Note: CMS does MT-discovery during the parallel-remark +diff --git a/hotspot/src/share/vm/memory/genCollectedHeap.cpp b/hotspot/src/share/vm/memory/genCollectedHeap.cpp +index ed2c0afb7..20fbbfd8e 100644 +--- a/hotspot/src/share/vm/memory/genCollectedHeap.cpp ++++ b/hotspot/src/share/vm/memory/genCollectedHeap.cpp +@@ -1272,6 +1272,73 @@ void GenCollectedHeap::print_heap_change(size_t prev_used) const { + } + } + ++// The CMSHeapBlockClaimer is used during parallel iteration over the heap, ++// allowing workers to claim heap areas ("blocks"), gaining exclusive rights to these. ++// The eden and survivor spaces are treated as single blocks as it is hard to divide ++// these spaces. ++// The old space is divided into fixed-size blocks. ++class CMSHeapBlockClaimer : public StackObj { ++ size_t _claimed_index; ++ ++public: ++ static const size_t InvalidIndex = SIZE_MAX; ++ static const size_t EdenIndex = 0; ++ static const size_t SurvivorIndex = 1; ++ static const size_t NumNonOldGenClaims = 2; ++ ++ CMSHeapBlockClaimer() : _claimed_index(EdenIndex) { } ++ // Claim the block and get the block index. ++ size_t claim_and_get_block() ++ { ++ size_t block_index; ++ block_index = Atomic::add(1u, reinterpret_cast(&_claimed_index)) - 1; ++ Generation *old_gen = GenCollectedHeap::heap()->get_gen(1); ++ size_t num_claims = old_gen->num_iterable_blocks() + NumNonOldGenClaims; ++ return block_index < num_claims ? 
block_index : InvalidIndex; ++ } ++ ~CMSHeapBlockClaimer() {} ++}; ++ ++void GenCollectedHeap::object_iterate_parallel(ObjectClosure *cl, CMSHeapBlockClaimer *claimer) ++{ ++ size_t block_index = claimer->claim_and_get_block(); ++ DefNewGeneration *def_new_gen = (DefNewGeneration*) get_gen(0); ++ // Iterate until all blocks are claimed ++ if (block_index == CMSHeapBlockClaimer::EdenIndex) { ++ def_new_gen->eden()->object_iterate(cl); ++ block_index = claimer->claim_and_get_block(); ++ } ++ if (block_index == CMSHeapBlockClaimer::SurvivorIndex) { ++ def_new_gen->from()->object_iterate(cl); ++ def_new_gen->to()->object_iterate(cl); ++ block_index = claimer->claim_and_get_block(); ++ } ++ while (block_index != CMSHeapBlockClaimer::InvalidIndex) { ++ get_gen(1)->object_iterate_block(cl, block_index - CMSHeapBlockClaimer::NumNonOldGenClaims); ++ block_index = claimer->claim_and_get_block(); ++ } ++} ++ ++class GenParallelObjectIterator : public ParallelObjectIterator { ++private: ++ GenCollectedHeap *_heap; ++ CMSHeapBlockClaimer _claimer; ++ ++public: ++ GenParallelObjectIterator(uint thread_num) : _heap(GenCollectedHeap::heap()),_claimer(){} ++ ++ virtual void object_iterate(ObjectClosure *cl, uint worker_id) ++ { ++ _heap->object_iterate_parallel(cl, &_claimer); ++ } ++ ~GenParallelObjectIterator() {} ++}; ++ ++ParallelObjectIterator* GenCollectedHeap::parallel_object_iterator(uint thread_num) ++{ ++ return new GenParallelObjectIterator(thread_num); ++} ++ + class GenGCPrologueClosure: public GenCollectedHeap::GenClosure { + private: + bool _full; +@@ -1415,6 +1482,7 @@ void GenCollectedHeap::stop() { + #endif + } + +-void GenCollectedHeap::run_task(AbstractGangTask *task) { +- ++void GenCollectedHeap::run_task(AbstractGangTask *task) ++{ ++ workers()->run_task(task); + } +diff --git a/hotspot/src/share/vm/memory/genCollectedHeap.hpp b/hotspot/src/share/vm/memory/genCollectedHeap.hpp +index 2c78ea15a..9e5405e28 100644 +--- a/hotspot/src/share/vm/memory/genCollectedHeap.hpp ++++ b/hotspot/src/share/vm/memory/genCollectedHeap.hpp +@@ -30,6 +30,7 @@ + #include "memory/generation.hpp" + #include "memory/sharedHeap.hpp" + ++class CMSHeapBlockClaimer; + class SubTasksDone; + + // A "GenCollectedHeap" is a SharedHeap that uses generational +@@ -213,7 +214,14 @@ public: + // Iteration functions. + void oop_iterate(ExtendedOopClosure* cl); + void object_iterate(ObjectClosure* cl); ++ virtual ParallelObjectIterator* parallel_object_iterator(uint thread_num); ++ // Iteration functions. 
++ void object_iterate_parallel(ObjectClosure *cl, CMSHeapBlockClaimer *claimer); + void safe_object_iterate(ObjectClosure* cl); ++ virtual FlexibleWorkGang* get_safepoint_workers() ++ { ++ return workers(); ++ } + Space* space_containing(const void* addr) const; + + // A CollectedHeap is divided into a dense sequence of "blocks"; that is, +diff --git a/hotspot/src/share/vm/memory/generation.cpp b/hotspot/src/share/vm/memory/generation.cpp +index dc4ac0869..9d6c926e1 100644 +--- a/hotspot/src/share/vm/memory/generation.cpp ++++ b/hotspot/src/share/vm/memory/generation.cpp +@@ -103,6 +103,12 @@ void Generation::ref_processor_init() { + } + } + ++size_t Generation::num_iterable_blocks() const ++{ ++ return 0; ++} ++void Generation::object_iterate_block(ObjectClosure *cl, size_t block_index){}; ++ + void Generation::print() const { print_on(tty); } + + void Generation::print_on(outputStream* st) const { +diff --git a/hotspot/src/share/vm/memory/generation.hpp b/hotspot/src/share/vm/memory/generation.hpp +index ef5457890..eeb9fa691 100644 +--- a/hotspot/src/share/vm/memory/generation.hpp ++++ b/hotspot/src/share/vm/memory/generation.hpp +@@ -175,7 +175,8 @@ class Generation: public CHeapObj { + // Returns the total number of bytes available in a generation + // for the allocation of objects. + virtual size_t max_capacity() const; +- ++ virtual size_t num_iterable_blocks() const; ++ virtual void object_iterate_block(ObjectClosure *cl, size_t block_index); + // If this is a young generation, the maximum number of bytes that can be + // allocated in this generation before a GC is triggered. + virtual size_t capacity_before_gc() const { return 0; } +diff --git a/jdk/src/share/classes/sun/tools/jmap/JMap.java b/jdk/src/share/classes/sun/tools/jmap/JMap.java +index e891b6c55..2cb5a5c10 100644 +--- a/jdk/src/share/classes/sun/tools/jmap/JMap.java ++++ b/jdk/src/share/classes/sun/tools/jmap/JMap.java +@@ -220,20 +220,24 @@ public class JMap { + + private static void histo(String pid, String options) throws IOException { + VirtualMachine vm = attach(pid); +- String parallel = null; + String liveopt = "-all"; +- if (options.startsWith("live")) { +- liveopt = "-live"; +- } +- String[] subopts = options.split(","); ++ String parallel = null; ++ String subopts[] = options.split(","); + for (int i = 0; i < subopts.length; i++) { + String subopt = subopts[i]; +- if (subopt.startsWith("parallel=")) { ++ if (subopt.equals("") || subopt.equals("all")) { ++ // pass ++ } else if (subopt.equals("live")) { ++ liveopt = "-live"; ++ } else if (subopt.startsWith("parallel=")) { + parallel = subopt.substring("parallel=".length()); + if (parallel == null) { + System.err.println("Fail: no number provided in option: '" + subopt + "'"); +- System.exit(1); ++ usage(1); + } ++ } else { ++ System.err.println("Fail: invalid option: '" + subopt + "'"); ++ usage(1); + } + } + InputStream in = ((HotSpotVirtualMachine)vm).heapHisto(liveopt,parallel); +diff --git a/jdk/test/sun/tools/jmap/ParallelInspection.sh b/jdk/test/sun/tools/jmap/ParallelInspection.sh +index 69e51a76f..b4add98c0 100644 +--- a/jdk/test/sun/tools/jmap/ParallelInspection.sh ++++ b/jdk/test/sun/tools/jmap/ParallelInspection.sh +@@ -76,4 +76,36 @@ set -e + stopApplication "${PORTFILE}" + waitForApplication + ++# parallel num in CMS GC ++# Start application and use PORTFILE for coordination ++PORTFILE="${TESTCLASSES}"/shutdown.port ++startApplication SimpleApplication "${PORTFILE}" defineGC UseConcMarkSweepGC ++ ++# all return statuses are checked in this test 
++set +e ++ ++failed=0 ++ ++${JMAP} -J-XX:+UsePerfData -histo:parallel=0 $appJavaPid ++if [ $? != 0 ]; then failed=1; fi ++ ++${JMAP} -J-XX:+UsePerfData -histo:parallel=1 $appJavaPid ++if [ $? != 0 ]; then failed=1; fi ++ ++${JMAP} -J-XX:+UsePerfData -histo:parallel=2 $appJavaPid ++if [ $? != 0 ]; then failed=1; fi ++ ++${JMAP} -J-XX:+UsePerfData -histo:live,parallel=0 $appJavaPid ++if [ $? != 0 ]; then failed=1; fi ++ ++${JMAP} -J-XX:+UsePerfData -histo:live,parallel=1 $appJavaPid ++if [ $? != 0 ]; then failed=1; fi ++ ++${JMAP} -J-XX:+UsePerfData -histo:live,parallel=2 $appJavaPid ++if [ $? != 0 ]; then failed=1; fi ++set -e ++ ++stopApplication "${PORTFILE}" ++waitForApplication ++ + exit $failed -- Gitee From f2f0467619a51bd85d9c6d608e584968b6cbce42 Mon Sep 17 00:00:00 2001 From: kuenking111 Date: Sat, 12 Jun 2021 11:42:36 +0800 Subject: [PATCH 2/6] I3VF9L: G1 GC adds support for NUMA --- g1gc-numa-aware-Implementation.patch | 3566 ++++++++++++++++++++++++++ openjdk-1.8.0.spec | 7 +- 2 files changed, 3572 insertions(+), 1 deletion(-) create mode 100755 g1gc-numa-aware-Implementation.patch diff --git a/g1gc-numa-aware-Implementation.patch b/g1gc-numa-aware-Implementation.patch new file mode 100755 index 0000000..d9001e2 --- /dev/null +++ b/g1gc-numa-aware-Implementation.patch @@ -0,0 +1,3566 @@ +commit 63c022739be1810316e2504f4abeaa4ca144ef46 +Author: hubodao +Date: Tue Jun 8 07:44:36 2021 +0000 + + numa-aware implementation + +diff --git a/hotspot/src/os/bsd/vm/os_bsd.cpp b/hotspot/src/os/bsd/vm/os_bsd.cpp +index 3e4d8c7e6..340334c47 100644 +--- a/hotspot/src/os/bsd/vm/os_bsd.cpp ++++ b/hotspot/src/os/bsd/vm/os_bsd.cpp +@@ -2290,6 +2290,10 @@ size_t os::numa_get_leaf_groups(int *ids, size_t size) { + return 0; + } + ++int os::numa_get_group_id_for_address(const void* address) { ++ return 0; ++} ++ + bool os::get_page_info(char *start, page_info* info) { + return false; + } +diff --git a/hotspot/src/os/linux/vm/os_linux.cpp b/hotspot/src/os/linux/vm/os_linux.cpp +index 621316b99..f700335a3 100644 +--- a/hotspot/src/os/linux/vm/os_linux.cpp ++++ b/hotspot/src/os/linux/vm/os_linux.cpp +@@ -2908,6 +2908,19 @@ int os::numa_get_group_id() { + return 0; + } + ++int os::numa_get_group_id_for_address(const void* address) { ++ void** pages = const_cast(&address); ++ int id = -1; ++ ++ if (os::Linux::numa_move_pages(0, 1, pages, NULL, &id, 0) == -1) { ++ return -1; ++ } ++ if (id < 0) { ++ return -1; ++ } ++ return id; ++} ++ + int os::Linux::get_existing_num_nodes() { + size_t node; + size_t highest_node_number = Linux::numa_max_node(); +@@ -2930,7 +2943,7 @@ size_t os::numa_get_leaf_groups(int *ids, size_t size) { + // not always consecutively available, i.e. available from 0 to the highest + // node number. 
+ for (size_t node = 0; node <= highest_node_number; node++) { +- if (Linux::isnode_in_configured_nodes(node)) { ++ if (Linux::isnode_in_bound_nodes(node)) { + ids[i++] = node; + } + } +@@ -3023,11 +3036,21 @@ bool os::Linux::libnuma_init() { + libnuma_dlsym(handle, "numa_bitmask_isbitset"))); + set_numa_distance(CAST_TO_FN_PTR(numa_distance_func_t, + libnuma_dlsym(handle, "numa_distance"))); ++ set_numa_get_membind(CAST_TO_FN_PTR(numa_get_membind_func_t, ++ libnuma_v2_dlsym(handle, "numa_get_membind"))); ++ set_numa_get_interleave_mask(CAST_TO_FN_PTR(numa_get_interleave_mask_func_t, ++ libnuma_v2_dlsym(handle, "numa_get_interleave_mask"))); ++ set_numa_move_pages(CAST_TO_FN_PTR(numa_move_pages_func_t, ++ libnuma_dlsym(handle, "numa_move_pages"))); ++ set_numa_run_on_node(CAST_TO_FN_PTR(numa_run_on_node_func_t, ++ libnuma_dlsym(handle, "numa_run_on_node"))); + + if (numa_available() != -1) { + set_numa_all_nodes((unsigned long*)libnuma_dlsym(handle, "numa_all_nodes")); + set_numa_all_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_all_nodes_ptr")); + set_numa_nodes_ptr((struct bitmask **)libnuma_dlsym(handle, "numa_nodes_ptr")); ++ set_numa_interleave_bitmask(_numa_get_interleave_mask()); ++ set_numa_membind_bitmask(_numa_get_membind()); + // Create an index -> node mapping, since nodes are not always consecutive + _nindex_to_node = new (ResourceObj::C_HEAP, mtInternal) GrowableArray(0, true); + rebuild_nindex_to_node_map(); +@@ -3081,12 +3104,15 @@ void os::Linux::rebuild_cpu_to_node_map() { + for (size_t i = 0; i < node_num; i++) { + // Check if node is configured (not a memory-less node). If it is not, find + // the closest configured node. +- if (!isnode_in_configured_nodes(nindex_to_node()->at(i))) { ++ if (!isnode_in_configured_nodes(nindex_to_node()->at(i)) || ++ !isnode_in_bound_nodes(nindex_to_node()->at(i))) { + closest_distance = INT_MAX; + // Check distance from all remaining nodes in the system. Ignore distance + // from itself and from another non-configured node. + for (size_t m = 0; m < node_num; m++) { +- if (m != i && isnode_in_configured_nodes(nindex_to_node()->at(m))) { ++ if (m != i && ++ isnode_in_configured_nodes(nindex_to_node()->at(m)) && ++ isnode_in_bound_nodes(nindex_to_node()->at(m))) { + distance = numa_distance(nindex_to_node()->at(i), nindex_to_node()->at(m)); + // If a closest node is found, update. 
There is always at least one + // configured node in the system so there is always at least one node +@@ -3140,9 +3166,16 @@ os::Linux::numa_interleave_memory_v2_func_t os::Linux::_numa_interleave_memory_v + os::Linux::numa_set_bind_policy_func_t os::Linux::_numa_set_bind_policy; + os::Linux::numa_bitmask_isbitset_func_t os::Linux::_numa_bitmask_isbitset; + os::Linux::numa_distance_func_t os::Linux::_numa_distance; ++os::Linux::numa_get_membind_func_t os::Linux::_numa_get_membind; ++os::Linux::numa_get_interleave_mask_func_t os::Linux::_numa_get_interleave_mask; ++os::Linux::numa_move_pages_func_t os::Linux::_numa_move_pages; ++os::Linux::numa_run_on_node_func_t os::Linux::_numa_run_on_node; ++os::Linux::NumaAllocationPolicy os::Linux::_current_numa_policy; + unsigned long* os::Linux::_numa_all_nodes; + struct bitmask* os::Linux::_numa_all_nodes_ptr; + struct bitmask* os::Linux::_numa_nodes_ptr; ++struct bitmask* os::Linux::_numa_interleave_bitmask; ++struct bitmask* os::Linux::_numa_membind_bitmask; + + bool os::pd_uncommit_memory(char* addr, size_t size) { + uintptr_t res = (uintptr_t) ::mmap(addr, size, PROT_NONE, +@@ -5195,9 +5228,11 @@ jint os::init_2(void) + if (!Linux::libnuma_init()) { + UseNUMA = false; + } else { +- if ((Linux::numa_max_node() < 1)) { ++ if ((Linux::numa_max_node() < 1) || Linux::isbound_to_single_node()) { + // There's only one node(they start from 0), disable NUMA. + UseNUMA = false; ++ } else { ++ Linux::set_configured_numa_policy(Linux::identify_numa_policy()); + } + } + // With SHM and HugeTLBFS large pages we cannot uncommit a page, so there's no way +diff --git a/hotspot/src/os/linux/vm/os_linux.hpp b/hotspot/src/os/linux/vm/os_linux.hpp +index 79a9f39ab..c6748824e 100644 +--- a/hotspot/src/os/linux/vm/os_linux.hpp ++++ b/hotspot/src/os/linux/vm/os_linux.hpp +@@ -260,6 +260,10 @@ private: + typedef int (*numa_tonode_memory_func_t)(void *start, size_t size, int node); + typedef void (*numa_interleave_memory_func_t)(void *start, size_t size, unsigned long *nodemask); + typedef void (*numa_interleave_memory_v2_func_t)(void *start, size_t size, struct bitmask* mask); ++ typedef struct bitmask* (*numa_get_membind_func_t)(void); ++ typedef struct bitmask* (*numa_get_interleave_mask_func_t)(void); ++ typedef long (*numa_move_pages_func_t)(int pid, unsigned long count, void **pages, const int *nodes, int *status, int flags); ++ typedef int (*numa_run_on_node_func_t)(int node); + + typedef void (*numa_set_bind_policy_func_t)(int policy); + typedef int (*numa_bitmask_isbitset_func_t)(struct bitmask *bmp, unsigned int n); +@@ -276,9 +280,16 @@ private: + static numa_set_bind_policy_func_t _numa_set_bind_policy; + static numa_bitmask_isbitset_func_t _numa_bitmask_isbitset; + static numa_distance_func_t _numa_distance; ++ static numa_get_membind_func_t _numa_get_membind; ++ static numa_get_interleave_mask_func_t _numa_get_interleave_mask; ++ static numa_move_pages_func_t _numa_move_pages; ++ static numa_run_on_node_func_t _numa_run_on_node; ++ + static unsigned long* _numa_all_nodes; + static struct bitmask* _numa_all_nodes_ptr; + static struct bitmask* _numa_nodes_ptr; ++ static struct bitmask* _numa_interleave_bitmask; ++ static struct bitmask* _numa_membind_bitmask; + + static void set_sched_getcpu(sched_getcpu_func_t func) { _sched_getcpu = func; } + static void set_numa_node_to_cpus(numa_node_to_cpus_func_t func) { _numa_node_to_cpus = func; } +@@ -291,10 +302,24 @@ private: + static void set_numa_set_bind_policy(numa_set_bind_policy_func_t func) { 
_numa_set_bind_policy = func; } + static void set_numa_bitmask_isbitset(numa_bitmask_isbitset_func_t func) { _numa_bitmask_isbitset = func; } + static void set_numa_distance(numa_distance_func_t func) { _numa_distance = func; } ++ static void set_numa_get_membind(numa_get_membind_func_t func) { _numa_get_membind = func; } ++ static void set_numa_get_interleave_mask(numa_get_interleave_mask_func_t func) { _numa_get_interleave_mask = func; } ++ static void set_numa_move_pages(numa_move_pages_func_t func) { _numa_move_pages = func; } ++ static void set_numa_run_on_node(numa_run_on_node_func_t func) { _numa_run_on_node = func; } + static void set_numa_all_nodes(unsigned long* ptr) { _numa_all_nodes = ptr; } + static void set_numa_all_nodes_ptr(struct bitmask **ptr) { _numa_all_nodes_ptr = (ptr == NULL ? NULL : *ptr); } + static void set_numa_nodes_ptr(struct bitmask **ptr) { _numa_nodes_ptr = (ptr == NULL ? NULL : *ptr); } ++ static void set_numa_interleave_bitmask(struct bitmask* ptr) { _numa_interleave_bitmask = ptr ; } ++ static void set_numa_membind_bitmask(struct bitmask* ptr) { _numa_membind_bitmask = ptr ; } + static int sched_getcpu_syscall(void); ++ ++ enum NumaAllocationPolicy{ ++ NotInitialized, ++ Membind, ++ Interleave ++ }; ++ static NumaAllocationPolicy _current_numa_policy; ++ + public: + static int sched_getcpu() { return _sched_getcpu != NULL ? _sched_getcpu() : -1; } + static int numa_node_to_cpus(int node, unsigned long *buffer, int bufferlen) { +@@ -308,6 +333,20 @@ public: + static int numa_tonode_memory(void *start, size_t size, int node) { + return _numa_tonode_memory != NULL ? _numa_tonode_memory(start, size, node) : -1; + } ++ ++ static void set_configured_numa_policy(NumaAllocationPolicy numa_policy) { ++ _current_numa_policy = numa_policy; ++ } ++ ++ static NumaAllocationPolicy identify_numa_policy() { ++ for (int node = 0; node <= Linux::numa_max_node(); node++) { ++ if (Linux::_numa_bitmask_isbitset(Linux::_numa_interleave_bitmask, node)) { ++ return Interleave; ++ } ++ } ++ return Membind; ++ } ++ + static void numa_interleave_memory(void *start, size_t size) { + // Use v2 api if available + if (_numa_interleave_memory_v2 != NULL && _numa_all_nodes_ptr != NULL) { +@@ -324,6 +363,14 @@ public: + static int numa_distance(int node1, int node2) { + return _numa_distance != NULL ? _numa_distance(node1, node2) : -1; + } ++ static int numa_run_on_node(int node) { ++ return _numa_run_on_node != NULL ? _numa_run_on_node(node) : -1; ++ } ++ ++ static long numa_move_pages(int pid, unsigned long count, void **pages, const int *nodes, int *status, int flags) { ++ return _numa_move_pages != NULL ? _numa_move_pages(pid, count, pages, nodes, status, flags) : -1; ++ } ++ + static int get_node_by_cpu(int cpu_id); + static int get_existing_num_nodes(); + // Check if numa node is configured (non-zero memory node). +@@ -352,6 +399,39 @@ public: + } else + return 0; + } ++ // Check if node is in bound node set. ++ static bool isnode_in_bound_nodes(int node) { ++ if (_numa_membind_bitmask != NULL && _numa_bitmask_isbitset != NULL) { ++ return _numa_bitmask_isbitset(_numa_membind_bitmask, node); ++ } else { ++ return false; ++ } ++ } ++ // Check if bound to only one numa node. ++ // Returns true if bound to a single numa node, otherwise returns false. 
++ static bool isbound_to_single_node() { ++ int nodes = 0; ++ unsigned int node = 0; ++ unsigned int highest_node_number = 0; ++ ++ if (_numa_membind_bitmask != NULL && _numa_max_node != NULL && _numa_bitmask_isbitset != NULL) { ++ highest_node_number = _numa_max_node(); ++ } else { ++ return false; ++ } ++ ++ for (node = 0; node <= highest_node_number; node++) { ++ if (_numa_bitmask_isbitset(_numa_membind_bitmask, node)) { ++ nodes++; ++ } ++ } ++ ++ if (nodes == 1) { ++ return true; ++ } else { ++ return false; ++ } ++ } + }; + + +diff --git a/hotspot/src/os/solaris/vm/os_solaris.cpp b/hotspot/src/os/solaris/vm/os_solaris.cpp +index 732538434..d995f51e3 100644 +--- a/hotspot/src/os/solaris/vm/os_solaris.cpp ++++ b/hotspot/src/os/solaris/vm/os_solaris.cpp +@@ -2788,6 +2788,10 @@ int os::numa_get_group_id() { + return ids[os::random() % r]; + } + ++int os::numa_get_group_id_for_address(const void* address) { ++ return 0; ++} ++ + // Request information about the page. + bool os::get_page_info(char *start, page_info* info) { + const uint_t info_types[] = { MEMINFO_VLGRP, MEMINFO_VPAGESIZE }; +diff --git a/hotspot/src/os/windows/vm/os_windows.cpp b/hotspot/src/os/windows/vm/os_windows.cpp +index e7ff202af..39f5410d1 100644 +--- a/hotspot/src/os/windows/vm/os_windows.cpp ++++ b/hotspot/src/os/windows/vm/os_windows.cpp +@@ -3532,6 +3532,10 @@ size_t os::numa_get_leaf_groups(int *ids, size_t size) { + } + } + ++int os::numa_get_group_id_for_address(const void* address) { ++ return 0; ++} ++ + bool os::get_page_info(char *start, page_info* info) { + return false; + } +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1AllocRegion.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1AllocRegion.cpp +index f92ae1102..0f9bc3f81 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1AllocRegion.cpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1AllocRegion.cpp +@@ -235,15 +235,16 @@ void G1AllocRegion::trace(const char* str, size_t word_size, HeapWord* result) { + #endif // G1_ALLOC_REGION_TRACING + + G1AllocRegion::G1AllocRegion(const char* name, +- bool bot_updates) +- : _name(name), _bot_updates(bot_updates), ++ bool bot_updates, ++ uint node_index) ++ : _name(name), _bot_updates(bot_updates), _node_index(node_index), + _alloc_region(NULL), _count(0), _used_bytes_before(0), + _allocation_context(AllocationContext::system()) { } + + + HeapRegion* MutatorAllocRegion::allocate_new_region(size_t word_size, + bool force) { +- return _g1h->new_mutator_alloc_region(word_size, force); ++ return _g1h->new_mutator_alloc_region(word_size, force, _node_index); + } + + void MutatorAllocRegion::retire_region(HeapRegion* alloc_region, +@@ -254,7 +255,7 @@ void MutatorAllocRegion::retire_region(HeapRegion* alloc_region, + HeapRegion* SurvivorGCAllocRegion::allocate_new_region(size_t word_size, + bool force) { + assert(!force, "not supported for GC alloc regions"); +- return _g1h->new_gc_alloc_region(word_size, count(), InCSetState::Young); ++ return _g1h->new_gc_alloc_region(word_size, count(), InCSetState::Young, _node_index); + } + + void SurvivorGCAllocRegion::retire_region(HeapRegion* alloc_region, +@@ -265,7 +266,7 @@ void SurvivorGCAllocRegion::retire_region(HeapRegion* alloc_region, + HeapRegion* OldGCAllocRegion::allocate_new_region(size_t word_size, + bool force) { + assert(!force, "not supported for GC alloc regions"); +- return _g1h->new_gc_alloc_region(word_size, count(), InCSetState::Old); ++ return _g1h->new_gc_alloc_region(word_size, count(), InCSetState::Old, _node_index); + } + + void 
OldGCAllocRegion::retire_region(HeapRegion* alloc_region, +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1AllocRegion.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1AllocRegion.hpp +index 2edc6545c..bc1c65d5a 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1AllocRegion.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1AllocRegion.hpp +@@ -26,6 +26,7 @@ + #define SHARE_VM_GC_IMPLEMENTATION_G1_G1ALLOCREGION_HPP + + #include "gc_implementation/g1/heapRegion.hpp" ++#include "gc_implementation/g1/g1NUMA.hpp" + + class G1CollectedHeap; + +@@ -133,7 +134,9 @@ protected: + virtual void retire_region(HeapRegion* alloc_region, + size_t allocated_bytes) = 0; + +- G1AllocRegion(const char* name, bool bot_updates); ++ G1AllocRegion(const char* name, bool bot_updates, uint node_index); ++ // The memory node index this allocation region belongs to. ++ uint _node_index; + + public: + static void setup(G1CollectedHeap* g1h, HeapRegion* dummy_region); +@@ -197,8 +200,8 @@ protected: + virtual HeapRegion* allocate_new_region(size_t word_size, bool force); + virtual void retire_region(HeapRegion* alloc_region, size_t allocated_bytes); + public: +- MutatorAllocRegion() +- : G1AllocRegion("Mutator Alloc Region", false /* bot_updates */) { } ++ MutatorAllocRegion(uint node_index) ++ : G1AllocRegion("Mutator Alloc Region", false /* bot_updates */, node_index) { } + }; + + class SurvivorGCAllocRegion : public G1AllocRegion { +@@ -206,8 +209,8 @@ protected: + virtual HeapRegion* allocate_new_region(size_t word_size, bool force); + virtual void retire_region(HeapRegion* alloc_region, size_t allocated_bytes); + public: +- SurvivorGCAllocRegion() +- : G1AllocRegion("Survivor GC Alloc Region", false /* bot_updates */) { } ++ SurvivorGCAllocRegion(uint node_index) ++ : G1AllocRegion("Survivor GC Alloc Region", false /* bot_updates */, node_index) { } + }; + + class OldGCAllocRegion : public G1AllocRegion { +@@ -216,7 +219,7 @@ protected: + virtual void retire_region(HeapRegion* alloc_region, size_t allocated_bytes); + public: + OldGCAllocRegion() +- : G1AllocRegion("Old GC Alloc Region", true /* bot_updates */) { } ++ : G1AllocRegion("Old GC Alloc Region", true /* bot_updates */, G1NUMA::AnyNodeIndex) { } + + // This specialization of release() makes sure that the last card that has + // been allocated into has been completely filled by a dummy object. 
This +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.cpp +index 0d1ab8411..f6fb2cdee 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.cpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.cpp +@@ -26,19 +26,73 @@ + #include "gc_implementation/g1/g1Allocator.hpp" + #include "gc_implementation/g1/g1CollectedHeap.hpp" + #include "gc_implementation/g1/g1CollectorPolicy.hpp" ++#include "gc_implementation/g1/g1NUMA.hpp" + #include "gc_implementation/g1/heapRegion.inline.hpp" + #include "gc_implementation/g1/heapRegionSet.inline.hpp" + +-void G1DefaultAllocator::init_mutator_alloc_region() { +- assert(_mutator_alloc_region.get() == NULL, "pre-condition"); +- _mutator_alloc_region.init(); ++void G1DefaultAllocator::init_mutator_alloc_regions() { ++ for (uint i = 0; i < _num_alloc_regions; i++) { ++ assert(mutator_alloc_region(i)->get() == NULL, "pre-condition"); ++ mutator_alloc_region(i)->init(); ++ } ++} ++ ++void G1DefaultAllocator::release_mutator_alloc_regions() { ++ for (uint i = 0; i < _num_alloc_regions; i++) { ++ mutator_alloc_region(i)->release(); ++ assert(mutator_alloc_region(i)->get() == NULL, "post-condition"); ++ } ++} ++ ++inline HeapWord* G1DefaultAllocator::attempt_allocation_locked(size_t word_size, bool bot_updates, uint &node_index) { ++ node_index = current_node_index(); ++ HeapWord* result = mutator_alloc_region(node_index)->attempt_allocation_locked(word_size, bot_updates); ++ assert(result != NULL || mutator_alloc_region(node_index)->get() == NULL, ++ err_msg("Must not have a mutator alloc region if there is no memory, but is " PTR_FORMAT, p2i(mutator_alloc_region(node_index)->get()))); ++ return result; ++} ++ ++inline HeapWord* G1DefaultAllocator::attempt_allocation_force(size_t word_size, bool bot_updates, uint node_index) { ++ if (node_index == G1NUMA::AnyNodeIndex) { ++ return NULL; ++ } ++ assert(node_index < _num_alloc_regions, err_msg("Invalid index: %u", node_index)); ++ return mutator_alloc_region(node_index)->attempt_allocation_force(word_size, bot_updates); + } + +-void G1DefaultAllocator::release_mutator_alloc_region() { +- _mutator_alloc_region.release(); +- assert(_mutator_alloc_region.get() == NULL, "post-condition"); ++G1DefaultAllocator::G1DefaultAllocator(G1CollectedHeap* heap) : ++ G1Allocator(heap), ++ _numa(heap->numa()), ++ _num_alloc_regions(_numa->num_active_nodes()), ++ _mutator_alloc_regions(NULL), ++ _survivor_gc_alloc_regions(NULL), ++ _old_gc_alloc_region(), ++ _retained_old_gc_alloc_region(NULL) { ++ ++ _mutator_alloc_regions = NEW_C_HEAP_ARRAY(MutatorAllocRegion, _num_alloc_regions, mtGC); ++ _survivor_gc_alloc_regions = NEW_C_HEAP_ARRAY(SurvivorGCAllocRegion, _num_alloc_regions, mtGC); ++ for (uint i = 0; i < _num_alloc_regions; i++) { ++ ::new(_mutator_alloc_regions + i) MutatorAllocRegion(i); ++ ::new(_survivor_gc_alloc_regions + i) SurvivorGCAllocRegion(i); ++ } + } + ++G1DefaultAllocator::~G1DefaultAllocator() { ++ for (uint i = 0; i < _num_alloc_regions; i++) { ++ _mutator_alloc_regions[i].~MutatorAllocRegion(); ++ _survivor_gc_alloc_regions[i].~SurvivorGCAllocRegion(); ++ } ++ FREE_C_HEAP_ARRAY(MutatorAllocRegion, _mutator_alloc_regions, mtGC); ++ FREE_C_HEAP_ARRAY(SurvivorGCAllocRegion, _survivor_gc_alloc_regions, mtGC); ++} ++ ++#ifdef ASSERT ++bool G1Allocator::has_mutator_alloc_region() { ++ uint node_index = current_node_index(); ++ return mutator_alloc_region(node_index)->get() != NULL; ++} ++#endif ++ + void 
G1Allocator::reuse_retained_old_region(EvacuationInfo& evacuation_info, + OldGCAllocRegion* old, + HeapRegion** retained_old) { +@@ -76,7 +130,9 @@ void G1Allocator::reuse_retained_old_region(EvacuationInfo& evacuation_info, + void G1DefaultAllocator::init_gc_alloc_regions(EvacuationInfo& evacuation_info) { + assert_at_safepoint(true /* should_be_vm_thread */); + +- _survivor_gc_alloc_region.init(); ++ for (uint i = 0; i < _num_alloc_regions; i++) { ++ survivor_gc_alloc_region(i)->init(); ++ } + _old_gc_alloc_region.init(); + reuse_retained_old_region(evacuation_info, + &_old_gc_alloc_region, +@@ -85,9 +141,13 @@ void G1DefaultAllocator::init_gc_alloc_regions(EvacuationInfo& evacuation_info) + + void G1DefaultAllocator::release_gc_alloc_regions(uint no_of_gc_workers, EvacuationInfo& evacuation_info) { + AllocationContext_t context = AllocationContext::current(); +- evacuation_info.set_allocation_regions(survivor_gc_alloc_region(context)->count() + ++ uint survivor_region_count = 0; ++ for (uint node_index = 0; node_index < _num_alloc_regions; node_index++) { ++ survivor_region_count += survivor_gc_alloc_region(node_index)->count(); ++ survivor_gc_alloc_region(node_index)->release(); ++ } ++ evacuation_info.set_allocation_regions(survivor_region_count + + old_gc_alloc_region(context)->count()); +- survivor_gc_alloc_region(context)->release(); + // If we have an old GC alloc region to release, we'll save it in + // _retained_old_gc_alloc_region. If we don't + // _retained_old_gc_alloc_region will become NULL. This is what we +@@ -105,7 +165,9 @@ void G1DefaultAllocator::release_gc_alloc_regions(uint no_of_gc_workers, Evacuat + } + + void G1DefaultAllocator::abandon_gc_alloc_regions() { +- assert(survivor_gc_alloc_region(AllocationContext::current())->get() == NULL, "pre-condition"); ++ for (uint i = 0; i < _num_alloc_regions; i++) { ++ assert(survivor_gc_alloc_region(i)->get() == NULL, "pre-condition"); ++ } + assert(old_gc_alloc_region(AllocationContext::current())->get() == NULL, "pre-condition"); + _retained_old_gc_alloc_region = NULL; + } +@@ -113,16 +175,24 @@ void G1DefaultAllocator::abandon_gc_alloc_regions() { + G1ParGCAllocBuffer::G1ParGCAllocBuffer(size_t gclab_word_size) : + ParGCAllocBuffer(gclab_word_size), _retired(true) { } + ++G1ParGCAllocator::G1ParGCAllocator(G1CollectedHeap* g1h) : ++ _g1h(g1h), _survivor_alignment_bytes(calc_survivor_alignment_bytes()), ++ _numa(g1h->numa()), ++ _num_alloc_regions(_numa->num_active_nodes()), ++ _alloc_buffer_waste(0), _undo_waste(0) { ++} ++ + HeapWord* G1ParGCAllocator::allocate_direct_or_new_plab(InCSetState dest, + size_t word_sz, +- AllocationContext_t context) { ++ AllocationContext_t context, ++ uint node_index) { + size_t gclab_word_size = _g1h->desired_plab_sz(dest); + if (word_sz * 100 < gclab_word_size * ParallelGCBufferWastePct) { +- G1ParGCAllocBuffer* alloc_buf = alloc_buffer(dest, context); ++ G1ParGCAllocBuffer* alloc_buf = alloc_buffer(dest, context, node_index); + add_to_alloc_buffer_waste(alloc_buf->words_remaining()); + alloc_buf->retire(false /* end_of_gc */, false /* retain */); + +- HeapWord* buf = _g1h->par_allocate_during_gc(dest, gclab_word_size, context); ++ HeapWord* buf = _g1h->par_allocate_during_gc(dest, gclab_word_size, context, node_index); + if (buf == NULL) { + return NULL; // Let caller handle allocation failure. 
+ } +@@ -134,29 +204,47 @@ HeapWord* G1ParGCAllocator::allocate_direct_or_new_plab(InCSetState dest, + assert(obj != NULL, "buffer was definitely big enough..."); + return obj; + } else { +- return _g1h->par_allocate_during_gc(dest, word_sz, context); ++ return _g1h->par_allocate_during_gc(dest, word_sz, context, node_index); + } + } + + G1DefaultParGCAllocator::G1DefaultParGCAllocator(G1CollectedHeap* g1h) : +- G1ParGCAllocator(g1h), +- _surviving_alloc_buffer(g1h->desired_plab_sz(InCSetState::Young)), +- _tenured_alloc_buffer(g1h->desired_plab_sz(InCSetState::Old)) { ++ G1ParGCAllocator(g1h) { + for (uint state = 0; state < InCSetState::Num; state++) { + _alloc_buffers[state] = NULL; ++ uint length = alloc_buffers_length(state); ++ _alloc_buffers[state] = NEW_C_HEAP_ARRAY(G1ParGCAllocBuffer*, length, mtGC); ++ for (uint node_index = 0; node_index < length; node_index++) { ++ _alloc_buffers[state][node_index] = new G1ParGCAllocBuffer(_g1h->desired_plab_sz(state)); ++ } ++ } ++} ++ ++G1DefaultParGCAllocator::~G1DefaultParGCAllocator() { ++ for (in_cset_state_t state = 0; state < InCSetState::Num; state++) { ++ uint length = alloc_buffers_length(state); ++ for (uint node_index = 0; node_index < length; node_index++) { ++ delete _alloc_buffers[state][node_index]; ++ } ++ FREE_C_HEAP_ARRAY(G1ParGCAllocBuffer*, _alloc_buffers[state], mtGC); + } +- _alloc_buffers[InCSetState::Young] = &_surviving_alloc_buffer; +- _alloc_buffers[InCSetState::Old] = &_tenured_alloc_buffer; + } + + void G1DefaultParGCAllocator::retire_alloc_buffers() { + for (uint state = 0; state < InCSetState::Num; state++) { +- G1ParGCAllocBuffer* const buf = _alloc_buffers[state]; +- if (buf != NULL) { +- add_to_alloc_buffer_waste(buf->words_remaining()); +- buf->flush_stats_and_retire(_g1h->alloc_buffer_stats(state), ++ uint length = alloc_buffers_length(state); ++ for (uint node_index = 0; node_index < length; node_index++) { ++ G1ParGCAllocBuffer* const buf = _alloc_buffers[state][node_index]; ++ if (buf != NULL) { ++ add_to_alloc_buffer_waste(buf->words_remaining()); ++ buf->flush_stats_and_retire(_g1h->alloc_buffer_stats(state), + true /* end_of_gc */, + false /* retain */); ++ } + } + } + } ++ ++uint G1DefaultAllocator::current_node_index() const { ++ return _numa->index_of_current_thread(); ++} +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.hpp +index 04628b7de..9b26168a8 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.hpp +@@ -30,6 +30,8 @@ + #include "gc_implementation/g1/g1InCSetState.hpp" + #include "gc_implementation/shared/parGCAllocBuffer.hpp" + ++class G1NUMA; ++ + // Base class for G1 allocators. + class G1Allocator : public CHeapObj { + friend class VMStructs; +@@ -44,17 +46,27 @@ public: + G1Allocator(G1CollectedHeap* heap) : + _g1h(heap), _summary_bytes_used(0) { } + ++ // Node index of current thread. 
++ virtual uint current_node_index() const = 0; ++ + static G1Allocator* create_allocator(G1CollectedHeap* g1h); + +- virtual void init_mutator_alloc_region() = 0; +- virtual void release_mutator_alloc_region() = 0; ++ virtual void init_mutator_alloc_regions() = 0; ++ virtual void release_mutator_alloc_regions() = 0; + + virtual void init_gc_alloc_regions(EvacuationInfo& evacuation_info) = 0; + virtual void release_gc_alloc_regions(uint no_of_gc_workers, EvacuationInfo& evacuation_info) = 0; + virtual void abandon_gc_alloc_regions() = 0; + +- virtual MutatorAllocRegion* mutator_alloc_region(AllocationContext_t context) = 0; +- virtual SurvivorGCAllocRegion* survivor_gc_alloc_region(AllocationContext_t context) = 0; ++#ifdef ASSERT ++ // Do we currently have an active mutator region to allocate into? ++ bool has_mutator_alloc_region(); ++#endif ++ ++ virtual MutatorAllocRegion* mutator_alloc_region(uint node_index) = 0; ++ virtual SurvivorGCAllocRegion* survivor_gc_alloc_region(uint node_index) = 0; ++ virtual MutatorAllocRegion* mutator_alloc_region() = 0; ++ virtual SurvivorGCAllocRegion* survivor_gc_alloc_region() = 0; + virtual OldGCAllocRegion* old_gc_alloc_region(AllocationContext_t context) = 0; + virtual size_t used() = 0; + virtual bool is_retained_old_region(HeapRegion* hr) = 0; +@@ -63,6 +75,9 @@ public: + OldGCAllocRegion* old, + HeapRegion** retained); + ++ virtual HeapWord* attempt_allocation_locked(size_t word_size, bool bot_updates, uint &node) = 0; ++ virtual HeapWord* attempt_allocation_force(size_t word_size, bool bot_updates, uint node = G1NUMA::AnyNodeIndex) = 0; ++ + size_t used_unlocked() const { + return _summary_bytes_used; + } +@@ -93,37 +108,58 @@ public: + class G1DefaultAllocator : public G1Allocator { + protected: + // Alloc region used to satisfy mutator allocation requests. +- MutatorAllocRegion _mutator_alloc_region; ++ MutatorAllocRegion* _mutator_alloc_regions; + + // Alloc region used to satisfy allocation requests by the GC for + // survivor objects. +- SurvivorGCAllocRegion _survivor_gc_alloc_region; ++ SurvivorGCAllocRegion* _survivor_gc_alloc_regions; + + // Alloc region used to satisfy allocation requests by the GC for + // old objects. + OldGCAllocRegion _old_gc_alloc_region; + + HeapRegion* _retained_old_gc_alloc_region; ++ ++ G1NUMA* _numa; ++ // The number of MutatorAllocRegions used, one per memory node. 
++ size_t _num_alloc_regions; ++ + public: +- G1DefaultAllocator(G1CollectedHeap* heap) : G1Allocator(heap), _retained_old_gc_alloc_region(NULL) { } ++ G1DefaultAllocator(G1CollectedHeap* heap); ++ ~G1DefaultAllocator(); + +- virtual void init_mutator_alloc_region(); +- virtual void release_mutator_alloc_region(); ++ uint current_node_index() const; ++ uint num_nodes() { return (uint)_num_alloc_regions; } ++ ++ virtual void init_mutator_alloc_regions(); ++ virtual void release_mutator_alloc_regions(); + + virtual void init_gc_alloc_regions(EvacuationInfo& evacuation_info); + virtual void release_gc_alloc_regions(uint no_of_gc_workers, EvacuationInfo& evacuation_info); + virtual void abandon_gc_alloc_regions(); + ++ virtual HeapWord* attempt_allocation_locked(size_t word_size, bool bot_updates, uint &node); ++ virtual HeapWord* attempt_allocation_force(size_t word_size, bool bot_updates, uint node = G1NUMA::AnyNodeIndex); + virtual bool is_retained_old_region(HeapRegion* hr) { + return _retained_old_gc_alloc_region == hr; + } + +- virtual MutatorAllocRegion* mutator_alloc_region(AllocationContext_t context) { +- return &_mutator_alloc_region; ++ virtual MutatorAllocRegion* mutator_alloc_region() { ++ return &_mutator_alloc_regions[current_node_index()]; ++ } ++ ++ virtual SurvivorGCAllocRegion* survivor_gc_alloc_region() { ++ return &_survivor_gc_alloc_regions[current_node_index()]; + } + +- virtual SurvivorGCAllocRegion* survivor_gc_alloc_region(AllocationContext_t context) { +- return &_survivor_gc_alloc_region; ++ virtual MutatorAllocRegion* mutator_alloc_region(uint node_index) { ++ assert(node_index < _num_alloc_regions, err_msg("Invalid index: %u", node_index)); ++ return &_mutator_alloc_regions[node_index]; ++ } ++ ++ virtual SurvivorGCAllocRegion* survivor_gc_alloc_region(uint node_index) { ++ assert(node_index < _num_alloc_regions, err_msg("Invalid index: %u", node_index)); ++ return &_survivor_gc_alloc_regions[node_index]; + } + + virtual OldGCAllocRegion* old_gc_alloc_region(AllocationContext_t context) { +@@ -136,9 +172,11 @@ public: + size_t result = _summary_bytes_used; + + // Read only once in case it is set to NULL concurrently +- HeapRegion* hr = mutator_alloc_region(AllocationContext::current())->get(); +- if (hr != NULL) { +- result += hr->used(); ++ for (uint i = 0; i < _num_alloc_regions; i++) { ++ HeapRegion* hr = mutator_alloc_region(i)->get(); ++ if (hr != NULL) { ++ result += hr->used(); ++ } + } + return result; + } +@@ -173,6 +211,7 @@ class G1ParGCAllocator : public CHeapObj { + protected: + G1CollectedHeap* _g1h; + ++ typedef InCSetState::in_cset_state_t in_cset_state_t; + // The survivor alignment in effect in bytes. + // == 0 : don't align survivors + // != 0 : align survivors to that alignment +@@ -187,7 +226,12 @@ protected: + void add_to_undo_waste(size_t waste) { _undo_waste += waste; } + + virtual void retire_alloc_buffers() = 0; +- virtual G1ParGCAllocBuffer* alloc_buffer(InCSetState dest, AllocationContext_t context) = 0; ++ virtual G1ParGCAllocBuffer* alloc_buffer(InCSetState dest, AllocationContext_t context, uint node_index) = 0; ++ ++ // Returns the number of allocation buffers for the given dest. ++ // There is only 1 buffer for Old while Young may have multiple buffers depending on ++ // active NUMA nodes. ++ inline uint alloc_buffers_length(in_cset_state_t dest) const; + + // Calculate the survivor space object alignment in bytes. Returns that or 0 if + // there are no restrictions on survivor alignment. 
+@@ -203,30 +247,34 @@ protected: + } + } + ++ G1NUMA* _numa; ++ // The number of MutatorAllocRegions used, one per memory node. ++ size_t _num_alloc_regions; ++ + public: +- G1ParGCAllocator(G1CollectedHeap* g1h) : +- _g1h(g1h), _survivor_alignment_bytes(calc_survivor_alignment_bytes()), +- _alloc_buffer_waste(0), _undo_waste(0) { +- } ++ G1ParGCAllocator(G1CollectedHeap* g1h); + + static G1ParGCAllocator* create_allocator(G1CollectedHeap* g1h); + + size_t alloc_buffer_waste() { return _alloc_buffer_waste; } + size_t undo_waste() {return _undo_waste; } + ++ uint num_nodes() const { return (uint)_num_alloc_regions; } + // Allocate word_sz words in dest, either directly into the regions or by + // allocating a new PLAB. Returns the address of the allocated memory, NULL if + // not successful. + HeapWord* allocate_direct_or_new_plab(InCSetState dest, + size_t word_sz, +- AllocationContext_t context); ++ AllocationContext_t context, ++ uint node_index); + + // Allocate word_sz words in the PLAB of dest. Returns the address of the + // allocated memory, NULL if not successful. + HeapWord* plab_allocate(InCSetState dest, + size_t word_sz, +- AllocationContext_t context) { +- G1ParGCAllocBuffer* buffer = alloc_buffer(dest, context); ++ AllocationContext_t context, ++ uint node_index) { ++ G1ParGCAllocBuffer* buffer = alloc_buffer(dest, context, node_index); + if (_survivor_alignment_bytes == 0) { + return buffer->allocate(word_sz); + } else { +@@ -235,19 +283,19 @@ public: + } + + HeapWord* allocate(InCSetState dest, size_t word_sz, +- AllocationContext_t context) { +- HeapWord* const obj = plab_allocate(dest, word_sz, context); ++ AllocationContext_t context, uint node_index) { ++ HeapWord* const obj = plab_allocate(dest, word_sz, context, node_index); + if (obj != NULL) { + return obj; + } +- return allocate_direct_or_new_plab(dest, word_sz, context); ++ return allocate_direct_or_new_plab(dest, word_sz, context, node_index); + } + +- void undo_allocation(InCSetState dest, HeapWord* obj, size_t word_sz, AllocationContext_t context) { +- if (alloc_buffer(dest, context)->contains(obj)) { +- assert(alloc_buffer(dest, context)->contains(obj + word_sz - 1), ++ void undo_allocation(InCSetState dest, HeapWord* obj, size_t word_sz, AllocationContext_t context, uint node_index) { ++ if (alloc_buffer(dest, context, node_index)->contains(obj)) { ++ assert(alloc_buffer(dest, context, node_index)->contains(obj + word_sz - 1), + "should contain whole object"); +- alloc_buffer(dest, context)->undo_allocation(obj, word_sz); ++ alloc_buffer(dest, context, node_index)->undo_allocation(obj, word_sz); + } else { + CollectedHeap::fill_with_object(obj, word_sz); + add_to_undo_waste(word_sz); +@@ -256,19 +304,38 @@ public: + }; + + class G1DefaultParGCAllocator : public G1ParGCAllocator { +- G1ParGCAllocBuffer _surviving_alloc_buffer; +- G1ParGCAllocBuffer _tenured_alloc_buffer; +- G1ParGCAllocBuffer* _alloc_buffers[InCSetState::Num]; ++ G1ParGCAllocBuffer** _alloc_buffers[InCSetState::Num]; + + public: + G1DefaultParGCAllocator(G1CollectedHeap* g1h); ++ ~G1DefaultParGCAllocator(); + +- virtual G1ParGCAllocBuffer* alloc_buffer(InCSetState dest, AllocationContext_t context) { ++ virtual G1ParGCAllocBuffer* alloc_buffer(InCSetState dest, AllocationContext_t context, uint node_index) { + assert(dest.is_valid(), + err_msg("Allocation buffer index out-of-bounds: " CSETSTATE_FORMAT, dest.value())); + assert(_alloc_buffers[dest.value()] != NULL, + err_msg("Allocation buffer is NULL: " CSETSTATE_FORMAT, dest.value())); +- 
return _alloc_buffers[dest.value()]; ++ return alloc_buffer(dest.value(), node_index); ++ } ++ ++ inline G1ParGCAllocBuffer* alloc_buffer(in_cset_state_t dest, uint node_index) const { ++ assert(dest < InCSetState::Num, err_msg("Allocation buffer index out of bounds: %u", dest)); ++ ++ if (dest == InCSetState::Young) { ++ assert(node_index < alloc_buffers_length(dest), ++ err_msg("Allocation buffer index out of bounds: %u, %u", dest, node_index)); ++ return _alloc_buffers[dest][node_index]; ++ } else { ++ return _alloc_buffers[dest][0]; ++ } ++ } ++ ++ inline uint alloc_buffers_length(in_cset_state_t dest) const { ++ if (dest == InCSetState::Young) { ++ return num_nodes(); ++ } else { ++ return 1; ++ } + } + + virtual void retire_alloc_buffers() ; +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp +index 5cb135354..57dcff3f5 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp +@@ -75,6 +75,9 @@ size_t G1CollectedHeap::_humongous_object_threshold_in_words = 0; + // to-be-collected) are printed at "strategic" points before / during + // / after the collection --- this is useful for debugging + #define YOUNG_LIST_VERBOSE 0 ++ ++#define THREAD_MIGRATION_MAX_TIMES 1 ++ + // CURRENT STATUS + // This file is under construction. Search for "FIXME". + +@@ -515,7 +518,7 @@ G1CollectedHeap* G1CollectedHeap::_g1h; + // Private methods. + + HeapRegion* +-G1CollectedHeap::new_region_try_secondary_free_list(bool is_old) { ++G1CollectedHeap::new_region_try_secondary_free_list(bool is_old, uint node_index) { + MutexLockerEx x(SecondaryFreeList_lock, Mutex::_no_safepoint_check_flag); + while (!_secondary_free_list.is_empty() || free_regions_coming()) { + if (!_secondary_free_list.is_empty()) { +@@ -531,7 +534,7 @@ G1CollectedHeap::new_region_try_secondary_free_list(bool is_old) { + + assert(_hrm.num_free_regions() > 0, "if the secondary_free_list was not " + "empty we should have moved at least one entry to the free_list"); +- HeapRegion* res = _hrm.allocate_free_region(is_old); ++ HeapRegion* res = _hrm.allocate_free_region(is_old, node_index); + if (G1ConcRegionFreeingVerbose) { + gclog_or_tty->print_cr("G1ConcRegionFreeing [region alloc] : " + "allocated " HR_FORMAT " from secondary_free_list", +@@ -553,7 +556,7 @@ G1CollectedHeap::new_region_try_secondary_free_list(bool is_old) { + return NULL; + } + +-HeapRegion* G1CollectedHeap::new_region(size_t word_size, bool is_old, bool do_expand) { ++HeapRegion* G1CollectedHeap::new_region(size_t word_size, bool is_old, bool do_expand, uint node_index) { + assert(!isHumongous(word_size) || word_size <= HeapRegion::GrainWords, + "the only time we use this to allocate a humongous region is " + "when we are allocating a single humongous region"); +@@ -565,21 +568,21 @@ HeapRegion* G1CollectedHeap::new_region(size_t word_size, bool is_old, bool do_e + gclog_or_tty->print_cr("G1ConcRegionFreeing [region alloc] : " + "forced to look at the secondary_free_list"); + } +- res = new_region_try_secondary_free_list(is_old); ++ res = new_region_try_secondary_free_list(is_old, node_index); + if (res != NULL) { + return res; + } + } + } + +- res = _hrm.allocate_free_region(is_old); ++ res = _hrm.allocate_free_region(is_old, node_index); + + if (res == NULL) { + if (G1ConcRegionFreeingVerbose) { + gclog_or_tty->print_cr("G1ConcRegionFreeing [region alloc] : " + "res == NULL, trying the 
secondary_free_list"); + } +- res = new_region_try_secondary_free_list(is_old); ++ res = new_region_try_secondary_free_list(is_old, node_index); + } + if (res == NULL && do_expand && _expand_heap_after_alloc_failure) { + // Currently, only attempts to allocate GC alloc regions set +@@ -593,12 +596,12 @@ HeapRegion* G1CollectedHeap::new_region(size_t word_size, bool is_old, bool do_e + ergo_format_reason("region allocation request failed") + ergo_format_byte("allocation request"), + word_size * HeapWordSize); +- if (expand(word_size * HeapWordSize)) { ++ if (expand_single_region(node_index)) { + // Given that expand() succeeded in expanding the heap, and we + // always expand the heap by an amount aligned to the heap + // region size, the free list should in theory not be empty. + // In either case allocate_free_region() will check for NULL. +- res = _hrm.allocate_free_region(is_old); ++ res = _hrm.allocate_free_region(is_old, node_index); + } else { + _expand_heap_after_alloc_failure = false; + } +@@ -919,22 +922,29 @@ HeapWord* G1CollectedHeap::attempt_allocation_slow(size_t word_size, + + { + MutexLockerEx x(Heap_lock); +- result = _allocator->mutator_alloc_region(context)->attempt_allocation_locked(word_size, +- false /* bot_updates */); ++ uint node_idx_by_locked_alloc = G1NUMA::AnyNodeIndex; ++ result = _allocator->attempt_allocation_locked(word_size, ++ false /* bot_updates */, ++ node_idx_by_locked_alloc); + if (result != NULL) { + return result; + } + +- // If we reach here, attempt_allocation_locked() above failed to +- // allocate a new region. So the mutator alloc region should be NULL. +- assert(_allocator->mutator_alloc_region(context)->get() == NULL, "only way to get here"); +- + if (GC_locker::is_active_and_needs_gc()) { + if (g1_policy()->can_expand_young_list()) { + // No need for an ergo verbose message here, + // can_expand_young_list() does this when it returns true. +- result = _allocator->mutator_alloc_region(context)->attempt_allocation_force(word_size, +- false /* bot_updates */); ++ uint curr_node_index = _allocator->current_node_index(); ++ uint thread_migration_times = 0; ++ while (curr_node_index != node_idx_by_locked_alloc && thread_migration_times < THREAD_MIGRATION_MAX_TIMES) { ++ result = _allocator->attempt_allocation_locked(word_size, false, node_idx_by_locked_alloc); ++ if (result != NULL) { ++ return result; ++ } ++ thread_migration_times++; ++ curr_node_index = _allocator->current_node_index(); ++ } ++ result = _allocator->attempt_allocation_force(word_size, false /* bot_updates */, node_idx_by_locked_alloc); + if (result != NULL) { + return result; + } +@@ -994,7 +1004,7 @@ HeapWord* G1CollectedHeap::attempt_allocation_slow(size_t word_size, + // first attempt (without holding the Heap_lock) here and the + // follow-on attempt will be at the start of the next loop + // iteration (after taking the Heap_lock). 
+- result = _allocator->mutator_alloc_region(context)->attempt_allocation(word_size, ++ result = _allocator->mutator_alloc_region()->attempt_allocation(word_size, + false /* bot_updates */); + if (result != NULL) { + return result; +@@ -1134,12 +1144,12 @@ HeapWord* G1CollectedHeap::attempt_allocation_at_safepoint(size_t word_size, + AllocationContext_t context, + bool expect_null_mutator_alloc_region) { + assert_at_safepoint(true /* should_be_vm_thread */); +- assert(_allocator->mutator_alloc_region(context)->get() == NULL || ++ assert(!_allocator->has_mutator_alloc_region() || + !expect_null_mutator_alloc_region, + "the current alloc region was unexpectedly found to be non-NULL"); + + if (!isHumongous(word_size)) { +- return _allocator->mutator_alloc_region(context)->attempt_allocation_locked(word_size, ++ return _allocator->mutator_alloc_region()->attempt_allocation_locked(word_size, + false /* bot_updates */); + } else { + HeapWord* result = humongous_obj_allocate(word_size, context); +@@ -1341,7 +1351,7 @@ bool G1CollectedHeap::do_collection(bool explicit_gc, + concurrent_mark()->abort(); + + // Make sure we'll choose a new allocation region afterwards. +- _allocator->release_mutator_alloc_region(); ++ _allocator->release_mutator_alloc_regions(); + _allocator->abandon_gc_alloc_regions(); + g1_rem_set()->cleanupHRRS(); + +@@ -1517,7 +1527,7 @@ bool G1CollectedHeap::do_collection(bool explicit_gc, + + clear_cset_fast_test(); + +- _allocator->init_mutator_alloc_region(); ++ _allocator->init_mutator_alloc_regions(); + + double end = os::elapsedTime(); + g1_policy()->record_full_collection_end(); +@@ -1792,6 +1802,18 @@ bool G1CollectedHeap::expand(size_t expand_bytes) { + return regions_to_expand > 0; + } + ++bool G1CollectedHeap::expand_single_region(uint node_index) { ++ uint expanded_by = _hrm.expand_on_preferred_node(node_index); ++ ++ if (expanded_by == 0) { ++ assert(is_maximal_no_gc(), err_msg("Should be no regions left, available: %u", _hrm.available())); ++ return false; ++ } ++ ++ g1_policy()->record_new_heap_size(num_regions()); ++ return true; ++} ++ + void G1CollectedHeap::shrink_helper(size_t shrink_bytes) { + size_t aligned_shrink_bytes = + ReservedSpace::page_align_size_down(shrink_bytes); +@@ -1853,6 +1875,7 @@ G1CollectedHeap::G1CollectedHeap(G1CollectorPolicy* policy_) : + _ref_processor_cm(NULL), + _ref_processor_stw(NULL), + _bot_shared(NULL), ++ _numa(G1NUMA::numa()), + _evac_failure_scan_stack(NULL), + _mark_in_progress(false), + _cg1r(NULL), +@@ -2015,10 +2038,11 @@ jint G1CollectedHeap::initialize() { + // Carve out the G1 part of the heap. + + ReservedSpace g1_rs = heap_rs.first_part(max_byte_size); ++ size_t page_size = UseLargePages ? os::large_page_size() : os::vm_page_size(); + G1RegionToSpaceMapper* heap_storage = + G1RegionToSpaceMapper::create_mapper(g1_rs, + g1_rs.size(), +- UseLargePages ? os::large_page_size() : os::vm_page_size(), ++ page_size, + HeapRegion::GrainBytes, + 1, + mtJavaHeap); +@@ -2077,6 +2101,7 @@ jint G1CollectedHeap::initialize() { + _humongous_reclaim_candidates.initialize(start, end, granularity); + } + ++ _numa->set_region_info(HeapRegion::GrainBytes, page_size); + // Create the ConcurrentMark data structure and thread. + // (Must do this late, so that "max_regions" is defined.) 
+ _cm = new ConcurrentMark(this, prev_bitmap_storage, next_bitmap_storage); +@@ -2145,7 +2170,7 @@ jint G1CollectedHeap::initialize() { + dummy_region->set_top(dummy_region->end()); + G1AllocRegion::setup(this, dummy_region); + +- _allocator->init_mutator_alloc_region(); ++ _allocator->init_mutator_alloc_regions(); + + // Do create of the monitoring and management support so that + // values in the heap have been properly initialized. +@@ -2975,8 +3000,7 @@ size_t G1CollectedHeap::unsafe_max_tlab_alloc(Thread* ignored) const { + // Also, this value can be at most the humongous object threshold, + // since we can't allow tlabs to grow big enough to accommodate + // humongous objects. +- +- HeapRegion* hr = _allocator->mutator_alloc_region(AllocationContext::current())->get(); ++ HeapRegion* hr = _allocator->mutator_alloc_region()->get(); + size_t max_tlab = max_tlab_size() * wordSize; + if (hr == NULL) { + return max_tlab; +@@ -3535,6 +3559,15 @@ void G1CollectedHeap::print_on(outputStream* st) const { + st->print("%u survivors (" SIZE_FORMAT "K)", survivor_regions, + (size_t) survivor_regions * HeapRegion::GrainBytes / K); + st->cr(); ++ if (_numa->is_enabled()) { ++ uint num_nodes = _numa->num_active_nodes(); ++ st->print(" remaining free region(s) on each NUMA node: "); ++ const int* node_ids = _numa->node_ids(); ++ for (uint node_index = 0; node_index < num_nodes; node_index++) { ++ st->print("%d=%u ", node_ids[node_index], _hrm.num_free_regions(node_index)); ++ } ++ st->cr(); ++ } + MetaspaceAux::print_on(st); + } + +@@ -4032,6 +4065,8 @@ void G1CollectedHeap::log_gc_footer(double pause_time_sec) { + g1_policy()->phase_times()->note_gc_end(); + g1_policy()->phase_times()->print(pause_time_sec); + g1_policy()->print_detailed_heap_transition(); ++ // Print NUMA statistics. ++ _numa->print_statistics(); + } else { + if (evacuation_failed()) { + gclog_or_tty->print("--"); +@@ -4042,6 +4077,14 @@ void G1CollectedHeap::log_gc_footer(double pause_time_sec) { + gclog_or_tty->flush(); + } + ++void G1CollectedHeap::verify_numa_regions(const char* desc) { ++ if (G1Log::finer()) { ++ // Iterate all heap regions to print matching between preferred numa id and actual numa id. ++ G1NodeIndexCheckClosure cl(desc, _numa); ++ heap_region_iterate(&cl); ++ } ++} ++ + bool + G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) { + assert_at_safepoint(true /* should_be_vm_thread */); +@@ -4149,7 +4192,7 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) { + + verify_before_gc(); + check_bitmaps("GC Start"); +- ++ verify_numa_regions("GC Start"); + COMPILER2_PRESENT(DerivedPointerTable::clear()); + + // Please see comment in g1CollectedHeap.hpp and +@@ -4169,7 +4212,7 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) { + + // Forget the current alloc region (we might even choose it to be part + // of the collection set!). +- _allocator->release_mutator_alloc_region(); ++ _allocator->release_mutator_alloc_regions(); + + // We should call this after we retire the mutator alloc + // region(s) so that all the ALLOC / RETIRE events are generated +@@ -4223,7 +4266,6 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) { + #endif // YOUNG_LIST_VERBOSE + + g1_policy()->finalize_cset(target_pause_time_ms, evacuation_info); +- + // Make sure the remembered sets are up to date. 
This needs to be + // done before register_humongous_regions_with_cset(), because the + // remembered sets are used there to choose eager reclaim candidates. +@@ -4327,7 +4369,7 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) { + g1_policy()->print_collection_set(g1_policy()->inc_cset_head(), gclog_or_tty); + #endif // YOUNG_LIST_VERBOSE + +- _allocator->init_mutator_alloc_region(); ++ _allocator->init_mutator_alloc_regions(); + + { + size_t expand_bytes = g1_policy()->expansion_amount(); +@@ -4388,7 +4430,7 @@ G1CollectedHeap::do_collection_pause_at_safepoint(double target_pause_time_ms) { + + verify_after_gc(); + check_bitmaps("GC End"); +- ++ verify_numa_regions("GC End"); + assert(!ref_processor_stw()->discovery_enabled(), "Postcondition"); + ref_processor_stw()->verify_no_references_recorded(); + +@@ -4744,6 +4786,7 @@ class G1KlassScanClosure : public KlassClosure { + class G1ParTask : public AbstractGangTask { + protected: + G1CollectedHeap* _g1h; ++ G1ParScanThreadStateSet* _per_thread_states; + RefToScanQueueSet *_queues; + G1RootProcessor* _root_processor; + TaskTerminator _terminator; +@@ -4753,9 +4796,10 @@ protected: + Mutex* stats_lock() { return &_stats_lock; } + + public: +- G1ParTask(G1CollectedHeap* g1h, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor) ++ G1ParTask(G1CollectedHeap* g1h, G1ParScanThreadStateSet* per_thread_states, RefToScanQueueSet *task_queues, G1RootProcessor* root_processor) + : AbstractGangTask("G1 collection"), + _g1h(g1h), ++ _per_thread_states(per_thread_states), + _queues(task_queues), + _root_processor(root_processor), + _terminator(0, _queues), +@@ -4816,26 +4860,26 @@ public: + + ReferenceProcessor* rp = _g1h->ref_processor_stw(); + +- G1ParScanThreadState pss(_g1h, worker_id, rp); +- G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, rp); ++ G1ParScanThreadState* pss = _per_thread_states->state_for_worker(worker_id, rp); ++ G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, pss, rp); + +- pss.set_evac_failure_closure(&evac_failure_cl); ++ pss->set_evac_failure_closure(&evac_failure_cl); + + bool only_young = _g1h->g1_policy()->gcs_are_young(); + + // Non-IM young GC. +- G1ParCopyClosure scan_only_root_cl(_g1h, &pss, rp); ++ G1ParCopyClosure scan_only_root_cl(_g1h, pss, rp); + G1CLDClosure scan_only_cld_cl(&scan_only_root_cl, + only_young, // Only process dirty klasses. + false); // No need to claim CLDs. + // IM young GC. + // Strong roots closures. +- G1ParCopyClosure scan_mark_root_cl(_g1h, &pss, rp); ++ G1ParCopyClosure scan_mark_root_cl(_g1h, pss, rp); + G1CLDClosure scan_mark_cld_cl(&scan_mark_root_cl, + false, // Process all klasses. + true); // Need to claim CLDs. + // Weak roots closures. +- G1ParCopyClosure scan_mark_weak_root_cl(_g1h, &pss, rp); ++ G1ParCopyClosure scan_mark_weak_root_cl(_g1h, pss, rp); + G1CLDClosure scan_mark_weak_cld_cl(&scan_mark_weak_root_cl, + false, // Process all klasses. + true); // Need to claim CLDs. 
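From this point on, the worker closures receive a per-worker G1ParScanThreadState pointer instead of building the state on the stack; the states are owned by the G1ParScanThreadStateSet introduced near the end of this patch, created lazily per worker and flushed once after the parallel phase. A reduced sketch of that ownership pattern with toy types (only state_for_worker() and flush() correspond to names used in the patch):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct ToyWorkerState {
      size_t copied_words;
      ToyWorkerState() : copied_words(0) {}
      void flush_stats() { /* merge per-worker counters into global statistics */ }
    };

    class ToyWorkerStateSet {
      std::vector<ToyWorkerState*> _states;   // one slot per worker
      bool _flushed;
    public:
      explicit ToyWorkerStateSet(unsigned n_workers)
        : _states(n_workers, (ToyWorkerState*)NULL), _flushed(false) {}

      // Created on first use, so only workers that actually run allocate a state.
      ToyWorkerState* state_for_worker(unsigned worker_id) {
        assert(worker_id < _states.size());
        if (_states[worker_id] == NULL) {
          _states[worker_id] = new ToyWorkerState();
        }
        return _states[worker_id];
      }

      // Called once after the parallel phase: merge and free every state.
      void flush() {
        assert(!_flushed);
        for (size_t i = 0; i < _states.size(); i++) {
          if (_states[i] != NULL) {
            _states[i]->flush_stats();
            delete _states[i];
            _states[i] = NULL;
          }
        }
        _flushed = true;
      }

      ~ToyWorkerStateSet() { assert(_flushed); }
    };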
+@@ -4866,7 +4910,7 @@ public: + weak_cld_cl = &scan_only_cld_cl; + } + +- pss.start_strong_roots(); ++ pss->start_strong_roots(); + + _root_processor->evacuate_roots(strong_root_cl, + weak_root_cl, +@@ -4875,31 +4919,31 @@ public: + trace_metadata, + worker_id); + +- G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, &pss); ++ G1ParPushHeapRSClosure push_heap_rs_cl(_g1h, pss); + _root_processor->scan_remembered_sets(&push_heap_rs_cl, + weak_root_cl, + worker_id); +- pss.end_strong_roots(); ++ pss->end_strong_roots(); + + { + double start = os::elapsedTime(); +- G1ParEvacuateFollowersClosure evac(_g1h, &pss, _queues, _terminator.terminator()); ++ G1ParEvacuateFollowersClosure evac(_g1h, pss, _queues, _terminator.terminator()); + evac.do_void(); + double elapsed_sec = os::elapsedTime() - start; +- double term_sec = pss.term_time(); ++ double term_sec = pss->term_time(); + _g1h->g1_policy()->phase_times()->add_time_secs(G1GCPhaseTimes::ObjCopy, worker_id, elapsed_sec - term_sec); + _g1h->g1_policy()->phase_times()->record_time_secs(G1GCPhaseTimes::Termination, worker_id, term_sec); +- _g1h->g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, pss.term_attempts()); ++ _g1h->g1_policy()->phase_times()->record_thread_work_item(G1GCPhaseTimes::Termination, worker_id, pss->term_attempts()); + } +- _g1h->g1_policy()->record_thread_age_table(pss.age_table()); +- _g1h->update_surviving_young_words(pss.surviving_young_words()+1); ++ _g1h->g1_policy()->record_thread_age_table(pss->age_table()); ++ _g1h->update_surviving_young_words(pss->surviving_young_words()+1); + + if (ParallelGCVerbose) { + MutexLocker x(stats_lock()); +- pss.print_termination_stats(worker_id); ++ pss->print_termination_stats(worker_id); + } + +- assert(pss.queue_is_empty(), "should be empty"); ++ assert(pss->queue_is_empty(), "should be empty"); + + // Close the inner scope so that the ResourceMark and HandleMark + // destructors are executed here and are included as part of the +@@ -5890,8 +5934,9 @@ void G1CollectedHeap::evacuate_collection_set(EvacuationInfo& evacuation_info) { + double end_par_time_sec; + + { ++ G1ParScanThreadStateSet per_thread_states(this, workers()->active_workers()); + G1RootProcessor root_processor(this); +- G1ParTask g1_par_task(this, _task_queues, &root_processor); ++ G1ParTask g1_par_task(this, &per_thread_states, _task_queues, &root_processor); + // InitialMark needs claim bits to keep track of the marked-through CLDs. + if (g1_policy()->during_initial_mark_pause()) { + ClassLoaderDataGraph::clear_claimed_marks(); +@@ -5916,6 +5961,8 @@ void G1CollectedHeap::evacuate_collection_set(EvacuationInfo& evacuation_info) { + // elapsed time before closing the scope so that time + // taken for the destructor is NOT included in the + // reported parallel time. 
++ ++ per_thread_states.flush(); + } + + G1GCPhaseTimes* phase_times = g1_policy()->phase_times(); +@@ -6325,7 +6372,6 @@ void G1CollectedHeap::free_collection_set(HeapRegion* cs_head, EvacuationInfo& e + // all we need to do to clear the young list is clear its + // head and length, and unlink any young regions in the code below + _young_list->clear(); +- + G1CollectorPolicy* policy = g1_policy(); + + double start_sec = os::elapsedTime(); +@@ -6803,7 +6849,8 @@ bool G1CollectedHeap::is_in_closed_subset(const void* p) const { + // Methods for the mutator alloc region + + HeapRegion* G1CollectedHeap::new_mutator_alloc_region(size_t word_size, +- bool force) { ++ bool force, ++ uint node_index) { + assert_heap_locked_or_at_safepoint(true /* should_be_vm_thread */); + assert(!force || g1_policy()->can_expand_young_list(), + "if force is true we should be able to expand the young list"); +@@ -6811,7 +6858,8 @@ HeapRegion* G1CollectedHeap::new_mutator_alloc_region(size_t word_size, + if (force || !young_list_full) { + HeapRegion* new_alloc_region = new_region(word_size, + false /* is_old */, +- false /* do_expand */); ++ false /* do_expand */, ++ node_index); + if (new_alloc_region != NULL) { + set_region_short_lived_locked(new_alloc_region); + _hr_printer.alloc(new_alloc_region, G1HRPrinter::Eden, young_list_full); +@@ -6856,14 +6904,16 @@ void G1CollectedHeap::set_par_threads() { + + HeapRegion* G1CollectedHeap::new_gc_alloc_region(size_t word_size, + uint count, +- InCSetState dest) { ++ InCSetState dest, ++ uint node_index) { + assert(FreeList_lock->owned_by_self(), "pre-condition"); + + if (count < g1_policy()->max_regions(dest)) { + const bool is_survivor = (dest.is_young()); + HeapRegion* new_alloc_region = new_region(word_size, + !is_survivor, +- true /* do_expand */); ++ true /* do_expand */, ++ node_index); + if (new_alloc_region != NULL) { + // We really only need to do this for old regions given that we + // should never scan survivors. But it doesn't hurt to do it +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp +index f8c52e681..61d5aad2d 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp +@@ -268,6 +268,9 @@ private: + // Callback for region mapping changed events. + G1RegionMappingChangedListener _listener; + ++ // Handle G1 NUMA support. ++ G1NUMA* _numa; ++ + // The sequence of all heap regions in the heap. + HeapRegionManager _hrm; + +@@ -468,14 +471,14 @@ protected: + // check whether there's anything available on the + // secondary_free_list and/or wait for more regions to appear on + // that list, if _free_regions_coming is set. +- HeapRegion* new_region_try_secondary_free_list(bool is_old); ++ HeapRegion* new_region_try_secondary_free_list(bool is_old, uint node_index); + + // Try to allocate a single non-humongous HeapRegion sufficient for + // an allocation of the given word_size. If do_expand is true, + // attempt to expand the heap if necessary to satisfy the allocation + // request. If the region is to be used as an old region or for a + // humongous object, set is_old to true. If not, to false. 
+- HeapRegion* new_region(size_t word_size, bool is_old, bool do_expand); ++ HeapRegion* new_region(size_t word_size, bool is_old, bool do_expand, uint node_index = G1NUMA::AnyNodeIndex); + + // Initialize a contiguous set of free regions of length num_regions + // and starting at index first so that they appear as a single +@@ -573,14 +576,16 @@ protected: + // may not be a humongous - it must fit into a single heap region. + inline HeapWord* par_allocate_during_gc(InCSetState dest, + size_t word_size, +- AllocationContext_t context); ++ AllocationContext_t context, ++ uint node_index); + // Ensure that no further allocations can happen in "r", bearing in mind + // that parallel threads might be attempting allocations. + void par_allocate_remaining_space(HeapRegion* r); + + // Allocation attempt during GC for a survivor object / PLAB. + inline HeapWord* survivor_attempt_allocation(size_t word_size, +- AllocationContext_t context); ++ AllocationContext_t context, ++ uint node_index); + + // Allocation attempt during GC for an old object / PLAB. + inline HeapWord* old_attempt_allocation(size_t word_size, +@@ -589,13 +594,13 @@ protected: + // These methods are the "callbacks" from the G1AllocRegion class. + + // For mutator alloc regions. +- HeapRegion* new_mutator_alloc_region(size_t word_size, bool force); ++ HeapRegion* new_mutator_alloc_region(size_t word_size, bool force, uint node_index); + void retire_mutator_alloc_region(HeapRegion* alloc_region, + size_t allocated_bytes); + + // For GC alloc regions. + HeapRegion* new_gc_alloc_region(size_t word_size, uint count, +- InCSetState dest); ++ InCSetState dest, uint node_index); + void retire_gc_alloc_region(HeapRegion* alloc_region, + size_t allocated_bytes, InCSetState dest); + +@@ -641,6 +646,8 @@ protected: + // after processing. + void enqueue_discovered_references(uint no_of_gc_workers); + ++ void verify_numa_regions(const char* desc); ++ + public: + + G1Allocator* allocator() { +@@ -654,11 +661,13 @@ public: + return _g1mm; + } + ++ G1NUMA* numa() const { return _numa; } + // Expand the garbage-first heap by at least the given size (in bytes!). + // Returns true if the heap was expanded by the requested amount; + // false otherwise. + // (Rounds up to a HeapRegion boundary.) + bool expand(size_t expand_bytes); ++ bool expand_single_region(uint node_index); + + // Returns the PLAB statistics for a given destination. 
+ inline PLABStats* alloc_buffer_stats(InCSetState dest); +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.inline.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.inline.hpp +index c8b270aa3..9350c7bac 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.inline.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.inline.hpp +@@ -58,10 +58,11 @@ size_t G1CollectedHeap::desired_plab_sz(InCSetState dest) { + + HeapWord* G1CollectedHeap::par_allocate_during_gc(InCSetState dest, + size_t word_size, +- AllocationContext_t context) { ++ AllocationContext_t context, ++ uint node_index) { + switch (dest.value()) { + case InCSetState::Young: +- return survivor_attempt_allocation(word_size, context); ++ return survivor_attempt_allocation(word_size, context, node_index); + case InCSetState::Old: + return old_attempt_allocation(word_size, context); + default: +@@ -138,7 +139,7 @@ inline HeapWord* G1CollectedHeap::attempt_allocation(size_t word_size, + "be called for humongous allocation requests"); + + AllocationContext_t context = AllocationContext::current(); +- HeapWord* result = _allocator->mutator_alloc_region(context)->attempt_allocation(word_size, ++ HeapWord* result = _allocator->mutator_alloc_region()->attempt_allocation(word_size, + false /* bot_updates */); + if (result == NULL) { + result = attempt_allocation_slow(word_size, +@@ -154,15 +155,16 @@ inline HeapWord* G1CollectedHeap::attempt_allocation(size_t word_size, + } + + inline HeapWord* G1CollectedHeap::survivor_attempt_allocation(size_t word_size, +- AllocationContext_t context) { ++ AllocationContext_t context, ++ uint node_index) { + assert(!isHumongous(word_size), + "we should not be seeing humongous-size allocations in this path"); + +- HeapWord* result = _allocator->survivor_gc_alloc_region(context)->attempt_allocation(word_size, ++ HeapWord* result = _allocator->survivor_gc_alloc_region(node_index)->attempt_allocation(word_size, + false /* bot_updates */); + if (result == NULL) { + MutexLockerEx x(FreeList_lock, Mutex::_no_safepoint_check_flag); +- result = _allocator->survivor_gc_alloc_region(context)->attempt_allocation_locked(word_size, ++ result = _allocator->survivor_gc_alloc_region(node_index)->attempt_allocation_locked(word_size, + false /* bot_updates */); + } + if (result != NULL) { +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1InCSetState.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1InCSetState.hpp +index 50639c330..cbeb93f34 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1InCSetState.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1InCSetState.hpp +@@ -58,10 +58,10 @@ struct InCSetState { + // or not, which is encoded by values < 0. + // The other values are simply encoded in increasing generation order, which + // makes getting the next generation fast by a simple increment. +- Humongous = -1, // The region is humongous - note that actually any value < 0 would be possible here. +- NotInCSet = 0, // The region is not in the collection set. +- Young = 1, // The region is in the collection set and a young region. +- Old = 2, // The region is in the collection set and an old region. ++ Humongous = -2, // The region is humongous - note that actually any value < 0 would be possible here. ++ NotInCSet = -1, // The region is not in the collection set. ++ Young = 0, // The region is in the collection set and a young region. ++ Old = 1, // The region is in the collection set and an old region. 
+     Num
+   };
+ 
+diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1NUMA.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1NUMA.cpp
+new file mode 100644
+index 000000000..05b4d8989
+--- /dev/null
++++ b/hotspot/src/share/vm/gc_implementation/g1/g1NUMA.cpp
+@@ -0,0 +1,311 @@
++/*
++ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
++ *
++ * This code is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License version 2 only, as
++ * published by the Free Software Foundation.
++ *
++ * This code is distributed in the hope that it will be useful, but WITHOUT
++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
++ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
++ * version 2 for more details (a copy is included in the LICENSE file that
++ * accompanied this code).
++ *
++ * You should have received a copy of the GNU General Public License version
++ * 2 along with this work; if not, write to the Free Software Foundation,
++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
++ *
++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
++ * or visit www.oracle.com if you need additional information or have any
++ * questions.
++ *
++ */
++
++#include "precompiled.hpp"
++#include "gc_implementation/g1/g1Log.hpp"
++#include "gc_implementation/g1/g1NUMA.hpp"
++#include "memory/allocation.hpp"
++#include "runtime/globals.hpp"
++#include "runtime/os.hpp"
++#include "utilities/align.hpp"
++
++G1NUMA* G1NUMA::_inst = NULL;
++
++size_t G1NUMA::region_size() const {
++  assert(_region_size > 0, "Heap region size is not yet set");
++  return _region_size;
++}
++
++size_t G1NUMA::page_size() const {
++  assert(_page_size > 0, "Page size is not yet set");
++  return _page_size;
++}
++
++bool G1NUMA::is_enabled() const { return num_active_nodes() > 1; }
++
++G1NUMA* G1NUMA::create() {
++  guarantee(_inst == NULL, "Should be called once.");
++  _inst = new G1NUMA();
++
++  // NUMA only supported on Linux.
++#ifdef LINUX
++  _inst->initialize(UseNUMA);
++#else
++  _inst->initialize(false);
++#endif /* LINUX */
++
++  return _inst;
++}
++
++ // Returns memory node ids
++const int* G1NUMA::node_ids() const {
++  return _node_ids;
++}
++
++uint G1NUMA::index_of_node_id(int node_id) const {
++  assert(node_id >= 0, err_msg("invalid node id %d", node_id));
++  assert(node_id < _len_node_id_to_index_map, err_msg("invalid node id %d", node_id));
++  uint node_index = _node_id_to_index_map[node_id];
++  assert(node_index != G1NUMA::UnknownNodeIndex,
++         err_msg("invalid node id %d", node_id));
++  return node_index;
++}
++
++G1NUMA::G1NUMA() :
++  _node_id_to_index_map(NULL), _len_node_id_to_index_map(0),
++  _node_ids(NULL), _num_active_node_ids(0),
++  _region_size(0), _page_size(0), _stats(NULL) {
++}
++
++void G1NUMA::initialize_without_numa() {
++  // If NUMA is not enabled or supported, initialize as having a single node.
++ _num_active_node_ids = 1; ++ _node_ids = NEW_C_HEAP_ARRAY(int, _num_active_node_ids, mtGC); ++ _node_ids[0] = 0; ++ // Map index 0 to node 0 ++ _len_node_id_to_index_map = 1; ++ _node_id_to_index_map = NEW_C_HEAP_ARRAY(uint, _len_node_id_to_index_map, mtGC); ++ _node_id_to_index_map[0] = 0; ++} ++ ++void G1NUMA::initialize(bool use_numa) { ++ if (!use_numa) { ++ initialize_without_numa(); ++ return; ++ } ++ ++ assert(UseNUMA, "Invariant"); ++ size_t num_node_ids = os::numa_get_groups_num(); ++ ++ // Create an array of active node ids. ++ _node_ids = NEW_C_HEAP_ARRAY(int, num_node_ids, mtGC); ++ _num_active_node_ids = (uint)os::numa_get_leaf_groups(_node_ids, num_node_ids); ++ ++ int max_node_id = 0; ++ for (uint i = 0; i < _num_active_node_ids; i++) { ++ max_node_id = MAX2(max_node_id, _node_ids[i]); ++ } ++ ++ // Create a mapping between node_id and index. ++ _len_node_id_to_index_map = max_node_id + 1; ++ _node_id_to_index_map = NEW_C_HEAP_ARRAY(uint, _len_node_id_to_index_map, mtGC); ++ ++ // Set all indices with unknown node id. ++ for (int i = 0; i < _len_node_id_to_index_map; i++) { ++ _node_id_to_index_map[i] = G1NUMA::UnknownNodeIndex; ++ } ++ ++ // Set the indices for the actually retrieved node ids. ++ for (uint i = 0; i < _num_active_node_ids; i++) { ++ _node_id_to_index_map[_node_ids[i]] = i; ++ } ++ ++ _stats = new G1NUMAStats(_node_ids, _num_active_node_ids); ++} ++ ++G1NUMA::~G1NUMA() { ++ delete _stats; ++ FREE_C_HEAP_ARRAY(int, _node_id_to_index_map, mtGC); ++ FREE_C_HEAP_ARRAY(int, _node_ids, mtGC); ++} ++ ++void G1NUMA::set_region_info(size_t region_size, size_t page_size) { ++ _region_size = region_size; ++ _page_size = page_size; ++} ++ ++uint G1NUMA::num_active_nodes() const { ++ assert(_num_active_node_ids > 0, "just checking"); ++ return _num_active_node_ids; ++} ++ ++uint G1NUMA::index_of_current_thread() const { ++ if (!is_enabled()) { ++ return 0; ++ } ++ return index_of_node_id(os::numa_get_group_id()); ++} ++ ++uint G1NUMA::preferred_node_index_for_index(uint region_index) const { ++ if (region_size() >= page_size()) { ++ // Simple case, pages are smaller than the region so we ++ // can just alternate over the nodes. ++ return region_index % _num_active_node_ids; ++ } else { ++ // Multiple regions in one page, so we need to make sure the ++ // regions within a page is preferred on the same node. ++ size_t regions_per_page = page_size() / region_size(); ++ return (region_index / regions_per_page) % _num_active_node_ids; ++ } ++} ++ ++int G1NUMA::numa_id(int index) const { ++ assert(index < _len_node_id_to_index_map, err_msg("Index %d out of range: [0,%d)", ++ index, _len_node_id_to_index_map)); ++ return _node_ids[index]; ++} ++ ++uint G1NUMA::index_of_address(HeapWord *address) const { ++ int numa_id = os::numa_get_group_id_for_address((const void*)address); ++ if (numa_id == -1) { ++ return UnknownNodeIndex; ++ } else { ++ return index_of_node_id(numa_id); ++ } ++} ++ ++uint G1NUMA::index_for_region(HeapRegion* hr) const { ++ if (!is_enabled()) { ++ return 0; ++ } ++ ++ if (AlwaysPreTouch) { ++ // If we already pretouched, we can check actual node index here. ++ // However, if node index is still unknown, use preferred node index. ++ uint node_index = index_of_address(hr->bottom()); ++ if (node_index != UnknownNodeIndex) { ++ return node_index; ++ } ++ } ++ ++ return preferred_node_index_for_index(hr->hrm_index()); ++} ++ ++// Request to spread the given memory evenly across the available NUMA ++// nodes. 
Which node to request for a given address is determined by the
++// region size and the page size. Below are two examples on a 4 NUMA node system:
++//   1. G1HeapRegionSize(_region_size) is larger than or equal to page size.
++//      * Page #:       |-0--||-1--||-2--||-3--||-4--||-5--||-6--||-7--||-8--||-9--||-10-||-11-||-12-||-13-||-14-||-15-|
++//      * HeapRegion #:  |----#0----||----#1----||----#2----||----#3----||----#4----||----#5----||----#6----||----#7----|
++//      * NUMA node #:   |----#0----||----#1----||----#2----||----#3----||----#0----||----#1----||----#2----||----#3----|
++//   2. G1HeapRegionSize(_region_size) is smaller than page size.
++//      Memory will be touched one page at a time because G1RegionToSpaceMapper commits
++//      pages one by one.
++//      * Page #:       |-----0----||-----1----||-----2----||-----3----||-----4----||-----5----||-----6----||-----7----|
++//      * HeapRegion #:  |-#0-||-#1-||-#2-||-#3-||-#4-||-#5-||-#6-||-#7-||-#8-||-#9-||#10-||#11-||#12-||#13-||#14-||#15-|
++//      * NUMA node #:   |----#0----||----#1----||----#2----||----#3----||----#0----||----#1----||----#2----||----#3----|
++void G1NUMA::request_memory_on_node(void* aligned_address, size_t size_in_bytes, uint region_index) {
++  if (!is_enabled()) {
++    return;
++  }
++
++  if (size_in_bytes == 0) {
++    return;
++  }
++
++  uint node_index = preferred_node_index_for_index(region_index);
++
++  assert(is_aligned(aligned_address, page_size()), err_msg("Given address (" PTR_FORMAT ") should be aligned.", p2i(aligned_address)));
++  assert(is_aligned(size_in_bytes, page_size()), err_msg("Given size (" SIZE_FORMAT ") should be aligned.", size_in_bytes));
++
++  if (G1Log::finer()) {
++    gclog_or_tty->print_cr("Request memory [" PTR_FORMAT ", " PTR_FORMAT ") to be NUMA id (%d)",
++                           p2i(aligned_address), p2i((char*)aligned_address + size_in_bytes), _node_ids[node_index]);
++  }
++  os::numa_make_local((char*)aligned_address, size_in_bytes, _node_ids[node_index]);
++}
++
++uint G1NUMA::max_search_depth() const {
++  // Multiple of 3 is just a random number to limit iterations.
++  // There may be cases where one page consists of multiple HeapRegions.
++ return 3 * MAX2((uint)(page_size() / region_size()), (uint)1) * num_active_nodes(); ++} ++ ++void G1NUMA::update_statistics(G1NUMAStats::NodeDataItems phase, ++ uint requested_node_index, ++ uint allocated_node_index) { ++ if (_stats == NULL) { ++ return; ++ } ++ ++ uint converted_req_index; ++ if(requested_node_index < _num_active_node_ids) { ++ converted_req_index = requested_node_index; ++ } else { ++ assert(requested_node_index == AnyNodeIndex, ++ err_msg("Requested node index %u should be AnyNodeIndex.", requested_node_index)); ++ converted_req_index = _num_active_node_ids; ++ } ++ _stats->update(phase, converted_req_index, allocated_node_index); ++} ++ ++void G1NUMA::copy_statistics(G1NUMAStats::NodeDataItems phase, ++ uint requested_node_index, ++ size_t* allocated_stat) { ++ if (_stats == NULL) { ++ return; ++ } ++ ++ _stats->copy(phase, requested_node_index, allocated_stat); ++} ++ ++void G1NUMA::print_statistics() const { ++ if (_stats == NULL) { ++ return; ++ } ++ ++ _stats->print_statistics(); ++} ++ ++G1NodeIndexCheckClosure::G1NodeIndexCheckClosure(const char* desc, G1NUMA* numa) : ++ _desc(desc), _numa(numa) { ++ ++ uint num_nodes = _numa->num_active_nodes(); ++ _matched = NEW_C_HEAP_ARRAY(uint, num_nodes, mtGC); ++ _mismatched = NEW_C_HEAP_ARRAY(uint, num_nodes, mtGC); ++ _total = NEW_C_HEAP_ARRAY(uint, num_nodes, mtGC); ++ memset(_matched, 0, sizeof(uint) * num_nodes); ++ memset(_mismatched, 0, sizeof(uint) * num_nodes); ++ memset(_total, 0, sizeof(uint) * num_nodes); ++} ++ ++G1NodeIndexCheckClosure::~G1NodeIndexCheckClosure() { ++ if (G1Log::finer()) { ++ gclog_or_tty->print("%s: NUMA region verification (id: matched/mismatched/total): ", _desc); ++ const int* numa_ids = _numa->node_ids(); ++ for (uint i = 0; i < _numa->num_active_nodes(); i++) { ++ gclog_or_tty->print("%d: %u/%u/%u ", numa_ids[i], _matched[i], _mismatched[i], _total[i]); ++ } ++ gclog_or_tty->print_cr(" "); ++ } ++ FREE_C_HEAP_ARRAY(uint, _matched, mtGC); ++ FREE_C_HEAP_ARRAY(uint, _mismatched, mtGC); ++ FREE_C_HEAP_ARRAY(uint, _total, mtGC); ++} ++ ++bool G1NodeIndexCheckClosure::doHeapRegion(HeapRegion* hr) { ++ // Preferred node index will only have valid node index. ++ uint preferred_node_index = _numa->preferred_node_index_for_index(hr->hrm_index()); ++ // Active node index may have UnknownNodeIndex. ++ uint active_node_index = _numa->index_of_address(hr->bottom()); ++ ++ if (preferred_node_index == active_node_index) { ++ _matched[preferred_node_index]++; ++ } else if (active_node_index != G1NUMA::UnknownNodeIndex) { ++ _mismatched[preferred_node_index]++; ++ } ++ _total[preferred_node_index]++; ++ ++ return false; ++} +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1NUMA.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1NUMA.hpp +new file mode 100644 +index 000000000..30a03dd6d +--- /dev/null ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1NUMA.hpp +@@ -0,0 +1,149 @@ ++/* ++ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License
++ * version 2 for more details (a copy is included in the LICENSE file that
++ * accompanied this code).
++ *
++ * You should have received a copy of the GNU General Public License version
++ * 2 along with this work; if not, write to the Free Software Foundation,
++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
++ *
++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
++ * or visit www.oracle.com if you need additional information or have any
++ * questions.
++ *
++ */
++
++#ifndef SHARE_VM_GC_G1_NUMA_HPP
++#define SHARE_VM_GC_G1_NUMA_HPP
++
++#include "gc_implementation/g1/g1NUMAStats.hpp"
++#include "gc_implementation/g1/heapRegion.hpp"
++#include "memory/allocation.hpp"
++#include "runtime/os.hpp"
++
++class G1NUMA: public CHeapObj<mtGC> {
++  // Mapping of available node ids to 0-based index which can be used for
++  // fast resource management. I.e. for every node id provides a unique value in
++  // the range from [0, {# of nodes-1}].
++  // For invalid node id, return UnknownNodeIndex.
++  uint* _node_id_to_index_map;
++  // Length of _node_id_to_index_map.
++  int _len_node_id_to_index_map;
++
++  // Current active node ids.
++  int* _node_ids;
++  // Total number of node ids.
++  uint _num_active_node_ids;
++
++  // HeapRegion size
++  size_t _region_size;
++  // Necessary when touching memory.
++  size_t _page_size;
++
++  // Stores statistic data.
++  G1NUMAStats* _stats;
++
++  size_t region_size() const;
++  size_t page_size() const;
++
++  // Returns node index of the given node id.
++  // Precondition: node_id is an active node id.
++  inline uint index_of_node_id(int node_id) const;
++
++  // Creates node id and node index mapping table of _node_id_to_index_map.
++  void init_node_id_to_index_map(const int* node_ids, uint num_node_ids);
++
++  static G1NUMA* _inst;
++
++  G1NUMA();
++  void initialize(bool use_numa);
++  void initialize_without_numa();
++
++public:
++  static const uint UnknownNodeIndex = UINT_MAX;
++  static const uint AnyNodeIndex = UnknownNodeIndex - 1;
++
++  static G1NUMA* numa() { return _inst; }
++
++  static G1NUMA* create();
++
++  ~G1NUMA();
++
++  // Sets heap region size and page size after those values
++  // are determined at G1CollectedHeap::initialize().
++  void set_region_info(size_t region_size, size_t page_size);
++
++  // Returns active memory node count.
++  uint num_active_nodes() const;
++
++  bool is_enabled() const;
++
++  int numa_id(int index) const;
++
++  // Returns memory node ids
++  const int* node_ids() const;
++
++  // Returns node index of current calling thread.
++  uint index_of_current_thread() const;
++
++  // Returns the preferred index for the given HeapRegion index.
++  // This assumes that HeapRegions are evenly split, so we can decide the preferred index
++  // with the given HeapRegion index.
++  // Result is less than num_active_nodes().
++  uint preferred_node_index_for_index(uint region_index) const;
++
++  // Retrieves node index of the given address.
++  // Result is less than num_active_nodes() or is UnknownNodeIndex.
++  // Precondition: address is in reserved range for heap.
++  uint index_of_address(HeapWord* address) const;
++
++  // If AlwaysPreTouch is enabled, return actual node index via system call.
++  // If disabled, return preferred node index of the given heap region.
++  uint index_for_region(HeapRegion* hr) const;
++
++  // Requests the given memory area to be located at the given node index.
++ void request_memory_on_node(void* aligned_address, size_t size_in_bytes, uint region_index); ++ ++ // Returns maximum search depth which is used to limit heap region search iterations. ++ // The number of active nodes, page size and heap region size are considered. ++ uint max_search_depth() const; ++ ++ // Update the given phase of requested and allocated node index. ++ void update_statistics(G1NUMAStats::NodeDataItems phase, uint requested_node_index, uint allocated_node_index); ++ ++ // Copy all allocated statistics of the given phase and requested node. ++ // Precondition: allocated_stat should have same length of active nodes. ++ void copy_statistics(G1NUMAStats::NodeDataItems phase, uint requested_node_index, size_t* allocated_stat); ++ ++ // Print all statistics. ++ void print_statistics() const; ++}; ++ ++class G1NodeIndexCheckClosure : public HeapRegionClosure { ++ const char* _desc; ++ G1NUMA* _numa; ++ // Records matched count of each node. ++ uint* _matched; ++ // Records mismatched count of each node. ++ uint* _mismatched; ++ // Records total count of each node. ++ // Total = matched + mismatched + unknown. ++ uint* _total; ++ ++public: ++ G1NodeIndexCheckClosure(const char* desc, G1NUMA* numa); ++ ~G1NodeIndexCheckClosure(); ++ ++ bool doHeapRegion(HeapRegion* hr); ++}; ++ ++#endif // SHARE_VM_GC_G1_NUMA_HPP +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1NUMAStats.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1NUMAStats.cpp +new file mode 100644 +index 000000000..cfc3633f8 +--- /dev/null ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1NUMAStats.cpp +@@ -0,0 +1,226 @@ ++/* ++ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. ++ * ++ */ ++ ++#include "precompiled.hpp" ++#include "gc_implementation/g1/g1Log.hpp" ++#include "gc_implementation/g1/g1NUMAStats.hpp" ++#include "memory/allocation.inline.hpp" ++ ++double G1NUMAStats::Stat::rate() const { ++ return _requested == 0 ? 0 : (double)_hit / _requested * 100; ++} ++ ++G1NUMAStats::NodeDataArray::NodeDataArray(uint num_nodes) { ++ guarantee(num_nodes > 1, err_msg("Number of nodes (%u) should be set", num_nodes)); ++ ++ // The row represents the number of nodes. ++ _num_column = num_nodes; ++ // +1 for G1MemoryNodeManager::AnyNodeIndex. 
++ _num_row = num_nodes + 1; ++ ++ _data = NEW_C_HEAP_ARRAY(size_t*, _num_row, mtGC); ++ for (uint row = 0; row < _num_row; row++) { ++ _data[row] = NEW_C_HEAP_ARRAY(size_t, _num_column, mtGC); ++ } ++ ++ clear(); ++} ++ ++G1NUMAStats::NodeDataArray::~NodeDataArray() { ++ for (uint row = 0; row < _num_row; row++) { ++ FREE_C_HEAP_ARRAY(size_t, _data[row], mtGC); ++ } ++ FREE_C_HEAP_ARRAY(size_t*, _data, mtGC); ++} ++ ++void G1NUMAStats::NodeDataArray::create_hit_rate(Stat* result) const { ++ size_t requested = 0; ++ size_t hit = 0; ++ ++ for (size_t row = 0; row < _num_row; row++) { ++ for (size_t column = 0; column < _num_column; column++) { ++ requested += _data[row][column]; ++ if (row == column) { ++ hit += _data[row][column]; ++ } ++ } ++ } ++ ++ assert(result != NULL, "Invariant"); ++ result->_hit = hit; ++ result->_requested = requested; ++} ++ ++void G1NUMAStats::NodeDataArray::create_hit_rate(Stat* result, uint req_index) const { ++ size_t requested = 0; ++ size_t hit = _data[req_index][req_index]; ++ ++ for (size_t column = 0; column < _num_column; column++) { ++ requested += _data[req_index][column]; ++ } ++ ++ assert(result != NULL, "Invariant"); ++ result->_hit = hit; ++ result->_requested = requested; ++} ++ ++size_t G1NUMAStats::NodeDataArray::sum(uint req_index) const { ++ size_t sum = 0; ++ for (size_t column = 0; column < _num_column; column++) { ++ sum += _data[req_index][column]; ++ } ++ ++ return sum; ++} ++ ++void G1NUMAStats::NodeDataArray::increase(uint req_index, uint alloc_index) { ++ assert(req_index < _num_row, ++ err_msg("Requested index %u should be less than the row size %u", ++ req_index, _num_row)); ++ assert(alloc_index < _num_column, ++ err_msg("Allocated index %u should be less than the column size %u", ++ alloc_index, _num_column)); ++ _data[req_index][alloc_index] += 1; ++} ++ ++void G1NUMAStats::NodeDataArray::clear() { ++ for (uint row = 0; row < _num_row; row++) { ++ memset((void*)_data[row], 0, sizeof(size_t) * _num_column); ++ } ++} ++ ++size_t G1NUMAStats::NodeDataArray::get(uint req_index, uint alloc_index) { ++ return _data[req_index][alloc_index]; ++} ++ ++void G1NUMAStats::NodeDataArray::copy(uint req_index, size_t* stat) { ++ assert(stat != NULL, "Invariant"); ++ ++ for (uint column = 0; column < _num_column; column++) { ++ _data[req_index][column] += stat[column]; ++ } ++} ++ ++G1NUMAStats::G1NUMAStats(const int* node_ids, uint num_node_ids) : ++ _node_ids(node_ids), _num_node_ids(num_node_ids), _node_data() { ++ ++ assert(_num_node_ids > 1, err_msg("Should have more than one active memory nodes %u", _num_node_ids)); ++ ++ for (int i = 0; i < NodeDataItemsSentinel; i++) { ++ _node_data[i] = new NodeDataArray(_num_node_ids); ++ } ++} ++ ++G1NUMAStats::~G1NUMAStats() { ++ for (int i = 0; i < NodeDataItemsSentinel; i++) { ++ delete _node_data[i]; ++ } ++} ++ ++void G1NUMAStats::clear(G1NUMAStats::NodeDataItems phase) { ++ _node_data[phase]->clear(); ++} ++ ++void G1NUMAStats::update(G1NUMAStats::NodeDataItems phase, ++ uint requested_node_index, ++ uint allocated_node_index) { ++ _node_data[phase]->increase(requested_node_index, allocated_node_index); ++} ++ ++void G1NUMAStats::copy(G1NUMAStats::NodeDataItems phase, ++ uint requested_node_index, ++ size_t* allocated_stat) { ++ _node_data[phase]->copy(requested_node_index, allocated_stat); ++} ++ ++static const char* phase_to_explanatory_string(G1NUMAStats::NodeDataItems phase) { ++ switch(phase) { ++ case G1NUMAStats::NewRegionAlloc: ++ return "Placement match ratio"; ++ case 
G1NUMAStats::LocalObjProcessAtCopyToSurv: ++ return "Worker task locality match ratio"; ++ default: ++ return ""; ++ } ++} ++ ++#define RATE_TOTAL_FORMAT "%0.0f%% " SIZE_FORMAT "/" SIZE_FORMAT ++ ++void G1NUMAStats::print_info(G1NUMAStats::NodeDataItems phase) { ++ if (G1Log::finer()) { ++ Stat result; ++ size_t array_width = _num_node_ids; ++ ++ _node_data[phase]->create_hit_rate(&result); ++ gclog_or_tty->print("%s: " RATE_TOTAL_FORMAT " (", ++ phase_to_explanatory_string(phase), result.rate(), result._hit, result._requested); ++ ++ for (uint i = 0; i < array_width; i++) { ++ if (i != 0) { ++ gclog_or_tty->print(", "); ++ } ++ _node_data[phase]->create_hit_rate(&result, i); ++ gclog_or_tty->print("%d: " RATE_TOTAL_FORMAT, ++ _node_ids[i], result.rate(), result._hit, result._requested); ++ } ++ gclog_or_tty->print_cr(")"); ++ } ++} ++ ++void G1NUMAStats::print_mutator_alloc_stat_debug() { ++ uint array_width = _num_node_ids; ++ ++ if (G1Log::finer()) { ++ gclog_or_tty->print("Allocated NUMA ids "); ++ for (uint i = 0; i < array_width; i++) { ++ gclog_or_tty->print("%8d", _node_ids[i]); ++ } ++ gclog_or_tty->print_cr(" Total"); ++ ++ gclog_or_tty->print("Requested NUMA id "); ++ for (uint req = 0; req < array_width; req++) { ++ gclog_or_tty->print("%3d ", _node_ids[req]); ++ for (uint alloc = 0; alloc < array_width; alloc++) { ++ gclog_or_tty->print(SIZE_FORMAT_W(8), _node_data[NewRegionAlloc]->get(req, alloc)); ++ } ++ gclog_or_tty->print(SIZE_FORMAT_W(8), _node_data[NewRegionAlloc]->sum(req)); ++ gclog_or_tty->print_cr(" "); ++ // Add padding to align with the string 'Requested NUMA id'. ++ gclog_or_tty->print(" "); ++ } ++ gclog_or_tty->print("Any "); ++ for (uint alloc = 0; alloc < array_width; alloc++) { ++ gclog_or_tty->print(SIZE_FORMAT_W(8), _node_data[NewRegionAlloc]->get(array_width, alloc)); ++ } ++ gclog_or_tty->print(SIZE_FORMAT_W(8), _node_data[NewRegionAlloc]->sum(array_width)); ++ gclog_or_tty->print_cr(" "); ++ } ++} ++ ++void G1NUMAStats::print_statistics() { ++ print_info(NewRegionAlloc); ++ print_mutator_alloc_stat_debug(); ++ ++ print_info(LocalObjProcessAtCopyToSurv); ++} +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1NUMAStats.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1NUMAStats.hpp +new file mode 100644 +index 000000000..fba9442c8 +--- /dev/null ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1NUMAStats.hpp +@@ -0,0 +1,119 @@ ++/* ++ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. 
++ * ++ */ ++ ++#ifndef SHARE_VM_GC_G1_NODE_TIMES_HPP ++#define SHARE_VM_GC_G1_NODE_TIMES_HPP ++ ++#include "memory/allocation.hpp" ++ ++// Manages statistics of multi nodes. ++class G1NUMAStats : public CHeapObj { ++ struct Stat { ++ // Hit count: if requested id equals to returned id. ++ size_t _hit; ++ // Total request count ++ size_t _requested; ++ ++ // Hit count / total request count ++ double rate() const; ++ }; ++ ++ // Holds data array which has a size of (node count) * (node count + 1) to ++ // represent request node * allocated node. The request node includes any node case. ++ // All operations are NOT thread-safe. ++ // The row index indicates a requested node index while the column node index ++ // indicates an allocated node index. The last row is for any node index request. ++ // E.g. (req, alloc) = (0,0) (1,0) (2,0) (0,1) (Any, 3) (0,2) (0,3) (0,3) (3,3) ++ // Allocated node index 0 1 2 3 Total ++ // Requested node index 0 1 1 1 2 5 ++ // 1 1 0 0 0 1 ++ // 2 1 0 0 0 1 ++ // 3 0 0 0 1 1 ++ // Any 0 0 0 1 1 ++ class NodeDataArray : public CHeapObj { ++ // The number of nodes. ++ uint _num_column; ++ // The number of nodes + 1 (for any node request) ++ uint _num_row; ++ // 2-dimension array that holds count of allocated / requested node index. ++ size_t** _data; ++ ++ public: ++ NodeDataArray(uint num_nodes); ++ ~NodeDataArray(); ++ ++ // Create Stat result of hit count, requested count and hit rate. ++ // The result is copied to the given result parameter. ++ void create_hit_rate(Stat* result) const; ++ // Create Stat result of hit count, requested count and hit rate of the given index. ++ // The result is copied to the given result parameter. ++ void create_hit_rate(Stat* result, uint req_index) const; ++ // Return sum of the given index. ++ size_t sum(uint req_index) const; ++ // Increase at the request / allocated index. ++ void increase(uint req_index, uint alloc_index); ++ // Clear all data. ++ void clear(); ++ // Return current value of the given request / allocated index. ++ size_t get(uint req_index, uint alloc_index); ++ // Copy values of the given request index. ++ void copy(uint req_index, size_t* stat); ++ }; ++ ++public: ++ enum NodeDataItems { ++ // Statistics of a new region allocation. ++ NewRegionAlloc, ++ // Statistics of object processing during copy to survivor region. ++ LocalObjProcessAtCopyToSurv, ++ NodeDataItemsSentinel ++ }; ++ ++private: ++ const int* _node_ids; ++ uint _num_node_ids; ++ ++ NodeDataArray* _node_data[NodeDataItemsSentinel]; ++ ++ void print_info(G1NUMAStats::NodeDataItems phase); ++ ++ void print_mutator_alloc_stat_debug(); ++ ++public: ++ G1NUMAStats(const int* node_ids, uint num_node_ids); ++ ~G1NUMAStats(); ++ ++ void clear(G1NUMAStats::NodeDataItems phase); ++ ++ // Update the given phase of requested and allocated node index. ++ void update(G1NUMAStats::NodeDataItems phase, uint requested_node_index, uint allocated_node_index); ++ ++ // Copy all allocated statistics of the given phase and requested node. ++ // Precondition: allocated_stat should have same length of active nodes. 
++ void copy(G1NUMAStats::NodeDataItems phase, uint requested_node_index, size_t* allocated_stat); ++ ++ void print_statistics(); ++}; ++ ++#endif // SHARE_VM_GC_G1_NODE_TIMES_HPP +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1PageBasedVirtualSpace.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1PageBasedVirtualSpace.cpp +index 075217d60..7bc84bfe8 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1PageBasedVirtualSpace.cpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1PageBasedVirtualSpace.cpp +@@ -135,6 +135,11 @@ char* G1PageBasedVirtualSpace::page_start(size_t index) const { + return _low_boundary + index * _page_size; + } + ++size_t G1PageBasedVirtualSpace::page_size() const { ++ assert(_page_size > 0, "Page size is not yet initialized."); ++ return _page_size; ++} ++ + bool G1PageBasedVirtualSpace::is_after_last_page(size_t index) const { + guarantee(index <= _committed.size(), + err_msg("Given boundary page " SIZE_FORMAT " is beyond managed page count " SIZE_FORMAT, index, _committed.size())); +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1PageBasedVirtualSpace.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1PageBasedVirtualSpace.hpp +index 4d0b7b21b..f171bfcf1 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1PageBasedVirtualSpace.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1PageBasedVirtualSpace.hpp +@@ -90,8 +90,6 @@ class G1PageBasedVirtualSpace VALUE_OBJ_CLASS_SPEC { + + // Returns the index of the page which contains the given address. + uintptr_t addr_to_page_index(char* addr) const; +- // Returns the address of the given page index. +- char* page_start(size_t index) const; + + // Is the given page index the last page? + bool is_last_page(size_t index) const { return index == (_committed.size() - 1); } +@@ -143,6 +141,10 @@ class G1PageBasedVirtualSpace VALUE_OBJ_CLASS_SPEC { + + void check_for_contiguity() PRODUCT_RETURN; + ++ // Returns the address of the given page index. ++ char* page_start(size_t index) const; ++ size_t page_size() const; ++ + // Debugging + void print_on(outputStream* out) PRODUCT_RETURN; + void print(); +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.cpp +index 394f20e82..a095abaf6 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.cpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.cpp +@@ -40,6 +40,8 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint queue_num, + _term_attempts(0), + _tenuring_threshold(g1h->g1_policy()->tenuring_threshold()), + _age_table(false), _scanner(g1h, rp), ++ _numa(g1h->numa()), ++ _obj_alloc_stat(NULL), + _strong_roots_time(0), _term_time(0) { + _scanner.set_par_scan_thread_state(this); + // we allocate G1YoungSurvRateNumRegions plus one entries, since +@@ -60,19 +62,20 @@ G1ParScanThreadState::G1ParScanThreadState(G1CollectedHeap* g1h, uint queue_num, + + _g1_par_allocator = G1ParGCAllocator::create_allocator(_g1h); + +- _dest[InCSetState::NotInCSet] = InCSetState::NotInCSet; + // The dest for Young is used when the objects are aged enough to + // need to be moved to the next space. 
+ _dest[InCSetState::Young] = InCSetState::Old; + _dest[InCSetState::Old] = InCSetState::Old; + + _start = os::elapsedTime(); ++ initialize_numa_stats(); + } + + G1ParScanThreadState::~G1ParScanThreadState() { + _g1_par_allocator->retire_alloc_buffers(); + delete _g1_par_allocator; + FREE_C_HEAP_ARRAY(size_t, _surviving_young_words_base, mtGC); ++ FREE_C_HEAP_ARRAY(size_t, _obj_alloc_stat, mtGC); + } + + void +@@ -162,7 +165,8 @@ void G1ParScanThreadState::trim_queue() { + HeapWord* G1ParScanThreadState::allocate_in_next_plab(InCSetState const state, + InCSetState* dest, + size_t word_sz, +- AllocationContext_t const context) { ++ AllocationContext_t const context, ++ uint node_index) { + assert(state.is_in_cset_or_humongous(), err_msg("Unexpected state: " CSETSTATE_FORMAT, state.value())); + assert(dest->is_in_cset_or_humongous(), err_msg("Unexpected dest: " CSETSTATE_FORMAT, dest->value())); + +@@ -170,7 +174,7 @@ HeapWord* G1ParScanThreadState::allocate_in_next_plab(InCSetState const state, + // let's keep the logic here simple. We can generalize it when necessary. + if (dest->is_young()) { + HeapWord* const obj_ptr = _g1_par_allocator->allocate(InCSetState::Old, +- word_sz, context); ++ word_sz, context, node_index); + if (obj_ptr == NULL) { + return NULL; + } +@@ -190,8 +194,8 @@ HeapWord* G1ParScanThreadState::allocate_in_next_plab(InCSetState const state, + void G1ParScanThreadState::report_promotion_event(InCSetState const dest_state, + oop const old, size_t word_sz, uint age, + HeapWord * const obj_ptr, +- AllocationContext_t context) const { +- ParGCAllocBuffer* alloc_buf = _g1_par_allocator->alloc_buffer(dest_state, context); ++ AllocationContext_t context, uint node_index) const { ++ ParGCAllocBuffer* alloc_buf = _g1_par_allocator->alloc_buffer(dest_state, context, node_index); + if (alloc_buf->contains(obj_ptr)) { + _g1h->_gc_tracer_stw->report_promotion_in_new_plab_event(old->klass(), word_sz, age, + dest_state.value() == InCSetState::Old, +@@ -226,23 +230,25 @@ oop G1ParScanThreadState::copy_to_survivor_space(InCSetState const state, + + uint age = 0; + InCSetState dest_state = next_state(state, old_mark, age); +- HeapWord* obj_ptr = _g1_par_allocator->plab_allocate(dest_state, word_sz, context); ++ uint node_index = from_region->node_index(); ++ HeapWord* obj_ptr = _g1_par_allocator->plab_allocate(dest_state, word_sz, context, node_index); + + // PLAB allocations should succeed most of the time, so we'll + // normally check against NULL once and that's it. + if (obj_ptr == NULL) { +- obj_ptr = _g1_par_allocator->allocate_direct_or_new_plab(dest_state, word_sz, context); ++ obj_ptr = _g1_par_allocator->allocate_direct_or_new_plab(dest_state, word_sz, context, node_index); + if (obj_ptr == NULL) { +- obj_ptr = allocate_in_next_plab(state, &dest_state, word_sz, context); ++ obj_ptr = allocate_in_next_plab(state, &dest_state, word_sz, context, node_index); + if (obj_ptr == NULL) { + // This will either forward-to-self, or detect that someone else has + // installed a forwarding pointer. 
+ return _g1h->handle_evacuation_failure_par(this, old); + } + } ++ update_numa_stats(node_index); + if (_g1h->_gc_tracer_stw->should_report_promotion_events()) { + // The events are checked individually as part of the actual commit +- report_promotion_event(dest_state, old, word_sz, age, obj_ptr, context); ++ report_promotion_event(dest_state, old, word_sz, age, obj_ptr, context, node_index); + } + } + +@@ -252,7 +258,7 @@ oop G1ParScanThreadState::copy_to_survivor_space(InCSetState const state, + if (_g1h->evacuation_should_fail()) { + // Doing this after all the allocation attempts also tests the + // undo_allocation() method too. +- _g1_par_allocator->undo_allocation(dest_state, obj_ptr, word_sz, context); ++ _g1_par_allocator->undo_allocation(dest_state, obj_ptr, word_sz, context, node_index); + return _g1h->handle_evacuation_failure_par(this, old); + } + #endif // !PRODUCT +@@ -314,7 +320,49 @@ oop G1ParScanThreadState::copy_to_survivor_space(InCSetState const state, + } + return obj; + } else { +- _g1_par_allocator->undo_allocation(dest_state, obj_ptr, word_sz, context); ++ _g1_par_allocator->undo_allocation(dest_state, obj_ptr, word_sz, context, node_index); + return forward_ptr; + } + } ++ ++G1ParScanThreadState* G1ParScanThreadStateSet::state_for_worker(uint worker_id, ReferenceProcessor* rp) { ++ assert(worker_id < _n_workers, "out of bounds access"); ++ if (_states[worker_id] == NULL) { ++ _states[worker_id] = ++ new G1ParScanThreadState(_g1h, worker_id, rp); ++ } ++ return _states[worker_id]; ++} ++ ++void G1ParScanThreadStateSet::flush() { ++ assert(!_flushed, "thread local state from the per thread states should be flushed once"); ++ ++ for (uint worker_index = 0; worker_index < _n_workers; ++worker_index) { ++ G1ParScanThreadState* pss = _states[worker_index]; ++ ++ if (pss == NULL) { ++ continue; ++ } ++ ++ pss->flush_numa_stats(); ++ delete pss; ++ _states[worker_index] = NULL; ++ } ++ _flushed = true; ++} ++ ++G1ParScanThreadStateSet::G1ParScanThreadStateSet(G1CollectedHeap* g1h, ++ uint n_workers) : ++ _g1h(g1h), ++ _states(NEW_C_HEAP_ARRAY(G1ParScanThreadState*, n_workers, mtGC)), ++ _n_workers(n_workers), ++ _flushed(false) { ++ for (uint i = 0; i < n_workers; ++i) { ++ _states[i] = NULL; ++ } ++} ++ ++G1ParScanThreadStateSet::~G1ParScanThreadStateSet() { ++ assert(_flushed, "thread local state from the per thread states should have been flushed"); ++ FREE_C_HEAP_ARRAY(G1ParScanThreadState*, _states, mtGC); ++} +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.hpp +index 990b71d31..60c00b178 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.hpp +@@ -38,7 +38,7 @@ + class HeapRegion; + class outputStream; + +-class G1ParScanThreadState : public StackObj { ++class G1ParScanThreadState : public CHeapObj { + private: + G1CollectedHeap* _g1h; + RefToScanQueue* _refs; +@@ -91,6 +91,13 @@ class G1ParScanThreadState : public StackObj { + return _dest[original.value()]; + } + ++ G1NUMA* _numa; ++ ++ // Records how many object allocations happened at each node during copy to survivor. ++ // Only starts recording when log of gc+heap+numa is enabled and its data is ++ // transferred when flushed. 
++ size_t* _obj_alloc_stat; ++ + public: + G1ParScanThreadState(G1CollectedHeap* g1h, uint queue_num, ReferenceProcessor* rp); + ~G1ParScanThreadState(); +@@ -208,13 +215,19 @@ class G1ParScanThreadState : public StackObj { + HeapWord* allocate_in_next_plab(InCSetState const state, + InCSetState* dest, + size_t word_sz, +- AllocationContext_t const context); ++ AllocationContext_t const context, ++ uint node_index); + + void report_promotion_event(InCSetState const dest_state, + oop const old, size_t word_sz, uint age, +- HeapWord * const obj_ptr, AllocationContext_t context) const; ++ HeapWord * const obj_ptr, AllocationContext_t context, uint node_index) const; + + inline InCSetState next_state(InCSetState const state, markOop const m, uint& age); ++ ++ // NUMA statistics related methods. ++ inline void initialize_numa_stats(); ++ inline void update_numa_stats(uint node_index); ++ + public: + + oop copy_to_survivor_space(InCSetState const state, oop const obj, markOop const old_mark); +@@ -222,6 +235,22 @@ class G1ParScanThreadState : public StackObj { + void trim_queue(); + + inline void steal_and_trim_queue(RefToScanQueueSet *task_queues); ++ inline void flush_numa_stats(); ++}; ++ ++class G1ParScanThreadStateSet : public StackObj { ++ G1CollectedHeap* _g1h; ++ G1ParScanThreadState** _states; ++ uint _n_workers; ++ bool _flushed; ++ ++ public: ++ G1ParScanThreadStateSet(G1CollectedHeap* g1h, ++ uint n_workers); ++ ~G1ParScanThreadStateSet(); ++ ++ void flush(); ++ G1ParScanThreadState* state_for_worker(uint worker_id, ReferenceProcessor* rp); + }; + + #endif // SHARE_VM_GC_IMPLEMENTATION_G1_G1PARSCANTHREADSTATE_HPP +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.inline.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.inline.hpp +index 7dedb1517..b3dc22b30 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.inline.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1ParScanThreadState.inline.hpp +@@ -142,5 +142,27 @@ void G1ParScanThreadState::steal_and_trim_queue(RefToScanQueueSet *task_queues) + } + } + ++void G1ParScanThreadState::initialize_numa_stats() { ++ if (_numa->is_enabled()) { ++ uint num_nodes = _numa->num_active_nodes(); ++ // Record only if there are multiple active nodes. 
++ _obj_alloc_stat = NEW_C_HEAP_ARRAY(size_t, num_nodes, mtGC); ++ memset(_obj_alloc_stat, 0, sizeof(size_t) * num_nodes); ++ } ++} ++ ++void G1ParScanThreadState::flush_numa_stats() { ++ if (_obj_alloc_stat != NULL) { ++ uint node_index = _numa->index_of_current_thread(); ++ _numa->copy_statistics(G1NUMAStats::LocalObjProcessAtCopyToSurv, node_index, _obj_alloc_stat); ++ } ++} ++ ++void G1ParScanThreadState::update_numa_stats(uint node_index) { ++ if (_obj_alloc_stat != NULL) { ++ _obj_alloc_stat[node_index]++; ++ } ++} ++ + #endif /* SHARE_VM_GC_IMPLEMENTATION_G1_G1PARSCANTHREADSTATE_INLINE_HPP */ + +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1RegionToSpaceMapper.cpp b/hotspot/src/share/vm/gc_implementation/g1/g1RegionToSpaceMapper.cpp +index f07c27107..27ea0d7a1 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1RegionToSpaceMapper.cpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1RegionToSpaceMapper.cpp +@@ -24,6 +24,7 @@ + + #include "precompiled.hpp" + #include "gc_implementation/g1/g1BiasedArray.hpp" ++#include "gc_implementation/g1/g1NUMA.hpp" + #include "gc_implementation/g1/g1RegionToSpaceMapper.hpp" + #include "memory/allocation.inline.hpp" + #include "runtime/mutex.hpp" +@@ -40,6 +41,7 @@ G1RegionToSpaceMapper::G1RegionToSpaceMapper(ReservedSpace rs, + _storage(rs, used_size, page_size), + _region_granularity(region_granularity), + _listener(NULL), ++ _memory_type(type), + _commit_map() { + guarantee(is_power_of_2(page_size), "must be"); + guarantee(is_power_of_2(region_granularity), "must be"); +@@ -71,6 +73,14 @@ class G1RegionsLargerThanCommitSizeMapper : public G1RegionToSpaceMapper { + virtual void commit_regions(uint start_idx, size_t num_regions) { + bool zero_filled = _storage.commit((size_t)start_idx * _pages_per_region, num_regions * _pages_per_region); + _commit_map.par_set_range(start_idx, start_idx + num_regions, BitMap::unknown_range); ++ if (_memory_type == mtJavaHeap) { ++ for (uint region_index = start_idx; region_index < start_idx + num_regions; region_index++ ) { ++ void* address = _storage.page_start(region_index * _pages_per_region); ++ size_t size_in_bytes = _storage.page_size() * _pages_per_region; ++ G1NUMA::numa()->request_memory_on_node(address, size_in_bytes, region_index); ++ } ++ } ++ + fire_on_commit(start_idx, num_regions, zero_filled); + } + +@@ -106,7 +116,7 @@ class G1RegionsSmallerThanCommitSizeMapper : public G1RegionToSpaceMapper { + size_t commit_factor, + MemoryType type) : + G1RegionToSpaceMapper(rs, actual_size, page_size, alloc_granularity, type), +- _par_lock(Mutex::leaf, "G1RegionsSmallerThanCommitSizeMapper par lock"), ++ _par_lock(Mutex::leaf, "G1RegionsSmallerThanCommitSizeMapper par lock", true), + _regions_per_page((page_size * commit_factor) / alloc_granularity), _refcounts() { + + guarantee((page_size * commit_factor) >= alloc_granularity, "allocation granularity smaller than commit granularity"); +@@ -123,6 +133,11 @@ class G1RegionsSmallerThanCommitSizeMapper : public G1RegionToSpaceMapper { + bool zero_filled = false; + if (old_refcount == 0) { + zero_filled = _storage.commit(idx, 1); ++ if (_memory_type == mtJavaHeap) { ++ void* address = _storage.page_start(idx); ++ size_t size_in_bytes = _storage.page_size(); ++ G1NUMA::numa()->request_memory_on_node(address, size_in_bytes, i); ++ } + } + _refcounts.set_by_index(idx, old_refcount + 1); + _commit_map.set_bit(i); +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1RegionToSpaceMapper.hpp 
b/hotspot/src/share/vm/gc_implementation/g1/g1RegionToSpaceMapper.hpp +index 6623a37f9..6eee4d309 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1RegionToSpaceMapper.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1RegionToSpaceMapper.hpp +@@ -51,6 +51,8 @@ class G1RegionToSpaceMapper : public CHeapObj { + // Mapping management + BitMap _commit_map; + ++ MemoryType _memory_type; ++ + G1RegionToSpaceMapper(ReservedSpace rs, size_t used_size, size_t page_size, size_t region_granularity, MemoryType type); + + void fire_on_commit(uint start_idx, size_t num_regions, bool zero_filled); +diff --git a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp +index facd28948..131cdeacd 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp +@@ -26,6 +26,7 @@ + #include "code/nmethod.hpp" + #include "gc_implementation/g1/g1BlockOffsetTable.inline.hpp" + #include "gc_implementation/g1/g1CollectedHeap.inline.hpp" ++#include "gc_implementation/g1/g1NUMA.hpp" + #include "gc_implementation/g1/g1OopClosures.inline.hpp" + #include "gc_implementation/g1/heapRegion.inline.hpp" + #include "gc_implementation/g1/heapRegionBounds.inline.hpp" +@@ -313,7 +314,7 @@ HeapRegion::HeapRegion(uint hrm_index, + _in_uncommit_list(false), + _young_index_in_cset(-1), _surv_rate_group(NULL), _age_index(-1), + _rem_set(NULL), _recorded_rs_length(0), _predicted_elapsed_time_ms(0), +- _predicted_bytes_to_copy(0) ++ _predicted_bytes_to_copy(0), _node_index(G1NUMA::UnknownNodeIndex) + { + _rem_set = new HeapRegionRemSet(sharedOffsetArray, this); + assert(HeapRegionRemSet::num_par_rem_sets() > 0, "Invariant."); +@@ -704,6 +705,15 @@ void HeapRegion::print_on(outputStream* st) const { + st->print(" TS %5d", _gc_time_stamp); + st->print(" PTAMS " PTR_FORMAT " NTAMS " PTR_FORMAT, + prev_top_at_mark_start(), next_top_at_mark_start()); ++ if (UseNUMA) { ++ G1NUMA* numa = G1NUMA::numa(); ++ if (node_index() < numa->num_active_nodes()) { ++ st->print("|%d", numa->numa_id(node_index())); ++ } else { ++ st->print("|-"); ++ } ++ } ++ st->print_cr(" "); + G1OffsetTableContigSpace::print_on(st); + } + +diff --git a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.hpp b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.hpp +index 656d605ef..bc9527a87 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.hpp +@@ -295,7 +295,7 @@ class HeapRegion: public G1OffsetTableContigSpace { + // The RSet length that was added to the total value + // for the collection set. + size_t _recorded_rs_length; +- ++ uint _node_index; + // The predicted elapsed time that was added to total value + // for the collection set. + double _predicted_elapsed_time_ms; +@@ -768,6 +768,9 @@ class HeapRegion: public G1OffsetTableContigSpace { + // the strong code roots list for this region + void strong_code_roots_do(CodeBlobClosure* blk) const; + ++ uint node_index() const { return _node_index; } ++ void set_node_index(uint node_index) { _node_index = node_index; } ++ + // Verify that the entries on the strong code root list for this + // region are live and include at least one pointer into this region. 
+ void verify_strong_code_roots(VerifyOption vo, bool* failures) const; +diff --git a/hotspot/src/share/vm/gc_implementation/g1/heapRegionManager.cpp b/hotspot/src/share/vm/gc_implementation/g1/heapRegionManager.cpp +index 842550d21..6ad85596d 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegionManager.cpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegionManager.cpp +@@ -27,6 +27,7 @@ + #include "gc_implementation/g1/heapRegionManager.inline.hpp" + #include "gc_implementation/g1/heapRegionSet.inline.hpp" + #include "gc_implementation/g1/g1CollectedHeap.inline.hpp" ++#include "gc_implementation/g1/g1NUMA.hpp" + #include "gc_implementation/g1/concurrentG1Refine.hpp" + #include "memory/allocation.hpp" + +@@ -72,6 +73,34 @@ bool HeapRegionManager::can_expand(uint region) const { + return !_available_map.at(region); + } + ++HeapRegion* HeapRegionManager::allocate_free_region(bool is_old, uint requested_node_index) { ++ HeapRegion* hr = NULL; ++ bool from_head = is_old; ++ G1NUMA* numa = G1NUMA::numa(); ++ ++ if (requested_node_index != G1NUMA::AnyNodeIndex && numa->is_enabled()) { ++ // Try to allocate with requested node index. ++ hr = _free_list.remove_region_with_node_index(from_head, requested_node_index); ++ } ++ ++ if (hr == NULL) { ++ // If there's a single active node or we did not get a region from our requested node, ++ // try without requested node index. ++ hr = _free_list.remove_region(from_head); ++ } ++ ++ if (hr != NULL) { ++ assert(hr->next() == NULL, "Single region should not have next"); ++ assert(is_available(hr->hrm_index()), "Must be committed"); ++ ++ if (numa->is_enabled() && hr->node_index() < numa->num_active_nodes()) { ++ numa->update_statistics(G1NUMAStats::NewRegionAlloc, requested_node_index, hr->node_index()); ++ } ++ } ++ ++ return hr; ++} ++ + #ifdef ASSERT + bool HeapRegionManager::is_free(HeapRegion* hr) const { + return _free_list.contains(hr); +@@ -107,6 +136,10 @@ void HeapRegionManager::commit_regions(uint index, size_t num_regions) { + void HeapRegionManager::uncommit_regions(uint start, size_t num_regions) { + guarantee(num_regions >= 1, err_msg("Need to specify at least one region to uncommit, tried to uncommit zero regions at %u", start)); + guarantee(_num_committed >= num_regions, "pre-condition"); ++ // Reset node index to distinguish with committed regions. ++ for (uint i = start; i < start + num_regions; i++) { ++ at(i)->set_node_index(G1NUMA::UnknownNodeIndex); ++ } + + // Print before uncommitting. + if (G1CollectedHeap::heap()->hr_printer()->is_active()) { +@@ -155,6 +188,7 @@ void HeapRegionManager::make_regions_available(uint start, uint num_regions) { + MemRegion mr(bottom, bottom + HeapRegion::GrainWords); + + hr->initialize(mr); ++ hr->set_node_index(G1NUMA::numa()->index_for_region(hr)); + insert_into_free_list(at(i)); + } + } +@@ -204,6 +238,35 @@ uint HeapRegionManager::expand_at(uint start, uint num_regions) { + return expanded; + } + ++uint HeapRegionManager::expand_on_preferred_node(uint preferred_index) { ++ uint expand_candidate = UINT_MAX; ++ for (uint i = 0; i < max_length(); i++) { ++ if (is_available(i)) { ++ // Already in use continue ++ continue; ++ } ++ // Always save the candidate so we can expand later on. ++ expand_candidate = i; ++ if (is_on_preferred_index(expand_candidate, preferred_index)) { ++ // We have found a candidate on the preffered node, break. ++ break; ++ } ++ } ++ ++ if (expand_candidate == UINT_MAX) { ++ // No regions left, expand failed. 
++ return 0; ++ } ++ ++ make_regions_available(expand_candidate, 1); ++ return 1; ++} ++ ++bool HeapRegionManager::is_on_preferred_index(uint region_index, uint preferred_node_index) { ++ uint region_node_index = G1NUMA::numa()->preferred_node_index_for_index(region_index); ++ return region_node_index == preferred_node_index; ++} ++ + uint HeapRegionManager::find_contiguous(size_t num, bool empty_only) { + uint found = 0; + size_t length_found = 0; +diff --git a/hotspot/src/share/vm/gc_implementation/g1/heapRegionManager.hpp b/hotspot/src/share/vm/gc_implementation/g1/heapRegionManager.hpp +index 715122181..a06fa4f56 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegionManager.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegionManager.hpp +@@ -120,6 +120,8 @@ class HeapRegionManager: public CHeapObj { + // the heap. Returns the length of the sequence found. If this value is zero, no + // sequence could be found, otherwise res_idx contains the start index of this range. + uint find_empty_from_idx_reverse(uint start_idx, uint* res_idx) const; ++ // Checks the G1MemoryNodeManager to see if this region is on the preferred node. ++ bool is_on_preferred_index(uint region_index, uint preferred_node_index); + // Allocate a new HeapRegion for the given index. + HeapRegion* new_heap_region(uint hrm_index); + #ifdef ASSERT +@@ -175,15 +177,7 @@ public: + _free_list.add_ordered(list); + } + +- HeapRegion* allocate_free_region(bool is_old) { +- HeapRegion* hr = _free_list.remove_region(is_old); +- +- if (hr != NULL) { +- assert(hr->next() == NULL, "Single region should not have next"); +- assert(is_available(hr->hrm_index()), "Must be committed"); +- } +- return hr; +- } ++ virtual HeapRegion* allocate_free_region(bool is_old, uint requested_node_index); + + inline void allocate_free_regions_starting_at(uint first, uint num_regions); + +@@ -197,6 +191,10 @@ public: + return _free_list.length(); + } + ++ uint num_free_regions(uint node_index) const { ++ return _free_list.length(node_index); ++ } ++ + size_t total_capacity_bytes() const { + return num_free_regions() * HeapRegion::GrainBytes; + } +@@ -225,6 +223,9 @@ public: + // this. + uint expand_at(uint start, uint num_regions); + ++ // Try to expand on the given node index. ++ virtual uint expand_on_preferred_node(uint node_index); ++ + // Find a contiguous set of empty regions of length num. Returns the start index of + // that set, or G1_NO_HRM_INDEX. + uint find_contiguous_only_empty(size_t num) { return find_contiguous(num, true); } +diff --git a/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.cpp b/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.cpp +index 09d12fd3f..881bab784 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.cpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.cpp +@@ -24,6 +24,7 @@ + + #include "precompiled.hpp" + #include "gc_implementation/g1/g1CollectedHeap.inline.hpp" ++#include "gc_implementation/g1/g1NUMA.hpp" + #include "gc_implementation/g1/heapRegionRemSet.hpp" + #include "gc_implementation/g1/heapRegionSet.inline.hpp" + +@@ -100,6 +101,12 @@ HeapRegionSetBase::HeapRegionSetBase(const char* name, bool humongous, bool free + _count() + { } + ++FreeRegionList::FreeRegionList(const char* name, HRSMtSafeChecker* mt_safety_checker): ++ HeapRegionSetBase(name, false /* humongous */, true /* empty */, mt_safety_checker), ++ _node_info(G1NUMA::numa()->is_enabled() ? 
new NodeInfo() : NULL) { ++ clear(); ++} ++ + void FreeRegionList::set_unrealistically_long_length(uint len) { + guarantee(_unrealistically_long_length == 0, "should only be set once"); + _unrealistically_long_length = len; +@@ -127,6 +134,7 @@ void FreeRegionList::remove_all(bool uncommit) { + OrderAccess::storestore(); + curr->set_uncommit_list(false); + } ++ decrease_length(curr->node_index()); + curr = next; + } + clear(); +@@ -144,6 +152,9 @@ void FreeRegionList::add_ordered(FreeRegionList* from_list) { + if (from_list->is_empty()) { + return; + } ++ if (_node_info != NULL && from_list->_node_info != NULL) { ++ _node_info->add(from_list->_node_info); ++ } + + #ifdef ASSERT + FreeRegionListIterator iter(from_list); +@@ -246,6 +257,7 @@ void FreeRegionList::remove_starting_at(HeapRegion* first, uint num_regions) { + remove(curr); + + count++; ++ decrease_length(curr->node_index()); + curr = next; + } + +@@ -278,6 +290,9 @@ void FreeRegionList::clear() { + _head = NULL; + _tail = NULL; + _last = NULL; ++ if (_node_info!= NULL) { ++ _node_info->clear(); ++ } + } + + void FreeRegionList::print_on(outputStream* out, bool print_contents) { +@@ -454,6 +469,29 @@ void HumongousRegionSetMtSafeChecker::check() { + } + } + ++FreeRegionList::NodeInfo::NodeInfo() : _numa(G1NUMA::numa()), _length_of_node(NULL), ++ _num_nodes(_numa->num_active_nodes()) { ++ assert(UseNUMA, "Invariant"); ++ ++ _length_of_node = NEW_C_HEAP_ARRAY(uint, _num_nodes, mtGC); ++} ++ ++FreeRegionList::NodeInfo::~NodeInfo() { ++ FREE_C_HEAP_ARRAY(uint, _length_of_node, mtGC); ++} ++ ++void FreeRegionList::NodeInfo::clear() { ++ for (uint i = 0; i < _num_nodes; ++i) { ++ _length_of_node[i] = 0; ++ } ++} ++ ++void FreeRegionList::NodeInfo::add(NodeInfo* info) { ++ for (uint i = 0; i < _num_nodes; ++i) { ++ _length_of_node[i] += info->_length_of_node[i]; ++ } ++} ++ + void FreeRegionList_test() { + FreeRegionList l("test"); + +diff --git a/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.hpp b/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.hpp +index ede3136d5..42f0bd4d0 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.hpp +@@ -197,22 +197,45 @@ public: + // add / remove one region at a time or concatenate two lists. + + class FreeRegionListIterator; ++class G1NUMA; + + class FreeRegionList : public HeapRegionSetBase { + friend class FreeRegionListIterator; + + private: ++ // This class is only initialized if there are multiple active nodes. ++ class NodeInfo : public CHeapObj { ++ G1NUMA* _numa; ++ uint* _length_of_node; ++ uint _num_nodes; ++ ++ public: ++ NodeInfo(); ++ ~NodeInfo(); ++ ++ inline void increase_length(uint node_index); ++ inline void decrease_length(uint node_index); ++ ++ inline uint length(uint index) const; ++ ++ void clear(); ++ ++ void add(NodeInfo* info); ++ }; ++ + HeapRegion* _head; + HeapRegion* _tail; + + // _last is used to keep track of where we added an element the last + // time. It helps to improve performance when adding several ordered items in a row. 
+ HeapRegion* _last; +- ++ NodeInfo* _node_info; + static uint _unrealistically_long_length; + + inline HeapRegion* remove_from_head_impl(); + inline HeapRegion* remove_from_tail_impl(); ++ inline void increase_length(uint node_index); ++ inline void decrease_length(uint node_index); + + protected: + virtual void fill_in_ext_msg_extra(hrs_ext_msg* msg); +@@ -221,9 +244,12 @@ protected: + virtual void clear(); + + public: +- FreeRegionList(const char* name, HRSMtSafeChecker* mt_safety_checker = NULL): +- HeapRegionSetBase(name, false /* humongous */, true /* empty */, mt_safety_checker) { +- clear(); ++ FreeRegionList(const char* name, HRSMtSafeChecker* mt_safety_checker = NULL); ++ ++ ~FreeRegionList() { ++ if (_node_info != NULL) { ++ delete _node_info; ++ } + } + + void verify_list(); +@@ -244,6 +270,10 @@ public: + // Removes from head or tail based on the given argument. + HeapRegion* remove_region(bool from_head); + ++ HeapRegion* remove_region_with_node_index(bool from_head, ++ uint requested_node_index); ++ ++ + // Merge two ordered lists. The result is also ordered. The order is + // determined by hrm_index. + void add_ordered(FreeRegionList* from_list); +@@ -260,6 +290,9 @@ public: + + virtual void verify(); + ++ using HeapRegionSetBase::length; ++ uint length(uint node_index) const; ++ + virtual void print_on(outputStream* out, bool print_contents = false); + }; + +diff --git a/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.inline.hpp b/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.inline.hpp +index f1fce751a..5ce306288 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.inline.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegionSet.inline.hpp +@@ -25,6 +25,7 @@ + #ifndef SHARE_VM_GC_IMPLEMENTATION_G1_HEAPREGIONSET_INLINE_HPP + #define SHARE_VM_GC_IMPLEMENTATION_G1_HEAPREGIONSET_INLINE_HPP + ++#include "gc_implementation/g1/g1NUMA.hpp" + #include "gc_implementation/g1/heapRegionSet.hpp" + + inline void HeapRegionSetBase::add(HeapRegion* hr) { +@@ -94,6 +95,7 @@ inline void FreeRegionList::add_ordered(HeapRegion* hr) { + _head = hr; + } + _last = hr; ++ increase_length(hr->node_index()); + } + + inline HeapRegion* FreeRegionList::remove_from_head_impl() { +@@ -145,8 +147,106 @@ inline HeapRegion* FreeRegionList::remove_region(bool from_head) { + + // remove() will verify the region and check mt safety. + remove(hr); ++ decrease_length(hr->node_index()); + return hr; + } + ++inline HeapRegion* FreeRegionList::remove_region_with_node_index(bool from_head, ++ uint requested_node_index) { ++ assert(UseNUMA, "Invariant"); ++ ++ const uint max_search_depth = G1NUMA::numa()->max_search_depth(); ++ HeapRegion* cur; ++ ++ // Find the region to use, searching from _head or _tail as requested. ++ size_t cur_depth = 0; ++ if (from_head) { ++ for (cur = _head; ++ cur != NULL && cur_depth < max_search_depth; ++ cur = cur->next(), ++cur_depth) { ++ if (requested_node_index == cur->node_index()) { ++ break; ++ } ++ } ++ } else { ++ for (cur = _tail; ++ cur != NULL && cur_depth < max_search_depth; ++ cur = cur->prev(), ++cur_depth) { ++ if (requested_node_index == cur->node_index()) { ++ break; ++ } ++ } ++ } ++ ++ // Didn't find a region to use. ++ if (cur == NULL || cur_depth >= max_search_depth) { ++ return NULL; ++ } ++ ++ // Splice the region out of the list. 
++ HeapRegion* prev = cur->prev(); ++ HeapRegion* next = cur->next(); ++ if (prev == NULL) { ++ _head = next; ++ } else { ++ prev->set_next(next); ++ } ++ if (next == NULL) { ++ _tail = prev; ++ } else { ++ next->set_prev(prev); ++ } ++ cur->set_prev(NULL); ++ cur->set_next(NULL); ++ ++ if (_last == cur) { ++ _last = NULL; ++ } ++ ++ remove(cur); ++ decrease_length(cur->node_index()); ++ ++ return cur; ++} ++ ++inline void FreeRegionList::NodeInfo::increase_length(uint node_index) { ++ if (node_index < _num_nodes) { ++ _length_of_node[node_index] += 1; ++ } ++} ++ ++inline void FreeRegionList::NodeInfo::decrease_length(uint node_index) { ++ if (node_index < _num_nodes) { ++ assert(_length_of_node[node_index] > 0, ++ err_msg("Current length %u should be greater than zero for node %u", ++ _length_of_node[node_index], node_index)); ++ _length_of_node[node_index] -= 1; ++ } ++} ++ ++inline uint FreeRegionList::NodeInfo::length(uint node_index) const { ++ return _length_of_node[node_index]; ++} ++ ++inline void FreeRegionList::increase_length(uint node_index) { ++ if (_node_info != NULL) { ++ return _node_info->increase_length(node_index); ++ } ++} ++ ++inline void FreeRegionList::decrease_length(uint node_index) { ++ if (_node_info != NULL) { ++ return _node_info->decrease_length(node_index); ++ } ++} ++ ++inline uint FreeRegionList::length(uint node_index) const { ++ if (_node_info != NULL) { ++ return _node_info->length(node_index); ++ } else { ++ return 0; ++ } ++} ++ + #endif // SHARE_VM_GC_IMPLEMENTATION_G1_HEAPREGIONSET_INLINE_HPP + +diff --git a/hotspot/src/share/vm/memory/universe.cpp b/hotspot/src/share/vm/memory/universe.cpp +index 53f402172..1b66e0cb8 100644 +--- a/hotspot/src/share/vm/memory/universe.cpp ++++ b/hotspot/src/share/vm/memory/universe.cpp +@@ -78,6 +78,7 @@ + #include "gc_implementation/concurrentMarkSweep/cmsAdaptiveSizePolicy.hpp" + #include "gc_implementation/concurrentMarkSweep/cmsCollectorPolicy.hpp" + #include "gc_implementation/g1/g1CollectedHeap.inline.hpp" ++#include "gc_implementation/g1/g1NUMA.hpp" + #include "gc_implementation/g1/g1CollectorPolicy_ext.hpp" + #include "gc_implementation/parallelScavenge/parallelScavengeHeap.hpp" + #include "gc_implementation/shenandoah/shenandoahHeap.hpp" +@@ -811,6 +812,7 @@ jint Universe::initialize_heap() { + #if INCLUDE_ALL_GCS + G1CollectorPolicyExt* g1p = new G1CollectorPolicyExt(); + g1p->initialize_all(); ++ G1NUMA::create(); + G1CollectedHeap* g1h = new G1CollectedHeap(g1p); + Universe::_collectedHeap = g1h; + #else // INCLUDE_ALL_GCS +diff --git a/hotspot/src/share/vm/prims/whitebox.cpp b/hotspot/src/share/vm/prims/whitebox.cpp +index 2247b29f3..c44697f0d 100644 +--- a/hotspot/src/share/vm/prims/whitebox.cpp ++++ b/hotspot/src/share/vm/prims/whitebox.cpp +@@ -28,6 +28,7 @@ + #include "memory/metaspaceShared.hpp" + #include "memory/iterator.hpp" + #include "memory/universe.hpp" ++#include "memory/oopFactory.hpp" + #include "oops/oop.inline.hpp" + + #include "classfile/symbolTable.hpp" +@@ -354,6 +355,30 @@ WB_ENTRY(jobject, WB_G1AuxiliaryMemoryUsage(JNIEnv* env)) + Handle h = MemoryService::create_MemoryUsage_obj(usage, CHECK_NULL); + return JNIHandles::make_local(env, h()); + WB_END ++ ++WB_ENTRY(jint, WB_G1ActiveMemoryNodeCount(JNIEnv* env, jobject o)) ++ if (UseG1GC) { ++ G1NUMA* numa = G1NUMA::numa(); ++ return (jint)numa->num_active_nodes(); ++ } ++ THROW_MSG_0(vmSymbols::java_lang_UnsupportedOperationException(), "WB_G1ActiveMemoryNodeCount: G1 GC is not enabled"); ++WB_END ++ ++WB_ENTRY(jintArray, 
WB_G1MemoryNodeIds(JNIEnv* env, jobject o)) ++ if (UseG1GC) { ++ G1NUMA* numa = G1NUMA::numa(); ++ int num_node_ids = (int)numa->num_active_nodes(); ++ const int* node_ids = numa->node_ids(); ++ ++ typeArrayOop result = oopFactory::new_intArray(num_node_ids, CHECK_NULL); ++ for (int i = 0; i < num_node_ids; i++) { ++ result->int_at_put(i, (jint)node_ids[i]); ++ } ++ return (jintArray) JNIHandles::make_local(env, result); ++ } ++ THROW_MSG_NULL(vmSymbols::java_lang_UnsupportedOperationException(), "WB_G1MemoryNodeIds: G1 GC is not enabled"); ++WB_END ++ + #endif // INCLUDE_ALL_GCS + + #if INCLUDE_NMT +@@ -1246,6 +1271,9 @@ static JNINativeMethod methods[] = { + {CC"g1StartConcMarkCycle", CC"()Z", (void*)&WB_G1StartMarkCycle }, + {CC"g1AuxiliaryMemoryUsage", CC"()Ljava/lang/management/MemoryUsage;", + (void*)&WB_G1AuxiliaryMemoryUsage }, ++ {CC"g1ActiveMemoryNodeCount", CC"()I", (void*)&WB_G1ActiveMemoryNodeCount }, ++ {CC"g1MemoryNodeIds", CC"()[I", (void*)&WB_G1MemoryNodeIds }, ++ + #endif // INCLUDE_ALL_GCS + #if INCLUDE_NMT + {CC"NMTMalloc", CC"(J)J", (void*)&WB_NMTMalloc }, +diff --git a/hotspot/src/share/vm/runtime/os.hpp b/hotspot/src/share/vm/runtime/os.hpp +index cff2e9c3e..a60ef4206 100644 +--- a/hotspot/src/share/vm/runtime/os.hpp ++++ b/hotspot/src/share/vm/runtime/os.hpp +@@ -369,6 +369,7 @@ class os: AllStatic { + static size_t numa_get_leaf_groups(int *ids, size_t size); + static bool numa_topology_changed(); + static int numa_get_group_id(); ++ static int numa_get_group_id_for_address(const void* address); + + // Page manipulation + struct page_info { +diff --git a/hotspot/test/gc/g1/TestG1NUMATouchRegions.java b/hotspot/test/gc/g1/TestG1NUMATouchRegions.java +new file mode 100644 +index 000000000..c5322849e +--- /dev/null ++++ b/hotspot/test/gc/g1/TestG1NUMATouchRegions.java +@@ -0,0 +1,245 @@ ++/* ++ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This code is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License version 2 only, as ++ * published by the Free Software Foundation. ++ * ++ * This code is distributed in the hope that it will be useful, but WITHOUT ++ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++ * version 2 for more details (a copy is included in the LICENSE file that ++ * accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License version ++ * 2 along with this work; if not, write to the Free Software Foundation, ++ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. ++ * ++ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA ++ * or visit www.oracle.com if you need additional information or have any ++ * questions. ++ */ ++ ++package gc.g1; ++ ++/** ++ * @test TestG1NUMATouchRegions ++ * @summary Ensure the bottom of the given heap regions are properly touched with requested NUMA id. ++ * @key gc ++ * @requires vm.gc.G1 ++ * @requires os.family == "linux" ++ * @library /test/lib ++ * @modules java.base/jdk.internal.misc ++ * java.management ++ * @build sun.hotspot.WhiteBox ++ * @run driver ClassFileInstaller sun.hotspot.WhiteBox ++ * @run main/othervm -XX:+UseG1GC -Xbootclasspath/a:. 
-XX:+UseNUMA -XX:+UnlockDiagnosticVMOptions -XX:+WhiteBoxAPI gc.g1.TestG1NUMATouchRegions ++ */ ++ ++import java.util.LinkedList; ++import jdk.test.lib.process.OutputAnalyzer; ++import jdk.test.lib.process.ProcessTools; ++import sun.hotspot.WhiteBox; ++ ++public class TestG1NUMATouchRegions { ++ enum NUMASupportStatus { ++ NOT_CHECKED, ++ SUPPORT, ++ NOT_SUPPORT ++ }; ++ ++ static int G1HeapRegionSize1MB = 1; ++ static int G1HeapRegionSize8MB = 8; ++ ++ static NUMASupportStatus status = NUMASupportStatus.NOT_CHECKED; ++ ++ public static void main(String[] args) throws Exception { ++ // 1. Page size < G1HeapRegionSize ++ // Test default page with 1MB heap region size ++ testMemoryTouch("-XX:-UseLargePages", G1HeapRegionSize1MB); ++ // 2. Page size > G1HeapRegionSize ++ // Test large page with 1MB heap region size. ++ testMemoryTouch("-XX:+UseLargePages", G1HeapRegionSize1MB); ++ // 3. Page size < G1HeapRegionSize ++ // Test large page with 8MB heap region size. ++ testMemoryTouch("-XX:+UseLargePages", G1HeapRegionSize8MB); ++ } ++ ++ // On Linux, always UseNUMA is enabled if there is multiple active numa nodes. ++ static NUMASupportStatus checkNUMAIsEnabled(OutputAnalyzer output) { ++ boolean supportNUMA = Boolean.parseBoolean(output.firstMatch("\\bUseNUMA\\b.*?=.*?([a-z]+)", 1)); ++ System.out.println("supportNUMA=" + supportNUMA); ++ return supportNUMA ? NUMASupportStatus.SUPPORT : NUMASupportStatus.NOT_SUPPORT; ++ } ++ ++ static long parseSizeString(String size) { ++ long multiplier = 1; ++ ++ if (size.endsWith("B")) { ++ multiplier = 1; ++ } else if (size.endsWith("K")) { ++ multiplier = 1024; ++ } else if (size.endsWith("M")) { ++ multiplier = 1024 * 1024; ++ } else if (size.endsWith("G")) { ++ multiplier = 1024 * 1024 * 1024; ++ } else { ++ throw new IllegalArgumentException("Expected memory string '" + size + "'to end with either of: B, K, M, G"); ++ } ++ ++ long longSize = Long.parseUnsignedLong(size.substring(0, size.length() - 1)); ++ ++ return longSize * multiplier; ++ } ++ ++ static long heapPageSize(OutputAnalyzer output) { ++ String HeapPageSizePattern = "Heap: .*page_size=([^ ]+)"; ++ String str = output.firstMatch(HeapPageSizePattern, 1); ++ ++ if (str == null) { ++ output.reportDiagnosticSummary(); ++ throw new RuntimeException("Match from '" + HeapPageSizePattern + "' got 'null'"); ++ } ++ ++ return parseSizeString(str); ++ } ++ ++ // 1. -UseLargePages: default page, page size < G1HeapRegionSize ++ // +UseLargePages: large page size <= G1HeapRegionSize ++ // ++ // Each 'int' represents a numa id of single HeapRegion (bottom page). ++ // e.g. 1MB heap region, 2MB page size and 2 NUMA nodes system ++ // Check the first set(2 regions) ++ // 0| ...omitted..| 0 ++ // 1| ...omitted..| 1 ++ static void checkCase1Pattern(OutputAnalyzer output, int index, long g1HeapRegionSize, long actualPageSize, int[] memoryNodeIds) throws Exception { ++ StringBuilder sb = new StringBuilder(); ++ ++ // Append index which means heap region index. ++ sb.append(String.format("%6d", index)); ++ sb.append("| .* | "); ++ ++ // Append page node id. ++ sb.append(memoryNodeIds[index]); ++ ++ output.shouldMatch(sb.toString()); ++ } ++ ++ // 3. +UseLargePages: large page size > G1HeapRegionSize ++ // ++ // As a OS page is consist of multiple heap regions, log also should be ++ // printed multiple times for same numa id. ++ // e.g. 
1MB heap region, 2MB page size and 2 NUMA nodes system ++ // Check the first set(4 regions) ++ // 0| ...omitted..| 0 ++ // 1| ...omitted..| 0 ++ // 2| ...omitted..| 1 ++ // 3| ...omitted..| 1 ++ static void checkCase2Pattern(OutputAnalyzer output, int index, long g1HeapRegionSize, long actualPageSize, int[] memoryNodeIds) throws Exception { ++ StringBuilder sb = new StringBuilder(); ++ ++ // Append page range. ++ int lines_to_print = (int)(actualPageSize / g1HeapRegionSize); ++ for (int i = 0; i < lines_to_print; i++) { ++ // Append index which means heap region index. ++ sb.append(String.format("%6d", index * lines_to_print + i)); ++ sb.append("| .* | "); ++ ++ // Append page node id. ++ sb.append(memoryNodeIds[index]); ++ ++ output.shouldMatch(sb.toString()); ++ sb.setLength(0); ++ } ++ } ++ ++ static void checkNUMALog(OutputAnalyzer output, int regionSizeInMB) throws Exception { ++ WhiteBox wb = WhiteBox.getWhiteBox(); ++ long g1HeapRegionSize = regionSizeInMB * 1024 * 1024; ++ long actualPageSize = heapPageSize(output); ++ long defaultPageSize = (long)wb.getVMPageSize(); ++ int memoryNodeCount = wb.g1ActiveMemoryNodeCount(); ++ int[] memoryNodeIds = wb.g1MemoryNodeIds(); ++ ++ System.out.println("node count=" + memoryNodeCount + ", actualPageSize=" + actualPageSize); ++ // Check for the first set of active numa nodes. ++ for (int index = 0; index < memoryNodeCount; index++) { ++ if (actualPageSize <= defaultPageSize) { ++ checkCase1Pattern(output, index, g1HeapRegionSize, actualPageSize, memoryNodeIds); ++ } else { ++ checkCase2Pattern(output, index, g1HeapRegionSize, actualPageSize, memoryNodeIds); ++ } ++ } ++ } ++ ++ static void testMemoryTouch(String largePagesSetting, int regionSizeInMB) throws Exception { ++ // Skip testing with message. ++ if (status == NUMASupportStatus.NOT_SUPPORT) { ++ System.out.println("NUMA is not supported"); ++ return; ++ } ++ ++ ProcessBuilder pb_enabled = ProcessTools.createJavaProcessBuilder( ++ "-Xbootclasspath/a:.", ++ "-Xlog:pagesize,gc+heap+region=trace", ++ "-XX:+UseG1GC", ++ "-Xmx128m", ++ "-Xms128m", ++ "-XX:+UnlockDiagnosticVMOptions", ++ "-XX:+WhiteBoxAPI", ++ "-XX:+PrintFlagsFinal", ++ "-XX:+UseNUMA", ++ "-XX:+AlwaysPreTouch", ++ largePagesSetting, ++ "-XX:G1HeapRegionSize=" + regionSizeInMB + "m", ++ GCTest.class.getName()); ++ OutputAnalyzer output = new OutputAnalyzer(pb_enabled.start()); ++ ++ // Check NUMA availability. ++ if (status == NUMASupportStatus.NOT_CHECKED) { ++ status = checkNUMAIsEnabled(output); ++ } ++ ++ if (status == NUMASupportStatus.SUPPORT) { ++ checkNUMALog(output, regionSizeInMB); ++ } else { ++ // Exit with message for the first test. ++ System.out.println("NUMA is not supported"); ++ } ++ } ++ ++ static class GCTest { ++ public static final int M = 1024*1024; ++ public static LinkedList garbageList = new LinkedList(); ++ // A large object referenced by a static. ++ static int[] filler = new int[10 * M]; ++ ++ public static void genGarbage() { ++ for (int i = 0; i < 32*1024; i++) { ++ garbageList.add(new int[100]); ++ } ++ garbageList.clear(); ++ } ++ ++ public static void main(String[] args) { ++ ++ int[] large = new int[M]; ++ Object ref = large; ++ ++ System.out.println("Creating garbage"); ++ for (int i = 0; i < 100; i++) { ++ // A large object that will be reclaimed eagerly. ++ large = new int[6*M]; ++ genGarbage(); ++ // Make sure that the compiler cannot completely remove ++ // the allocation of the large object until here. 
++ System.out.println(large); ++ } ++ ++ // Keep the reference to the first object alive. ++ System.out.println(ref); ++ System.out.println("Done"); ++ } ++ } ++} +diff --git a/jdk/test/lib/sun/hotspot/WhiteBox.java b/jdk/test/lib/sun/hotspot/WhiteBox.java +index 9497c9530..a6d773bc8 100644 +--- a/jdk/test/lib/sun/hotspot/WhiteBox.java ++++ b/jdk/test/lib/sun/hotspot/WhiteBox.java +@@ -141,6 +141,8 @@ public class WhiteBox { + public native int g1RegionSize(); + public native MemoryUsage g1AuxiliaryMemoryUsage(); + public native Object[] parseCommandLine(String commandline, DiagnosticCommand[] args); ++ public native int g1ActiveMemoryNodeCount(); ++ public native int[] g1MemoryNodeIds(); + + // Parallel GC + public native long psVirtualSpaceAlignment(); diff --git a/openjdk-1.8.0.spec b/openjdk-1.8.0.spec index 83df46b..c70b9a0 100644 --- a/openjdk-1.8.0.spec +++ b/openjdk-1.8.0.spec @@ -918,7 +918,7 @@ Provides: java-%{javaver}-%{origin}-accessibility%{?1} = %{epoch}:%{version}-%{r Name: java-%{javaver}-%{origin} Version: %{javaver}.%{updatever}.%{buildver} -Release: 10 +Release: 11 # java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons # and this change was brought into RHEL-4. java-1.5.0-ibm packages # also included the epoch in their virtual provides. This created a @@ -1106,6 +1106,7 @@ Patch192: add_kae_implementation_add_default_conf_file.patch Patch193: improve_algorithmConstraints_checkAlgorithm_performance.patch Patch194: modify_the_default_iteration_time_and_forks_in_the_JMH_of_KAEProvider.patch Patch195: support_CMS_parallel_inspection.patch +Patch196: g1gc-numa-aware-Implementation.patch ############################################# # @@ -1560,6 +1561,7 @@ pushd %{top_level_dir_name} %patch192 -p1 %patch194 -p1 %patch195 -p1 +%patch196 -p1 popd # System library fixes @@ -2176,6 +2178,9 @@ require "copy_jdk_configs.lua" %endif %changelog +* Sat Jun 12 2021 hu_bo_dao - 1:1.8.0.292-b10.11 +- add g1gc-numa-aware-Implementation.patch + * Wed Jun 10 2021 hu_bo_dao - 1:1.8.0.292-b10.10 - add support_CMS_parallel_inspection.patch -- Gitee From 6f9be5d77523265b7674ec27171d1c0489fb2496 Mon Sep 17 00:00:00 2001 From: kuenking111 Date: Sat, 12 Jun 2021 12:36:48 +0800 Subject: [PATCH 3/6] I3VFBA: Implementation of Blas hotspot function in Intrinsics --- ..._Blas_hotspot_function_in_Intrinsics.patch | 1638 +++++++++++++++++ openjdk-1.8.0.spec | 9 +- 2 files changed, 1645 insertions(+), 2 deletions(-) create mode 100755 implementation_of_Blas_hotspot_function_in_Intrinsics.patch diff --git a/implementation_of_Blas_hotspot_function_in_Intrinsics.patch b/implementation_of_Blas_hotspot_function_in_Intrinsics.patch new file mode 100755 index 0000000..39335db --- /dev/null +++ b/implementation_of_Blas_hotspot_function_in_Intrinsics.patch @@ -0,0 +1,1638 @@ +commit 9856171f660f6edb240bb4e7e95a87b60f4d2bc3 +Author: hubodao +Date: Tue Jun 8 08:07:38 2021 +0000 + + blas instrinsic + +diff --git a/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp +index 7080ea10d..62a8ab7bd 100644 +--- a/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp ++++ b/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp +@@ -919,6 +919,126 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) { + } + } + ++void LIRGenerator::do_dgemm_dgemm(Intrinsic* x) { ++ assert(x->number_of_arguments() == 16, "wrong type"); ++ ++ LIRItem ta(x->argument_at(0), this); ++ LIRItem tb(x->argument_at(1), this); ++ LIRItem m(x->argument_at(2), this); ++ LIRItem 
n(x->argument_at(3), this); ++ LIRItem k(x->argument_at(4), this); ++ LIRItem alpha(x->argument_at(5), this); ++ LIRItem a(x->argument_at(6), this); ++ LIRItem a_offset(x->argument_at(7), this); ++ LIRItem lda(x->argument_at(8), this); ++ LIRItem b(x->argument_at(9), this); ++ LIRItem b_offset(x->argument_at(10), this); ++ LIRItem ldb(x->argument_at(11), this); ++ LIRItem beta(x->argument_at(12), this); ++ LIRItem c(x->argument_at(13), this); ++ LIRItem c_offset(x->argument_at(14), this); ++ LIRItem ldc(x->argument_at(15), this); ++ ++ ta.load_item(); ++ tb.load_item(); ++ m.load_item(); ++ n.load_item(); ++ k.load_item(); ++ alpha.load_item(); ++ a.load_item(); ++ a_offset.load_nonconstant(); ++ lda.load_item(); ++ b.load_item(); ++ b_offset.load_nonconstant(); ++ ldb.load_item(); ++ beta.load_item(); ++ c.load_item(); ++ c_offset.load_nonconstant(); ++ ldc.load_item(); ++ ++ LIR_Opr ta_base = ta.result(); ++ LIR_Opr tb_base = tb.result(); ++ LIR_Opr r_m = m.result(); ++ LIR_Opr r_n = n.result(); ++ LIR_Opr r_k = k.result(); ++ LIR_Opr r_alpha = alpha.result(); ++ LIR_Opr a_base = a.result(); ++ LIR_Opr r_a_offset = a_offset.result(); ++ LIR_Opr r_lda = lda.result(); ++ LIR_Opr b_base = b.result(); ++ LIR_Opr r_b_offset = b_offset.result(); ++ LIR_Opr r_ldb = ldb.result(); ++ LIR_Opr r_beta = beta.result(); ++ LIR_Opr c_base = c.result(); ++ LIR_Opr r_c_offset = c_offset.result(); ++ LIR_Opr r_ldc = ldc.result(); ++ ++ LIR_Opr ta_value = load_String_value(ta_base); ++ LIR_Opr ta_offset = load_String_offset(ta_base); ++ LIR_Opr tb_value = load_String_value(tb_base); ++ LIR_Opr tb_offset = load_String_offset(tb_base); ++ ++ LIR_Address* addr_ta = emit_array_address(ta_value, ta_offset, T_CHAR, false); ++ LIR_Address* addr_tb = emit_array_address(tb_value, tb_offset, T_CHAR, false); ++ LIR_Address* addr_a = emit_array_address(a_base, r_a_offset, T_DOUBLE, false); ++ LIR_Address* addr_b = emit_array_address(b_base, r_b_offset, T_DOUBLE, false); ++ LIR_Address* addr_c = emit_array_address(c_base, r_c_offset, T_DOUBLE, false); ++ ++ LIR_Opr tmp = new_pointer_register(); ++ LIR_Opr ta_addr = new_register(T_ADDRESS); ++ __ leal(LIR_OprFact::address(addr_ta), tmp); ++ __ move(tmp, ta_addr); ++ tmp = new_pointer_register(); ++ LIR_Opr tb_addr = new_register(T_ADDRESS); ++ __ leal(LIR_OprFact::address(addr_tb), tmp); ++ __ move(tmp, tb_addr); ++ tmp = new_pointer_register(); ++ LIR_Opr a_addr = new_register(T_ADDRESS); ++ __ leal(LIR_OprFact::address(addr_a), tmp); ++ __ move(tmp, a_addr); ++ tmp = new_pointer_register(); ++ LIR_Opr b_addr = new_register(T_ADDRESS); ++ __ leal(LIR_OprFact::address(addr_b), tmp); ++ __ move(tmp, b_addr); ++ tmp = new_pointer_register(); ++ LIR_Opr c_addr = new_register(T_ADDRESS); ++ __ leal(LIR_OprFact::address(addr_c), tmp); ++ __ move(tmp, c_addr); ++ ++ BasicTypeList signature(13); ++ signature.append(T_ADDRESS); ++ signature.append(T_ADDRESS); ++ signature.append(T_INT); ++ signature.append(T_INT); ++ signature.append(T_INT); ++ signature.append(T_DOUBLE); ++ signature.append(T_ADDRESS); ++ signature.append(T_INT); ++ signature.append(T_ADDRESS); ++ signature.append(T_INT); ++ signature.append(T_DOUBLE); ++ signature.append(T_ADDRESS); ++ signature.append(T_INT); ++ ++ LIR_OprList* args = new LIR_OprList(); ++ args->append(ta_addr); ++ args->append(tb_addr); ++ args->append(r_m); ++ args->append(r_n); ++ args->append(r_k); ++ args->append(r_alpha); ++ args->append(a_addr); ++ args->append(r_lda); ++ args->append(b_addr); ++ args->append(r_ldb); ++ 
args->append(r_beta); ++ args->append(c_addr); ++ args->append(r_ldc); ++ ++ assert(StubRoutines::dgemmDgemm() != NULL, "invalid stub entry"); ++ call_runtime(&signature, args, StubRoutines::dgemmDgemm(), voidType, NULL); ++ set_no_result(x); ++} + + void LIRGenerator::do_ArrayCopy(Intrinsic* x) { + assert(x->number_of_arguments() == 5, "wrong type"); +@@ -1038,6 +1158,114 @@ void LIRGenerator::do_update_CRC32(Intrinsic* x) { + } + } + ++void LIRGenerator::do_dgemv_dgemv(Intrinsic* x) { ++ assert(x->number_of_arguments() == 14, "wrong type"); ++ ++ LIRItem trans(x->argument_at(0), this); ++ LIRItem m(x->argument_at(1), this); ++ LIRItem n(x->argument_at(2), this); ++ LIRItem alpha(x->argument_at(3), this); ++ LIRItem array_a(x->argument_at(4), this); ++ LIRItem array_a_offset(x->argument_at(5), this); ++ LIRItem lda(x->argument_at(6), this); ++ LIRItem array_x(x->argument_at(7), this); ++ LIRItem array_x_offset(x->argument_at(8), this); ++ LIRItem incx(x->argument_at(9), this); ++ LIRItem beta(x->argument_at(10), this); ++ LIRItem array_y(x->argument_at(11), this); ++ LIRItem array_y_offset(x->argument_at(12), this); ++ LIRItem incy(x->argument_at(13), this); ++ ++ trans.load_item(); ++ m.load_item(); ++ n.load_item(); ++ alpha.load_item(); ++ array_a.load_item(); ++ array_a_offset.load_nonconstant(); ++ lda.load_item(); ++ array_x.load_item(); ++ array_x_offset.load_nonconstant(); ++ incx.load_item(); ++ beta.load_item(); ++ array_y.load_item(); ++ array_y_offset.load_nonconstant(); ++ incy.load_item(); ++ ++ LIR_Opr res_trans_base = trans.result(); ++ LIR_Opr res_m = m.result(); ++ LIR_Opr res_n = n.result(); ++ LIR_Opr res_alpha = alpha.result(); ++ LIR_Opr res_a_base = array_a.result(); ++ LIR_Opr res_a_offset = array_a_offset.result(); ++ LIR_Opr res_lda = lda.result(); ++ LIR_Opr res_x_base = array_x.result(); ++ LIR_Opr res_x_offset = array_x_offset.result(); ++ LIR_Opr res_incx = incx.result(); ++ LIR_Opr res_beta = beta.result(); ++ LIR_Opr res_y_base = array_y.result(); ++ LIR_Opr res_y_offset = array_y_offset.result(); ++ LIR_Opr res_incy = incy.result(); ++ ++ LIR_Opr addr_trans_base = LIRGenerator::load_String_value(res_trans_base); ++ LIR_Opr addr_trans_offset = LIRGenerator::load_String_offset(res_trans_base); ++ LIR_Address* addr_trans = emit_array_address(addr_trans_base, addr_trans_offset, T_CHAR, false); ++ ++ LIR_Address* addr_a = emit_array_address(res_a_base, res_a_offset, T_DOUBLE, false); ++ LIR_Address* addr_x = emit_array_address(res_x_base, res_x_offset, T_DOUBLE, false); ++ LIR_Address* addr_y = emit_array_address(res_y_base, res_y_offset, T_DOUBLE, false); ++ ++ // load addr to register ++ LIR_Opr tmp = new_pointer_register(); ++ LIR_Opr trans_addr = new_register(T_ADDRESS); ++ __ leal(LIR_OprFact::address(addr_trans), tmp); ++ __ move(tmp, trans_addr); ++ ++ LIR_Opr tmp1 = new_pointer_register(); ++ LIR_Opr a_addr = new_register(T_ADDRESS); ++ __ leal(LIR_OprFact::address(addr_a), tmp1); ++ __ move(tmp1, a_addr); ++ ++ LIR_Opr tmp2 = new_pointer_register(); ++ LIR_Opr x_addr = new_register(T_ADDRESS); ++ __ leal(LIR_OprFact::address(addr_x), tmp2); ++ __ move(tmp2, x_addr); ++ ++ LIR_Opr tmp3 = new_pointer_register(); ++ LIR_Opr y_addr = new_register(T_ADDRESS); ++ __ leal(LIR_OprFact::address(addr_y), tmp3); ++ __ move(tmp3, y_addr); ++ ++ BasicTypeList signature(11); ++ signature.append(T_ADDRESS); ++ signature.append(T_INT); ++ signature.append(T_INT); ++ signature.append(T_DOUBLE); ++ signature.append(T_ADDRESS); ++ signature.append(T_INT); ++ 
signature.append(T_ADDRESS); ++ signature.append(T_INT); ++ signature.append(T_DOUBLE); ++ signature.append(T_ADDRESS); ++ signature.append(T_INT); ++ ++ LIR_OprList* args = new LIR_OprList(); ++ args->append(trans_addr); ++ args->append(res_m); ++ args->append(res_n); ++ args->append(res_alpha); ++ args->append(a_addr); ++ args->append(res_lda); ++ args->append(x_addr); ++ args->append(res_incx); ++ args->append(res_beta); ++ args->append(y_addr); ++ args->append(res_incy); ++ ++ assert(StubRoutines::dgemvDgemv() != NULL, "invalid stub entry"); ++ call_runtime(&signature, args, StubRoutines::dgemvDgemv(), voidType, NULL); ++ set_no_result(x); ++} ++ + // _i2l, _i2f, _i2d, _l2i, _l2f, _l2d, _f2i, _f2l, _f2d, _d2i, _d2l, _d2f + // _i2b, _i2c, _i2s + void LIRGenerator::do_Convert(Convert* x) { +diff --git a/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp b/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp +index c0aaa1de4..a275a6a99 100644 +--- a/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp ++++ b/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp +@@ -50,6 +50,11 @@ void generate_transcendental_entry(AbstractInterpreter::MethodKind kind, int fpa + address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind); + void lock_method(void); + void generate_stack_overflow_check(void); ++ void load_String_value(Register src, Register dst); ++ void load_String_offset(Register src, Register dst); ++ void emit_array_address(Register src, Register idx, Register dst, BasicType type); ++ address generate_Dgemm_dgemm_entry(); ++ address generate_Dgemv_dgemv_entry(); + + void generate_counter_incr(Label* overflow, Label* profile_method, Label* profile_method_continue); + void generate_counter_overflow(Label* do_continue); +diff --git a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +index c5ec637a1..125983179 100644 +--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp ++++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp +@@ -3221,6 +3221,44 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ address load_BLAS_library() { ++ // Try to load BLAS library. ++ const char library_name[] = "openblas"; ++ char err_buf[1024] = {0}; ++ char path[JVM_MAXPATHLEN] = {0}; ++ os::jvm_path(path, sizeof(path)); ++ int jvm_offset = -1; ++ ++ // Match "jvm[^/]*" in jvm_path. ++ const char* last_name = strrchr(path, '/'); ++ last_name = last_name ? last_name : path; ++ const char* last_lib_name = strstr(last_name, "jvm"); ++ if (last_lib_name != NULL) { ++ jvm_offset = last_lib_name - path; ++ } ++ ++ address library = NULL; ++ // Find the BLAS shared library. ++ // Search path: /jre/lib///libopenblas.so ++ if (jvm_offset >= 0) { ++ if (jvm_offset + strlen(library_name) + strlen(os::dll_file_extension()) < JVM_MAXPATHLEN) { ++ strncpy(&path[jvm_offset], library_name, strlen(library_name)); ++ strncat(&path[jvm_offset], os::dll_file_extension(), strlen(os::dll_file_extension())); ++ library = (address)os::dll_load(path, err_buf, sizeof(err_buf)); ++ } ++ } ++ return library; ++ } ++ ++ address get_BLAS_func_entry(address library, const char* func_name) { ++ if (library == NULL) { ++ return NULL; ++ } ++ ++ // Try to find BLAS function entry. 
++ return (address)os::dll_lookup((void*)library, func_name); ++ } ++ + /** + * Arguments: + * +@@ -3254,6 +3292,218 @@ class StubGenerator: public StubCodeGenerator { + return start; + } + ++ // Parameter conversion from JVM to native BLAS ++ // ++ // Register: ++ // r0: transa r0: transa ++ // r1: transb r1: transb ++ // r2: m r2: &m ++ // r3: n r3: &n ++ // r4: k =========> r4: &k ++ // r5: A r5: &alpha ++ // r6: lda r6: A ++ // r7: B r7: &lda ++ // v0: alpha ++ // v1: beta ++ // ++ // Stack: ++ // |-------| |-------| ++ // | ldc | | ldc | ++ // |-------| |-------| ++ // | C | | C | ++ // |-------| |-------| ++ // | ldb | | ldb | ++ // |-------| <-- sp |-------| ++ // | | | m | ++ // |-------| |-------| ++ // | | | n | ++ // |-------| |-------| ++ // | | | k | ++ // |-------| |-------| ++ // | | | lda | ++ // |-------| |-------| ++ // | | | alpha | ++ // |-------| |-------| ++ // | | | beta | ++ // |-------| =========> |-------| ++ // | | | lr | ++ // |-------| |-------| ++ // | | | rfp | ++ // |-------| |-------| <-- fp ++ // | ... | | ... | ++ // |-------| |-------| ++ // | | | &ldc | ++ // |-------| |-------| ++ // | | | C | ++ // |-------| |-------| ++ // | | | &bata | ++ // |-------| |-------| ++ // | | | &ldb | ++ // |-------| |-------| ++ // | | | B | ++ // |-------| |-------| <-- sp ++ address generate_dgemmDgemm(address library) { ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "dgemm_dgemm"); ++ ++ address fn = get_BLAS_func_entry(library, "dgemm_"); ++ if (fn == NULL) return NULL; ++ ++ address start = __ pc(); ++ ++ const Register transa = c_rarg0; ++ const Register transb = c_rarg1; ++ const Register m = c_rarg2; ++ const Register n = c_rarg3; ++ const Register k = c_rarg4; ++ const FloatRegister alpha = c_farg0; ++ const Register A = c_rarg5; ++ const Register lda = c_rarg6; ++ const Register B = c_rarg7; ++ const FloatRegister beta = c_farg1; ++ ++ BLOCK_COMMENT("Entry:"); ++ ++ // extend stack ++ __ sub(sp, sp, 0x60); ++ __ stp(rfp, lr, Address(sp, 48)); ++ __ add(rfp, sp, 0x30); ++ // load BLAS function entry ++ __ mov(rscratch1, fn); ++ // C ++ __ ldr(rscratch2, Address(rfp, 56)); ++ // store m / n to stack ++ __ stpw(n, m, Address(rfp, 40)); ++ // &beta ++ __ add(r2, rfp, 0x10); ++ // store k / lda to stack ++ __ stpw(lda, k, Address(rfp, 32)); ++ // load ldc ++ __ add(r3, rfp, 0x40); ++ // store C / &beta ++ __ stp(r2, rscratch2, Address(sp, 16)); ++ // &ldb ++ __ add(r2, rfp, 0x30); ++ // store B ++ __ str(B, Address(sp)); ++ // move A from r5 to r6 ++ __ mov(r6, A); ++ // store ldc ++ __ str(r3, Address(sp, 32)); ++ // &alpha ++ __ add(r5, rfp, 0x18); ++ // store &ldb ++ __ str(r2, Address(sp, 8)); ++ // &k ++ __ add(r4, rfp, 0x24); ++ // store alpha / beta ++ __ stpd(beta, alpha, Address(rfp, 16)); ++ // load &lda to r7 ++ __ add(r7, rfp, 0x20); ++ // load &n ++ __ add(r3, rfp, 0x28); ++ // load &m ++ __ add(r2, rfp, 0x2c); ++ // call dgemm ++ __ blr(rscratch1); ++ ++ // restore rfp and lr ++ __ ldp(rfp, lr, Address(sp, 48)); ++ // exit stack ++ __ add(sp, sp, 0x60); ++ __ ret(lr); ++ ++ return start; ++ } ++ ++ /** ++ * public void dgemv(String trans, int m, int n, ++ * double alpha, double[] a, int lda, ++ * double[] x, int incx, ++ * double beta, double[] y, int incy) ++ * ++ * Arguments: ++ * ++ * Inputs: ++ * c_rarg0 - char* trans ++ * c_rarg1 - int m ++ * c_rarg2 - int n ++ * d0/c_farg0 - double alpha ++ * c_rarg3 - double[] a ++ * c_rarg4 - int lda ++ * c_rarg5 - double[] x ++ * c_rarg6 - int incx ++ * d1/c_farg1 - double beta ++ 
* c_rarg7 - double[] y ++ * [sp] - int incy ++ * ++ * Output: ++ * null ++ * ++ */ ++ ++ address generate_dgemvDgemv(address library) { ++ __ align(CodeEntryAlignment); ++ StubCodeMark mark(this, "StubRoutines", "dgemv_dgemv"); ++ ++ address fn = get_BLAS_func_entry(library, "dgemv_"); ++ if (fn == NULL) return NULL; ++ ++ address start = __ pc(); ++ BLOCK_COMMENT("Entry: "); ++ ++ Register trans = c_rarg0; ++ Register m = c_rarg1; ++ Register n = c_rarg2; ++ Register a = c_rarg3; ++ Register lda = c_rarg4; ++ Register x = c_rarg5; ++ Register incx = c_rarg6; ++ Register y = c_rarg7; ++ ++ FloatRegister alpha = c_farg0; ++ FloatRegister beta = c_farg1; ++ ++ __ sub(sp, sp, 0x50); ++ __ stp(rfp, lr, Address(sp, 32)); ++ __ add(rfp, sp, 0x20); ++ ++ // no need for saving trans to tmp register, keep it in register x0 ++ __ strw(m, Address(rfp, 44)); ++ __ strw(n, Address(rfp, 40)); ++ __ strd(alpha, Address(rfp, 32)); ++ __ strw(lda, Address(rfp, 28)); ++ __ strw(incx, Address(rfp, 24)); ++ __ strd(beta, Address(rfp, 16)); ++ ++ // pre call ++ // load incy and push on stack, order incy --> y --> beta ++ __ add(r1, rfp, 0x30); ++ __ str(r1, Address(sp, 16)); ++ __ str(y, Address(sp, 8)); ++ __ add(r1, rfp, 0x10); ++ __ str(r1, Address(sp)); ++ ++ __ add(r7, rfp, 0x18); ++ __ mov(r6, x); ++ __ add(r5, rfp, 0x1c); ++ __ mov(r4, a); ++ __ add(r3, rfp, 0x20); ++ __ add(r2, rfp, 0x28); ++ __ add(r1, rfp, 0x2c); ++ ++ __ mov(rscratch1, fn); ++ __ blr(rscratch1); ++ ++ __ ldp(rfp, lr, Address(sp, 32)); ++ __ add(sp, sp, 0x50); ++ __ ret(lr); ++ ++ return start; ++ } ++ ++ ++ + /** + * Arguments: + * +@@ -4252,6 +4502,14 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; + StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); + } ++ ++ if (UseF2jBLASIntrinsics) { ++ StubRoutines::_BLAS_library = load_BLAS_library(); ++ // F2jBLAS intrinsics will use the implements in BLAS dynamic library ++ StubRoutines::_ddotF2jBLAS = generate_ddotF2jBLAS(); ++ StubRoutines::_dgemmDgemm = generate_dgemmDgemm(StubRoutines::_BLAS_library); ++ StubRoutines::_dgemvDgemv = generate_dgemvDgemv(StubRoutines::_BLAS_library); ++ } + } + + void generate_all() { +@@ -4296,10 +4554,6 @@ class StubGenerator: public StubCodeGenerator { + StubRoutines::_montgomerySquare = g.generate_multiply(); + } + +- if (UseF2jBLASIntrinsics) { +- StubRoutines::_ddotF2jBLAS = generate_ddotF2jBLAS(); +- } +- + if (UseAESIntrinsics) { + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); +diff --git a/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp b/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp +index ae5cb3f32..924b6670f 100644 +--- a/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp ++++ b/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp +@@ -856,6 +856,250 @@ address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpret + return generate_native_entry(false); + } + ++// Access the char-array of String ++void InterpreterGenerator::load_String_value(Register src, Register dst) { ++ // Need to cooperate with JDK-8243996 ++ int value_offset = java_lang_String::value_offset_in_bytes(); ++ ++ __ add(src, src, value_offset); ++ __ load_heap_oop(dst, Address(src)); ++} ++ ++void InterpreterGenerator::load_String_offset(Register src, Register dst) { ++ __ mov(dst, 0); ++ ++ // Get String value offset, because of 
order of initialization for Interpreter, ++ // we have to hardcode the offset for String value. (JDK-8243996) ++ if (java_lang_String::has_offset_field()) { ++ int offset_offset = java_lang_String::offset_offset_in_bytes(); ++ __ add(src, src, offset_offset); ++ __ ldrw(dst, Address(src)); ++ } ++} ++ ++void InterpreterGenerator::emit_array_address(Register src, Register idx, ++ Register dst, BasicType type) { ++ int offset_in_bytes = arrayOopDesc::base_offset_in_bytes(type); ++ int elem_size = type2aelembytes(type); ++ int shift = exact_log2(elem_size); ++ ++ __ lsl(idx, idx, shift); ++ __ add(idx, idx, offset_in_bytes); ++ __ add(dst, src, idx); ++} ++ ++/** ++ * Stub Arguments: ++ * ++ * c_rarg0 - char* transa ++ * c_rarg1 - char* transb ++ * c_rarg2 - int m ++ * c_rarg3 - int n ++ * c_rarg4 - int k ++ * d0 - double alpha ++ * c_rarg5 - double[] A ++ * c_rarg6 - int lda ++ * c_rarg7 - double[] B ++ * d1 - double beta ++ * [sp + 16] - int ldc ++ * [sp + 8] - double[] C ++ * [sp] - int ldb ++ * ++ */ ++address InterpreterGenerator::generate_Dgemm_dgemm_entry() { ++ if (!UseF2jBLASIntrinsics || (StubRoutines::dgemmDgemm() == NULL)) return NULL; ++ ++ address entry = __ pc(); ++ ++ // r13: senderSP must preserved for slow path ++ ++ // Arguments are reversed on java expression stack ++ const Register ta = c_rarg0; ++ const Register tb = c_rarg1; ++ const Register m = c_rarg2; ++ const Register n = c_rarg3; ++ const Register k = c_rarg4; ++ const FloatRegister alpha = c_farg0; ++ const Register A = c_rarg5; ++ const Register lda = c_rarg6; ++ const Register B = c_rarg7; ++ const FloatRegister beta = c_farg1; ++ const Register tmp1 = rscratch1; ++ const Register tmp2 = rscratch2; ++ ++ // trana ++ __ ldr(ta, Address(esp, 17 * wordSize)); ++ load_String_value(ta, tmp1); ++ load_String_offset(ta, tmp2); ++ emit_array_address(tmp1, tmp2, ta, T_CHAR); ++ // tranb ++ __ ldr(tb, Address(esp, 16 * wordSize)); ++ load_String_value(tb, tmp1); ++ load_String_offset(tb, tmp2); ++ emit_array_address(tmp1, tmp2, tb, T_CHAR); ++ // m, n, k ++ __ ldrw(m, Address(esp, 15 * wordSize)); ++ __ ldrw(n, Address(esp, 14 * wordSize)); ++ __ ldrw(k, Address(esp, 13 * wordSize)); ++ // alpha ++ __ ldrd(alpha, Address(esp, 11 * wordSize)); ++ // A ++ __ ldr(tmp1, Address(esp, 10 * wordSize)); ++ __ mov(tmp2, 0); ++ __ ldrw(tmp2, Address(esp, 9 * wordSize)); ++ emit_array_address(tmp1, tmp2, A, T_DOUBLE); ++ // lda ++ __ ldrw(lda, Address(esp, 8 * wordSize)); ++ // B ++ __ ldr(tmp1, Address(esp, 7 * wordSize)); ++ __ ldrw(tmp2, Address(esp, 6 * wordSize)); ++ emit_array_address(tmp1, tmp2, B, T_DOUBLE); ++ // beta ++ __ ldrd(beta, Address(esp, 3 * wordSize)); ++ // Start pushing arguments to machine stack. ++ // ++ // Remove the incoming args, peeling the machine SP back to where it ++ // was in the caller. This is not strictly necessary, but unless we ++ // do so the stack frame may have a garbage FP; this ensures a ++ // correct call stack that we can always unwind. The ANDR should be ++ // unnecessary because the sender SP in r13 is always aligned, but ++ // it doesn't hurt. 
++ __ andr(sp, r13, -16); ++ __ str(lr, Address(sp, -wordSize)); ++ // ldc ++ __ ldrw(tmp1, Address(esp, 0x0)); ++ __ strw(tmp1, Address(sp, 2 * -wordSize)); ++ // C ++ __ ldr(tmp1, Address(esp, 2 * wordSize)); ++ __ ldrw(tmp2, Address(esp, wordSize)); ++ emit_array_address(tmp1, tmp2, tmp1, T_DOUBLE); ++ __ str(tmp1, Address(sp, 3 * -wordSize)); ++ // ldb ++ __ ldrw(tmp2, Address(esp, 5 * wordSize)); ++ __ strw(tmp2, Address(sp, 4 * -wordSize)); ++ ++ // Call function ++ __ add(sp, sp, 4 * -wordSize); ++ address fn = CAST_FROM_FN_PTR(address, StubRoutines::dgemmDgemm()); ++ __ mov(tmp1, fn); ++ __ blr(tmp1); ++ ++ __ ldr(lr, Address(sp, 3 * wordSize)); ++ // For assert(Rd != sp || imm % 16 == 0) ++ __ add(sp, sp, 4 * wordSize); ++ __ br(lr); ++ ++ return entry; ++} ++ ++address InterpreterGenerator::generate_Dgemv_dgemv_entry() { ++ if (StubRoutines::dgemvDgemv() == NULL) return NULL; ++ address entry = __ pc(); ++ ++ const Register trans = c_rarg0; // trans ++ const Register m = c_rarg1; // m ++ const Register n = c_rarg2; // n ++ const Register a = c_rarg3; // array a addr ++ const Register lda = c_rarg4; // lda ++ const Register x = c_rarg5; // array x addr ++ const Register incx = c_rarg6; // incx ++ const Register y = c_rarg7; // array y addr ++ ++ const FloatRegister alpha = v0; // alpha ++ const FloatRegister beta = v1; // beta ++ ++ const Register tmp1 = rscratch1; ++ const Register tmp2 = rscratch2; ++ ++ // esp: expression stack of caller ++ // dgemv parameter ---> the position in stack ---> move to register ++ // | char* trans | | esp + 15 | | r0 | ++ // | int m | | esp + 14 | | r1 | ++ // | int n | | esp + 13 | | r2 | ++ // | double alpha | | esp + 11 | | v0 | ++ // ---------------- ------------ -------- ++ // | double* a | | esp + 10 | | | ++ // | | | | | r3 | ++ // | int a_offset | | esp + 9 | | | ++ // ---------------- ------------ -------- ++ // | int lda | | esp + 8 | | r4 | ++ // ---------------- ------------ -------- ++ // | double* x | | esp + 7 | | | ++ // | | | | | r5 | ++ // | int x_offset | | esp + 6 | | | ++ // ---------------- ------------ -------- ++ // | int incx | | esp + 5 | | r6 | ++ // | double beta | | esp + 3 | | v1 | ++ // ---------------- ------------ -------- ++ // | double* y | | esp + 2 | | | ++ // | | | | | r7 | ++ // | int y_offset | | esp + 1 | | | ++ // ---------------- ------------ -------- ++ // | int incy | | esp | | [sp] | ++ ++ ++ // trans ++ __ ldr(trans, Address(esp, 15 * wordSize)); ++ load_String_value(trans, tmp1); ++ load_String_offset(trans, tmp2); ++ emit_array_address(tmp1, tmp2, trans, T_CHAR); ++ // m, n ++ __ ldrw(m, Address(esp, 14 * wordSize)); ++ __ ldrw(n, Address(esp, 13 * wordSize)); ++ ++ // alpha ++ __ ldrd(alpha, Address(esp, 11 * wordSize)); ++ ++ // a ++ __ ldr(tmp1, Address(esp, 10 * wordSize)); ++ __ mov(tmp2, zr); ++ __ ldrw(tmp2, Address(esp, 9 * wordSize)); ++ emit_array_address(tmp1, tmp2, a, T_DOUBLE); ++ ++ // lda ++ __ ldrw(lda, Address(esp, 8 * wordSize)); ++ ++ // x ++ __ ldr(tmp1, Address(esp, 7 * wordSize)); ++ __ mov(tmp2, zr); ++ __ ldrw(tmp2, Address(esp, 6 * wordSize)); ++ emit_array_address(tmp1, tmp2, x, T_DOUBLE); ++ ++ // incx ++ __ ldrw(incx, Address(esp, 5 * wordSize)); ++ ++ // beta ++ __ ldrd(beta, Address(esp, 3 * wordSize)); ++ ++ // y ++ __ ldr(tmp1, Address(esp, 2 * wordSize)); ++ __ mov(tmp2, zr); ++ __ ldrw(tmp2, Address(esp, wordSize)); ++ emit_array_address(tmp1, tmp2, y, T_DOUBLE); ++ ++ // resume sp, restore lr ++ __ andr(sp, r13, -16); ++ __ str(lr, Address(sp, -wordSize)); ++ ++ // 
incy, push on stack ++ __ ldrw(tmp1, Address(esp, 0)); ++ __ strw(tmp1, Address(sp, 2 * -wordSize)); ++ ++ __ add(sp, sp, -2 * wordSize); ++ ++ // call function ++ address fn = CAST_FROM_FN_PTR(address, StubRoutines::dgemvDgemv()); ++ __ mov(tmp1, fn); ++ __ blr(tmp1); ++ ++ // resume lr ++ __ ldr(lr, Address(sp, wordSize)); ++ __ add(sp, sp, 2 * wordSize); ++ __ br(lr); ++ ++ return entry; ++} ++ + void InterpreterGenerator::bang_stack_shadow_pages(bool native_call) { + // Bang each page in the shadow zone. We can't assume it's been done for + // an interpreter frame with greater than a page of locals, so each page +@@ -1575,6 +1819,10 @@ address AbstractInterpreterGenerator::generate_method_entry( + : // fall thru + case Interpreter::java_util_zip_CRC32_updateByteBuffer + : entry_point = ((InterpreterGenerator*)this)->generate_CRC32_updateBytes_entry(kind); break; ++ case Interpreter::org_netlib_blas_Dgemm_dgemm ++ : entry_point = ((InterpreterGenerator*)this)->generate_Dgemm_dgemm_entry(); break; ++ case Interpreter::org_netlib_blas_Dgemv_dgemv ++ : entry_point = ((InterpreterGenerator*)this)->generate_Dgemv_dgemv_entry(); break; + default : ShouldNotReachHere(); break; + } + +diff --git a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp +index f1160792a..477c6e550 100644 +--- a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp ++++ b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp +@@ -754,6 +754,13 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) { + } + } + ++void LIRGenerator::do_dgemm_dgemm(Intrinsic* x) { ++ fatal("BLAS intrinsics are not implemented on this platform!"); ++} ++ ++void LIRGenerator::do_dgemv_dgemv(Intrinsic* x) { ++ fatal("BLAS intrinsics are not implemented on this platform!"); ++} + + void LIRGenerator::do_ArrayCopy(Intrinsic* x) { + assert(x->number_of_arguments() == 5, "wrong type"); +diff --git a/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp b/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp +index dd23f005b..d1ecbaeb4 100644 +--- a/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp ++++ b/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp +@@ -896,6 +896,13 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) { + } + } + ++void LIRGenerator::do_dgemm_dgemm(Intrinsic* x) { ++ fatal("BLAS intrinsics are not implemented on this platform!"); ++} ++ ++void LIRGenerator::do_dgemv_dgemv(Intrinsic *x) { ++ fatal("Blas intrinsics are not implemented on this platform!"); ++} + + void LIRGenerator::do_ArrayCopy(Intrinsic* x) { + assert(x->number_of_arguments() == 5, "wrong type"); +diff --git a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp +index 459315cb7..79b2b2bb1 100644 +--- a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp ++++ b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp +@@ -3672,6 +3672,20 @@ bool GraphBuilder::try_inline_intrinsics(ciMethod* callee) { + case vmIntrinsics::_fullFence : + break; + ++ case vmIntrinsics::_dgemm_dgemm: ++ if (!UseF2jBLASIntrinsics || (StubRoutines::dgemmDgemm() == NULL)) { ++ return false; ++ } ++ cantrap = false; ++ preserves_state = true; ++ break; ++ ++ case vmIntrinsics::_dgemv_dgemv: ++ if (!UseF2jBLASIntrinsics || (StubRoutines::dgemvDgemv() == NULL)) return false; ++ cantrap = false; ++ preserves_state = true; ++ break; ++ + default : return false; // do not inline + } + // create intrinsic node +diff --git a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp +index 65c04e3e5..070fd8052 100644 
+--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp ++++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp +@@ -1208,7 +1208,7 @@ void LIRGenerator::do_Return(Return* x) { + set_no_result(x); + } + +-// Examble: ref.get() ++// Example: ref.get() + // Combination of LoadField and g1 pre-write barrier + void LIRGenerator::do_Reference_get(Intrinsic* x) { + +@@ -1220,7 +1220,7 @@ void LIRGenerator::do_Reference_get(Intrinsic* x) { + LIRItem reference(x->argument_at(0), this); + reference.load_item(); + +- // need to perform the null check on the reference objecy ++ // need to perform the null check on the reference object + CodeEmitInfo* info = NULL; + if (x->needs_null_check()) { + info = state_for(x); +@@ -1422,6 +1422,44 @@ LIR_Opr LIRGenerator::load_constant(LIR_Const* c) { + return result; + } + ++// Access the char-array of String ++LIR_Opr LIRGenerator::load_String_value(LIR_Opr str) { ++ int value_offset = java_lang_String::value_offset_in_bytes(); ++ LIR_Opr value = new_register(T_ARRAY); ++ LIR_Opr tmp = new_pointer_register(); ++ ++ __ add(str, LIR_OprFact::intConst(value_offset), tmp); ++ LIR_Address* array_addr = new LIR_Address(tmp, T_ARRAY); ++#if INCLUDE_ALL_GCS ++ if (UseShenandoahGC) { ++ LIR_Opr tmp = new_register(T_OBJECT); ++ LIR_Opr addr = ShenandoahBarrierSet::barrier_set()->bsc1()->resolve_address(this, array_addr, T_OBJECT, NULL); ++ __ load(addr->as_address_ptr(), tmp); ++ tmp = ShenandoahBarrierSet::barrier_set()->bsc1()->load_reference_barrier(this, tmp, addr); ++ __ move(tmp, value); ++ } else ++#endif ++ __ load(array_addr, value); ++ ++ return value; ++} ++ ++LIR_Opr LIRGenerator::load_String_offset(LIR_Opr str) { ++ LIR_Opr offset = new_register(T_INT); ++ ++ if (java_lang_String::has_offset_field()) { ++ LIR_Opr tmp = new_pointer_register(); ++ int offset_offset = java_lang_String::offset_offset_in_bytes(); ++ __ add(str, LIR_OprFact::intConst(offset_offset), tmp); ++ LIR_Address* addr = new LIR_Address(tmp, T_INT); ++ __ load(addr, offset); ++ } else { ++ offset = LIR_OprFact::intConst(0); ++ } ++ ++ return offset; ++} ++ + // Various barriers + + void LIRGenerator::pre_barrier(LIR_Opr addr_opr, LIR_Opr pre_val, +@@ -3290,6 +3328,14 @@ void LIRGenerator::do_Intrinsic(Intrinsic* x) { + do_update_CRC32(x); + break; + ++ case vmIntrinsics::_dgemm_dgemm: ++ do_dgemm_dgemm(x); ++ break; ++ ++ case vmIntrinsics::_dgemv_dgemv: ++ do_dgemv_dgemv(x); ++ break; ++ + default: ShouldNotReachHere(); break; + } + } +diff --git a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp +index 24d072b36..57d675c5b 100644 +--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp ++++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp +@@ -210,6 +210,10 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure { + // Given an immediate value, return an operand usable in logical ops. 
+ LIR_Opr load_immediate(int x, BasicType type); + ++ // Get String value and offset ++ LIR_Opr load_String_value(LIR_Opr str); ++ LIR_Opr load_String_offset(LIR_Opr str); ++ + void set_result(Value x, LIR_Opr opr) { + assert(opr->is_valid(), "must set to valid value"); + assert(x->operand()->is_illegal(), "operand should never change"); +@@ -251,6 +255,8 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure { + void do_FPIntrinsics(Intrinsic* x); + void do_Reference_get(Intrinsic* x); + void do_update_CRC32(Intrinsic* x); ++ void do_dgemm_dgemm(Intrinsic* x); ++ void do_dgemv_dgemv(Intrinsic* x); + + void do_UnsafePrefetch(UnsafePrefetch* x, bool is_store); + +diff --git a/hotspot/src/share/vm/c1/c1_Runtime1.cpp b/hotspot/src/share/vm/c1/c1_Runtime1.cpp +index f379a0395..3ece7f6ea 100644 +--- a/hotspot/src/share/vm/c1/c1_Runtime1.cpp ++++ b/hotspot/src/share/vm/c1/c1_Runtime1.cpp +@@ -305,6 +305,8 @@ const char* Runtime1::name_for_address(address entry) { + FUNCTION_CASE(entry, JFR_TIME_FUNCTION); + #endif + FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32()); ++ FUNCTION_CASE(entry, StubRoutines::dgemmDgemm()); ++ FUNCTION_CASE(entry, StubRoutines::dgemvDgemv()); + + #undef FUNCTION_CASE + +diff --git a/hotspot/src/share/vm/classfile/vmSymbols.cpp b/hotspot/src/share/vm/classfile/vmSymbols.cpp +index a5f89dbf8..34514022a 100644 +--- a/hotspot/src/share/vm/classfile/vmSymbols.cpp ++++ b/hotspot/src/share/vm/classfile/vmSymbols.cpp +@@ -333,6 +333,8 @@ bool vmIntrinsics::should_be_pinned(vmIntrinsics::ID id) { + #endif + case vmIntrinsics::_currentTimeMillis: + case vmIntrinsics::_nanoTime: ++ case vmIntrinsics::_dgemm_dgemm: ++ case vmIntrinsics::_dgemv_dgemv: + return true; + default: + return false; +diff --git a/hotspot/src/share/vm/classfile/vmSymbols.hpp b/hotspot/src/share/vm/classfile/vmSymbols.hpp +index 6bd8dbedd..942d172a1 100644 +--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp ++++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp +@@ -857,6 +857,14 @@ + do_intrinsic(_f2jblas_ddot, com_github_fommil_netlib_f2jblas, ddot_name, ddot_signature, F_R) \ + do_name( ddot_name, "ddot") \ + do_signature(ddot_signature, "(I[DI[DI)D") \ ++ do_class(org_netlib_blas_dgemm, "org/netlib/blas/Dgemm") \ ++ do_intrinsic(_dgemm_dgemm, org_netlib_blas_dgemm, dgemm_name, dgemm_signature, F_S) \ ++ do_name( dgemm_name, "dgemm") \ ++ do_signature(dgemm_signature, "(Ljava/lang/String;Ljava/lang/String;IIID[DII[DIID[DII)V") \ ++ do_class(org_netlib_blas_dgemv, "org/netlib/blas/Dgemv") \ ++ do_intrinsic(_dgemv_dgemv, org_netlib_blas_dgemv, dgemv_name, dgemv_signature, F_S) \ ++ do_name( dgemv_name, "dgemv") \ ++ do_signature(dgemv_signature, "(Ljava/lang/String;IID[DII[DIID[DII)V") \ + \ + /* support for sun.security.provider.SHA2 */ \ + do_class(sun_security_provider_sha2, "sun/security/provider/SHA2") \ +diff --git a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp +index e14c50bf0..293382b3c 100644 +--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp ++++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp +@@ -100,6 +100,8 @@ class AbstractInterpreter: AllStatic { + java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update() + java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes() + java_util_zip_CRC32_updateByteBuffer, // implementation of java.util.zip.CRC32.updateByteBuffer() ++ org_netlib_blas_Dgemm_dgemm, // implementation of 
org.netlib.blas.Dgemm.dgemm() ++ org_netlib_blas_Dgemv_dgemv, // implementation of org.netlib.blas.Dgemv.dgemv() + number_of_method_entries, + invalid = -1 + }; +diff --git a/hotspot/src/share/vm/interpreter/cppInterpreter.cpp b/hotspot/src/share/vm/interpreter/cppInterpreter.cpp +index 0007aa8be..9e48a1d94 100644 +--- a/hotspot/src/share/vm/interpreter/cppInterpreter.cpp ++++ b/hotspot/src/share/vm/interpreter/cppInterpreter.cpp +@@ -31,17 +31,20 @@ + #ifdef CC_INTERP + # define __ _masm-> + +-void CppInterpreter::initialize() { ++void CppInterpreter::initialize_stub() { + if (_code != NULL) return; ++ int code_size = InterpreterCodeSize; ++ NOT_PRODUCT(code_size *= 4;) // debug uses extra interpreter code space ++ _code = new StubQueue(new InterpreterCodeletInterface, code_size, NULL, ++ "Interpreter"); ++} ++ ++void CppInterpreter::initialize_code() { + AbstractInterpreter::initialize(); + + // generate interpreter + { ResourceMark rm; + TraceTime timer("Interpreter generation", TraceStartupTime); +- int code_size = InterpreterCodeSize; +- NOT_PRODUCT(code_size *= 4;) // debug uses extra interpreter code space +- _code = new StubQueue(new InterpreterCodeletInterface, code_size, NULL, +- "Interpreter"); + InterpreterGenerator g(_code); + if (PrintInterpreter) print(); + } +diff --git a/hotspot/src/share/vm/interpreter/cppInterpreter.hpp b/hotspot/src/share/vm/interpreter/cppInterpreter.hpp +index 6a6447503..58efcfaf2 100644 +--- a/hotspot/src/share/vm/interpreter/cppInterpreter.hpp ++++ b/hotspot/src/share/vm/interpreter/cppInterpreter.hpp +@@ -54,7 +54,8 @@ class CppInterpreter: public AbstractInterpreter { + + public: + // Initialization/debugging +- static void initialize(); ++ static void initialize_stub(); ++ static void initialize_code(); + // this only returns whether a pc is within generated code for the interpreter. + + // This is a moderately dubious interface for the c++ interpreter. Only +diff --git a/hotspot/src/share/vm/interpreter/interpreter.cpp b/hotspot/src/share/vm/interpreter/interpreter.cpp +index 7ce4bdbb3..a313f2e63 100644 +--- a/hotspot/src/share/vm/interpreter/interpreter.cpp ++++ b/hotspot/src/share/vm/interpreter/interpreter.cpp +@@ -85,8 +85,6 @@ void InterpreterCodelet::print_on(outputStream* st) const { + // Implementation of platform independent aspects of Interpreter + + void AbstractInterpreter::initialize() { +- if (_code != NULL) return; +- + // make sure 'imported' classes are initialized + if (CountBytecodes || TraceBytecodes || StopInterpreterAt) BytecodeCounter::reset(); + if (PrintBytecodeHistogram) BytecodeHistogram::reset(); +@@ -114,8 +112,22 @@ void AbstractInterpreter::print() { + } + + +-void interpreter_init() { +- Interpreter::initialize(); ++// The reason that interpreter initialization is split into two parts is that the first part ++// needs to run before methods are loaded (which with CDS implies linked also), and the other ++// part needs to run after. The reason is that when methods are loaded (with CDS) or linked ++// (without CDS), the i2c adapters are generated that assert we are currently in the interpreter. ++// Asserting that requires knowledge about where the interpreter is in memory. Therefore, ++// establishing the interpreter address must be done before methods are loaded. However, ++// we would like to actually generate the interpreter after methods are loaded. That allows ++// us to remove otherwise hardcoded offsets regarding fields that are needed in the interpreter ++// code. This leads to a split if 1. 
reserving the memory for the interpreter, 2. loading methods ++// and 3. generating the interpreter. ++void interpreter_init_stub() { ++ Interpreter::initialize_stub(); ++} ++ ++void interpreter_init_code() { ++ Interpreter::initialize_code(); + #ifndef PRODUCT + if (TraceBytecodes) BytecodeTracer::set_closure(BytecodeTracer::std_closure()); + #endif // PRODUCT +@@ -251,6 +263,13 @@ AbstractInterpreter::MethodKind AbstractInterpreter::method_kind(methodHandle m) + return java_lang_ref_reference_get; + } + ++ if (UseF2jBLASIntrinsics) { ++ switch (m->intrinsic_id()) { ++ case vmIntrinsics::_dgemm_dgemm: return org_netlib_blas_Dgemm_dgemm; ++ case vmIntrinsics::_dgemv_dgemv: return org_netlib_blas_Dgemv_dgemv; ++ } ++ } ++ + // Accessor method? + if (m->is_accessor()) { + assert(m->size_of_parameters() == 1, "fast code for accessors assumes parameter size = 1"); +@@ -311,6 +330,8 @@ void AbstractInterpreter::print_method_kind(MethodKind kind) { + case java_util_zip_CRC32_update : tty->print("java_util_zip_CRC32_update"); break; + case java_util_zip_CRC32_updateBytes : tty->print("java_util_zip_CRC32_updateBytes"); break; + case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break; ++ case org_netlib_blas_Dgemm_dgemm : tty->print("org_netlib_blas_Dgemm_dgemm"); break; ++ case org_netlib_blas_Dgemv_dgemv : tty->print("org_netlib_blas_Dgemv_dgemv"); break; + default: + if (kind >= method_handle_invoke_FIRST && + kind <= method_handle_invoke_LAST) { +diff --git a/hotspot/src/share/vm/interpreter/templateInterpreter.cpp b/hotspot/src/share/vm/interpreter/templateInterpreter.cpp +index 1520c7b1c..f38f05117 100644 +--- a/hotspot/src/share/vm/interpreter/templateInterpreter.cpp ++++ b/hotspot/src/share/vm/interpreter/templateInterpreter.cpp +@@ -32,12 +32,20 @@ + + # define __ _masm-> + +-void TemplateInterpreter::initialize() { ++void TemplateInterpreter::initialize_stub() { + if (_code != NULL) return; + // assertions + assert((int)Bytecodes::number_of_codes <= (int)DispatchTable::length, + "dispatch table too small"); + ++ // allocate interpreter ++ int code_size = InterpreterCodeSize; ++ NOT_PRODUCT(code_size *= 4;) // debug uses extra interpreter code space ++ _code = new StubQueue(new InterpreterCodeletInterface, code_size, NULL, ++ "Interpreter"); ++} ++ ++void TemplateInterpreter::initialize_code() { + AbstractInterpreter::initialize(); + + TemplateTable::initialize(); +@@ -45,10 +53,6 @@ void TemplateInterpreter::initialize() { + // generate interpreter + { ResourceMark rm; + TraceTime timer("Interpreter generation", TraceStartupTime); +- int code_size = InterpreterCodeSize; +- NOT_PRODUCT(code_size *= 4;) // debug uses extra interpreter code space +- _code = new StubQueue(new InterpreterCodeletInterface, code_size, NULL, +- "Interpreter"); + InterpreterGenerator g(_code); + if (PrintInterpreter) print(); + } +@@ -401,6 +405,11 @@ void TemplateInterpreterGenerator::generate_all() { + method_entry(java_util_zip_CRC32_updateByteBuffer) + } + ++ if (UseF2jBLASIntrinsics) { ++ method_entry(org_netlib_blas_Dgemm_dgemm) ++ method_entry(org_netlib_blas_Dgemv_dgemv) ++ } ++ + initialize_method_handle_entries(); + + // all native method kinds (must be one contiguous block) +diff --git a/hotspot/src/share/vm/interpreter/templateInterpreter.hpp b/hotspot/src/share/vm/interpreter/templateInterpreter.hpp +index 5f76dca8a..96da6353c 100644 +--- a/hotspot/src/share/vm/interpreter/templateInterpreter.hpp ++++ 
b/hotspot/src/share/vm/interpreter/templateInterpreter.hpp
+@@ -132,7 +132,8 @@ class TemplateInterpreter: public AbstractInterpreter {
+
+ public:
+ // Initialization/debugging
+- static void initialize();
++ static void initialize_stub();
++ static void initialize_code();
+ // this only returns whether a pc is within generated code for the interpreter.
+ static bool contains(address pc) { return _code != NULL && _code->contains(pc); }
+
+diff --git a/hotspot/src/share/vm/opto/escape.cpp b/hotspot/src/share/vm/opto/escape.cpp
+index 68631dbf2..0e0cc1028 100644
+--- a/hotspot/src/share/vm/opto/escape.cpp
++++ b/hotspot/src/share/vm/opto/escape.cpp
+@@ -979,7 +979,9 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
+ strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
+ strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
+ strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0 ||
+- strcmp(call->as_CallLeaf()->_name, "f2jblas_ddot") == 0)
++ strcmp(call->as_CallLeaf()->_name, "f2jblas_ddot") == 0 ||
++ strcmp(call->as_CallLeaf()->_name, "dgemm_dgemm") == 0 ||
++ strcmp(call->as_CallLeaf()->_name, "dgemv_dgemv") == 0)
+ ))) {
+ call->dump();
+ fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
+diff --git a/hotspot/src/share/vm/opto/graphKit.cpp b/hotspot/src/share/vm/opto/graphKit.cpp
+index 41a067ce2..1c3bc2e8c 100644
+--- a/hotspot/src/share/vm/opto/graphKit.cpp
++++ b/hotspot/src/share/vm/opto/graphKit.cpp
+@@ -2372,7 +2372,11 @@ Node* GraphKit::make_runtime_call(int flags,
+ Node* parm0, Node* parm1,
+ Node* parm2, Node* parm3,
+ Node* parm4, Node* parm5,
+- Node* parm6, Node* parm7) {
++ Node* parm6, Node* parm7,
++ Node* parm8, Node* parm9,
++ Node* parm10, Node* parm11,
++ Node* parm12, Node* parm13,
++ Node* parm14, Node* parm15) {
+ // Slow-path call
+ bool is_leaf = !(flags & RC_NO_LEAF);
+ bool has_io = (!is_leaf && !(flags & RC_NO_IO));
+@@ -2415,7 +2419,15 @@ Node* GraphKit::make_runtime_call(int flags,
+ if (parm5 != NULL) { call->init_req(TypeFunc::Parms+5, parm5);
+ if (parm6 != NULL) { call->init_req(TypeFunc::Parms+6, parm6);
+ if (parm7 != NULL) { call->init_req(TypeFunc::Parms+7, parm7);
+- /* close each nested if ===> */ } } } } } } } }
++ if (parm8 != NULL) { call->init_req(TypeFunc::Parms+8, parm8);
++ if (parm9 != NULL) { call->init_req(TypeFunc::Parms+9, parm9);
++ if (parm10 != NULL) { call->init_req(TypeFunc::Parms+10, parm10);
++ if (parm11 != NULL) { call->init_req(TypeFunc::Parms+11, parm11);
++ if (parm12 != NULL) { call->init_req(TypeFunc::Parms+12, parm12);
++ if (parm13 != NULL) { call->init_req(TypeFunc::Parms+13, parm13);
++ if (parm14 != NULL) { call->init_req(TypeFunc::Parms+14, parm14);
++ if (parm15 != NULL) { call->init_req(TypeFunc::Parms+15, parm15);
++ /* close each nested if ===> */ } } } } } } } } } } } } } } } }
+ assert(call->in(call->req()-1) != NULL, "must initialize all parms");
+
+ if (!is_leaf) {
+diff --git a/hotspot/src/share/vm/opto/graphKit.hpp b/hotspot/src/share/vm/opto/graphKit.hpp
+index 7a363fd33..e9a061acf 100644
+--- a/hotspot/src/share/vm/opto/graphKit.hpp
++++ b/hotspot/src/share/vm/opto/graphKit.hpp
+@@ -818,7 +818,11 @@ class GraphKit : public Phase {
+ Node* parm0 = NULL, Node* parm1 = NULL,
+ Node* parm2 = NULL, Node* parm3 = NULL,
+ Node* parm4 = NULL, Node* parm5 = NULL,
+- Node* parm6 = NULL, Node* parm7 = NULL);
++ Node* parm6 = NULL, Node* parm7 = NULL,
++ Node* parm8 = NULL, Node* parm9 = NULL,
++ Node* parm10 = NULL, Node* parm11 = NULL,
++ Node* parm12 
= NULL, Node* parm13 = NULL, ++ Node* parm14 = NULL, Node* parm15 = NULL); + enum { // flag values for make_runtime_call + RC_NO_FP = 1, // CallLeafNoFPNode + RC_NO_IO = 2, // do not hook IO edges +diff --git a/hotspot/src/share/vm/opto/library_call.cpp b/hotspot/src/share/vm/opto/library_call.cpp +index 5cbc0f012..10eeea217 100644 +--- a/hotspot/src/share/vm/opto/library_call.cpp ++++ b/hotspot/src/share/vm/opto/library_call.cpp +@@ -336,6 +336,8 @@ class LibraryCallKit : public GraphKit { + bool inline_montgomeryMultiply(); + bool inline_montgomerySquare(); + bool inline_ddotF2jBLAS(); ++ bool inline_dgemmDgemm(); ++ bool inline_dgemvDgemv(); + + bool inline_profileBoolean(); + }; +@@ -589,6 +591,8 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) { + break; + + case vmIntrinsics::_f2jblas_ddot: ++ case vmIntrinsics::_dgemm_dgemm: ++ case vmIntrinsics::_dgemv_dgemv: + if (!UseF2jBLASIntrinsics) return NULL; + break; + +@@ -988,9 +992,13 @@ bool LibraryCallKit::try_to_inline(int predicate) { + + case vmIntrinsics::_profileBoolean: + return inline_profileBoolean(); ++ + case vmIntrinsics::_f2jblas_ddot: + return inline_ddotF2jBLAS(); +- ++ case vmIntrinsics::_dgemm_dgemm: ++ return inline_dgemmDgemm(); ++ case vmIntrinsics::_dgemv_dgemv: ++ return inline_dgemvDgemv(); + default: + // If you get here, it may be that someone has added a new intrinsic + // to the list in vmSymbols.hpp without implementing it here. +@@ -6353,6 +6361,144 @@ bool LibraryCallKit::inline_ddotF2jBLAS() { + return true; + } + ++/** ++ * double org.netlib.blas.Dgemm.dgemm(java.lang.String transa, ++ * java.lang.String transb, int m, int n, int k, ++ * double alpha, double[] a, int offset_a, int lda, ++ * double[] b, int offset_b, int ldb, double beta, ++ * double[] c, int offset_c, int Ldc) ++ */ ++bool LibraryCallKit::inline_dgemmDgemm() { ++ assert(callee()->signature()->count() == 16, "Dgemm.dgemm has 16 parameters"); ++ ++ address stubAddr = StubRoutines::dgemmDgemm(); ++ if (stubAddr == NULL) return false; ++ ++ Node* transa = argument(0); ++ Node* transb = argument(1); ++ Node* m = argument(2); ++ Node* n = argument(3); ++ Node* k = argument(4); ++ Node* alpha = round_double_node(argument(5)); ++ Node* a = argument(7); ++ Node* a_offset = argument(8); ++ Node* lda = argument(9); ++ Node* b = argument(10); ++ Node* b_offset = argument(11); ++ Node* ldb = argument(12); ++ Node* beta = round_double_node(argument(13)); ++ Node* c = argument(15); ++ Node* c_offset = argument(16); ++ Node* ldc = argument(17); ++ ++ const Type* a_type = a->Value(&_gvn); ++ const Type* b_type = b->Value(&_gvn); ++ const Type* c_type = c->Value(&_gvn); ++ const TypeAryPtr* a_base_type = a_type->isa_aryptr(); ++ const TypeAryPtr* b_base_type = b_type->isa_aryptr(); ++ const TypeAryPtr* c_base_type = c_type->isa_aryptr(); ++ if (a_base_type == NULL || b_base_type == NULL || c_base_type == NULL) return false; ++ ++ ciKlass* a_klass = a_base_type->klass(); ++ ciKlass* b_klass = b_base_type->klass(); ++ ciKlass* c_klass = c_base_type->klass(); ++ if (a_klass == NULL || b_klass == NULL || c_klass == NULL) return false; ++ ++ BasicType a_elem_type = a_klass->as_array_klass()->element_type()->basic_type(); ++ BasicType b_elem_type = b_klass->as_array_klass()->element_type()->basic_type(); ++ BasicType c_elem_type = a_klass->as_array_klass()->element_type()->basic_type(); ++ if (a_elem_type != T_DOUBLE || b_elem_type != T_DOUBLE || c_elem_type != T_DOUBLE) return false; ++ ++ // get array a/b/c's addr ++ Node* a_start = 
array_element_address(a, a_offset, a_elem_type); ++ Node* b_start = array_element_address(b, b_offset, b_elem_type); ++ Node* c_start = array_element_address(c, c_offset, c_elem_type); ++ ++ // Get start addr of string ++ Node* transa_value = load_String_value(NULL, transa); ++ Node* transa_offset = load_String_offset(NULL, transa); ++ Node* transa_start = array_element_address(transa_value, transa_offset, T_CHAR); ++ Node* transb_value = load_String_value(NULL, transb); ++ Node* transb_offset = load_String_offset(NULL, transb); ++ Node* transb_start = array_element_address(transb_value, transb_offset, T_CHAR); ++ ++ const char *stubName = "dgemm_dgemm"; ++ make_runtime_call(RC_LEAF, OptoRuntime::dgemmDgemm_Type(), ++ stubAddr, stubName, TypePtr::BOTTOM, ++ transa_start, transb_start, m, n, k, alpha, top(), ++ a_start, lda, b_start, ldb, beta, top(), c_start, ldc); ++ ++ return true; ++} ++ ++/** ++ * void org.netlib.blas.Dgemv.dgemv(string trans, int m, int n, double alpha, ++ * double[] a, int _a_offset, int lda, ++ * double[] x, int _x_offset, int incx, double beta, ++ * double[] y, int _y_offset, int incy) ++ */ ++bool LibraryCallKit::inline_dgemvDgemv() { ++ assert(callee()->signature()->count() == 14, "F2jBLAS.dgemv has 14 parameters"); ++ Node* trans = argument(0); ++ Node* m = argument(1); ++ Node* n = argument(2); ++ Node* alpha = round_double_node(argument(3)); ++ Node* a = argument(5); ++ Node* a_offset = argument(6); ++ Node* lda = argument(7); ++ Node* x = argument(8); ++ Node* x_offset = argument(9); ++ Node* incx = argument(10); ++ Node* beta = round_double_node(argument(11)); ++ Node* y = argument(13); ++ Node* y_offset = argument(14); ++ Node* incy = argument(15); ++ ++ const Type* a_type = a->Value(&_gvn); ++ const Type* x_type = x->Value(&_gvn); ++ const Type* y_type = y->Value(&_gvn); ++ const TypeAryPtr* a_base_type = a_type->isa_aryptr(); ++ const TypeAryPtr* x_base_type = x_type->isa_aryptr(); ++ const TypeAryPtr* y_base_type = y_type->isa_aryptr(); ++ if (a_base_type == NULL || x_base_type == NULL || y_base_type == NULL) return false; ++ ++ ciKlass* a_klass = a_base_type->klass(); ++ ciKlass* x_klass = x_base_type->klass(); ++ ciKlass* y_klass = y_base_type->klass(); ++ ++ if (a_klass == NULL || x_klass == NULL || y_klass == NULL) return false; ++ ++ BasicType a_elem_type = a_klass->as_array_klass()->element_type()->basic_type(); ++ BasicType x_elem_type = x_klass->as_array_klass()->element_type()->basic_type(); ++ BasicType y_elem_type = y_klass->as_array_klass()->element_type()->basic_type(); ++ ++ if (a_elem_type != T_DOUBLE || x_elem_type != T_DOUBLE || y_elem_type != T_DOUBLE) return false; ++ ++ ++ address stubAddr = StubRoutines::dgemvDgemv(); ++ if (stubAddr == NULL) return false; ++ ++ // 'a_start' points to array a + scaled offset ++ Node* a_start = array_element_address(a, a_offset, a_elem_type); ++ // 'x_start' points to array x + scaled offset ++ Node* x_start = array_element_address(x, x_offset, x_elem_type); ++ // 'y_start' points to array y + scaled offset ++ Node* y_start = array_element_address(y, y_offset, y_elem_type); ++ ++ Node* no_ctrl = NULL; ++ ++ // get start addr of string ++ Node* trans_value = load_String_value(no_ctrl, trans); ++ Node* trans_offset = load_String_offset(no_ctrl, trans); ++ Node* trans_start = array_element_address(trans_value, trans_offset, T_CHAR); ++ ++ const char *stubName = "dgemv_dgemv"; ++ Node* call = make_runtime_call(RC_LEAF, OptoRuntime::dgemvDgemv_Type(), stubAddr, stubName, ++ TypePtr::BOTTOM, trans_start, m, 
n, alpha, top(), a_start, ++ lda, x_start, incx, beta, top(), y_start, incy); ++ return true; ++} ++ + /** + * Calculate CRC32 for ByteBuffer. + * int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len) +diff --git a/hotspot/src/share/vm/opto/runtime.cpp b/hotspot/src/share/vm/opto/runtime.cpp +index f1fe4d666..dc8f0c774 100644 +--- a/hotspot/src/share/vm/opto/runtime.cpp ++++ b/hotspot/src/share/vm/opto/runtime.cpp +@@ -944,6 +944,81 @@ const TypeFunc* OptoRuntime::ddotF2jBLAS_Type() { + return TypeFunc::make(domain, range); + } + ++/** ++ * double org.netlib.blas.Dgemm.dgemm(java.lang.String transa, ++ * java.lang.String transb, int m, int n, int k, ++ * double alpha, double[] a, int offset_a, int lda, ++ * double[] b, int offset_b, int ldb, double beta, ++ * double[] c, int offset_c, int Ldc) ++ */ ++const TypeFunc* OptoRuntime::dgemmDgemm_Type() { ++ // create input type (domain) ++ int num_args = 15; ++ int argcnt = num_args; ++ const Type** fields = TypeTuple::fields(argcnt); ++ int argp = TypeFunc::Parms; ++ ++ fields[argp++] = TypeAryPtr::CHARS; // char[] ++ fields[argp++] = TypeAryPtr::CHARS; // char[] ++ fields[argp++] = TypeInt::INT; // int m ++ fields[argp++] = TypeInt::INT; // int n ++ fields[argp++] = TypeInt::INT; // int k ++ fields[argp++] = Type::DOUBLE; // double alpha ++ fields[argp++] = Type::HALF; ++ fields[argp++] = TypeAryPtr::DOUBLES; // double[] a ++ fields[argp++] = TypeInt::INT; // int lda ++ fields[argp++] = TypeAryPtr::DOUBLES; // double[] b ++ fields[argp++] = TypeInt::INT; // int ldb ++ fields[argp++] = Type::DOUBLE; // double beta ++ fields[argp++] = Type::HALF; ++ fields[argp++] = TypeAryPtr::DOUBLES; // double[] c ++ fields[argp++] = TypeInt::INT; // int ldc ++ assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); ++ const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields); ++ ++ // no result type needed ++ fields = TypeTuple::fields(1); ++ fields[TypeFunc::Parms + 0] = NULL; // void ++ const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); ++ return TypeFunc::make(domain, range); ++} ++ ++/** ++ * void dgemv(String trans, int m, int n, double alpha, ++ * double[] a, int _a_offset, int lda, ++ * double[] x, int _x_offset, int incx, double beta, ++ * double[] y, int _y_offset, int incy) ++ */ ++const TypeFunc* OptoRuntime::dgemvDgemv_Type() { ++ // create input type (domain) ++ int num_args = 13; ++ int argcnt = num_args; ++ const Type** fields = TypeTuple::fields(argcnt); ++ int argp = TypeFunc::Parms; ++ ++ fields[argp++] = TypeAryPtr::CHARS; // char[] ++ fields[argp++] = TypeInt::INT; // int m ++ fields[argp++] = TypeInt::INT; // int n ++ fields[argp++] = Type::DOUBLE; // double alpha ++ fields[argp++] = Type::HALF; ++ fields[argp++] = TypeAryPtr::DOUBLES; // double[] a ++ fields[argp++] = TypeInt::INT; // int lda ++ fields[argp++] = TypeAryPtr::DOUBLES; // double[] x ++ fields[argp++] = TypeInt::INT; // int incx ++ fields[argp++] = Type::DOUBLE; // double beta ++ fields[argp++] = Type::HALF; ++ fields[argp++] = TypeAryPtr::DOUBLES; // double[] y ++ fields[argp++] = TypeInt::INT; // int incy ++ assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); ++ const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields); ++ ++ // no result type needed ++ fields = TypeTuple::fields(1); ++ fields[TypeFunc::Parms + 0] = NULL; // void ++ const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); ++ return TypeFunc::make(domain, range); ++} ++ + // for cipherBlockChaining calls 
of aescrypt encrypt/decrypt, four pointers and a length, returning int + const TypeFunc* OptoRuntime::cipherBlockChaining_aescrypt_Type() { + // create input type (domain) +diff --git a/hotspot/src/share/vm/opto/runtime.hpp b/hotspot/src/share/vm/opto/runtime.hpp +index 66d393c5c..e07c34c15 100644 +--- a/hotspot/src/share/vm/opto/runtime.hpp ++++ b/hotspot/src/share/vm/opto/runtime.hpp +@@ -318,6 +318,8 @@ private: + static const TypeFunc* updateBytesCRC32_Type(); + + static const TypeFunc* ddotF2jBLAS_Type(); ++ static const TypeFunc* dgemmDgemm_Type(); ++ static const TypeFunc* dgemvDgemv_Type(); + + // leaf on stack replacement interpreter accessor types + static const TypeFunc* osr_end_Type(); +diff --git a/hotspot/src/share/vm/runtime/init.cpp b/hotspot/src/share/vm/runtime/init.cpp +index 1512ccc96..4c133bd4e 100644 +--- a/hotspot/src/share/vm/runtime/init.cpp ++++ b/hotspot/src/share/vm/runtime/init.cpp +@@ -54,7 +54,8 @@ void VM_Version_init(); + void os_init_globals(); // depends on VM_Version_init, before universe_init + void stubRoutines_init1(); + jint universe_init(); // depends on codeCache_init and stubRoutines_init +-void interpreter_init(); // before any methods loaded ++void interpreter_init_stub(); // before any methods loaded ++void interpreter_init_code(); // after methods loaded, but before they are linked + void invocationCounter_init(); // before any methods loaded + void marksweep_init(); + void accessFlags_init(); +@@ -106,7 +107,7 @@ jint init_globals() { + if (status != JNI_OK) + return status; + +- interpreter_init(); // before any methods loaded ++ interpreter_init_stub(); // before methods get loaded + invocationCounter_init(); // before any methods loaded + marksweep_init(); + accessFlags_init(); +@@ -114,6 +115,7 @@ jint init_globals() { + InterfaceSupport_init(); + SharedRuntime::generate_stubs(); + universe2_init(); // dependent on codeCache_init and stubRoutines_init1 ++ interpreter_init_code(); // after universe2_init and before any method gets linked + referenceProcessor_init(); + jni_handles_init(); + #if INCLUDE_VM_STRUCTS +diff --git a/hotspot/src/share/vm/runtime/stubRoutines.cpp b/hotspot/src/share/vm/runtime/stubRoutines.cpp +index 10f438bc5..f2106d13a 100644 +--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp ++++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp +@@ -136,7 +136,10 @@ address StubRoutines::_sha512_implCompressMB = NULL; + address StubRoutines::_updateBytesCRC32 = NULL; + address StubRoutines::_crc_table_adr = NULL; + ++address StubRoutines::_BLAS_library = NULL; + address StubRoutines::_ddotF2jBLAS = NULL; ++address StubRoutines::_dgemmDgemm = NULL; ++address StubRoutines::_dgemvDgemv = NULL; + + address StubRoutines::_multiplyToLen = NULL; + address StubRoutines::_squareToLen = NULL; +diff --git a/hotspot/src/share/vm/runtime/stubRoutines.hpp b/hotspot/src/share/vm/runtime/stubRoutines.hpp +index a4eeb910d..16075d9f4 100644 +--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp ++++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp +@@ -214,7 +214,10 @@ class StubRoutines: AllStatic { + static address _updateBytesCRC32; + static address _crc_table_adr; + ++ static address _BLAS_library; + static address _ddotF2jBLAS; ++ static address _dgemmDgemm; ++ static address _dgemvDgemv; + + static address _multiplyToLen; + static address _squareToLen; +@@ -380,6 +383,8 @@ class StubRoutines: AllStatic { + static address crc_table_addr() { return _crc_table_adr; } + + static address ddotF2jBLAS() { return _ddotF2jBLAS; } ++ static address 
dgemmDgemm() { return _dgemmDgemm; } ++ static address dgemvDgemv() { return _dgemvDgemv; } + + static address multiplyToLen() {return _multiplyToLen; } + static address squareToLen() {return _squareToLen; } diff --git a/openjdk-1.8.0.spec b/openjdk-1.8.0.spec index c70b9a0..a818907 100644 --- a/openjdk-1.8.0.spec +++ b/openjdk-1.8.0.spec @@ -918,7 +918,7 @@ Provides: java-%{javaver}-%{origin}-accessibility%{?1} = %{epoch}:%{version}-%{r Name: java-%{javaver}-%{origin} Version: %{javaver}.%{updatever}.%{buildver} -Release: 11 +Release: 12 # java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons # and this change was brought into RHEL-4. java-1.5.0-ibm packages # also included the epoch in their virtual provides. This created a @@ -1107,6 +1107,7 @@ Patch193: improve_algorithmConstraints_checkAlgorithm_performance.patch Patch194: modify_the_default_iteration_time_and_forks_in_the_JMH_of_KAEProvider.patch Patch195: support_CMS_parallel_inspection.patch Patch196: g1gc-numa-aware-Implementation.patch +Patch197: implementation_of_Blas_hotspot_function_in_Intrinsics.patch ############################################# # @@ -1562,6 +1563,7 @@ pushd %{top_level_dir_name} %patch194 -p1 %patch195 -p1 %patch196 -p1 +%patch197 -p1 popd # System library fixes @@ -2178,7 +2180,10 @@ require "copy_jdk_configs.lua" %endif %changelog -* Sat Jun 12 2021 hu_bo_dao - 1:1.8.0.292-b10.11 +* Sat Jun 12 2021 kuenking111 - 1:1.8.0.292-b10.12 +- add implementation_of_Blas_hotspot_function_in_Intrinsics.patch + +* Sat Jun 12 2021 kuenking111 - 1:1.8.0.292-b10.11 - add g1gc-numa-aware-Implementation.patch * Wed Jun 10 2021 hu_bo_dao - 1:1.8.0.292-b10.10 -- Gitee From 68def87080d4a4d46548c7d8cbdad1c9fc0be364 Mon Sep 17 00:00:00 2001 From: kuenking111 Date: Wed, 16 Jun 2021 16:24:22 +0800 Subject: [PATCH 4/6] I3VT8V: fix G1GC memory leak in numa --- fix_G1GC_memory_leak_in_numa.patch | 38 ++++++++++++++++++++++++++++++ openjdk-1.8.0.spec | 7 +++++- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100755 fix_G1GC_memory_leak_in_numa.patch diff --git a/fix_G1GC_memory_leak_in_numa.patch b/fix_G1GC_memory_leak_in_numa.patch new file mode 100755 index 0000000..5bf5185 --- /dev/null +++ b/fix_G1GC_memory_leak_in_numa.patch @@ -0,0 +1,38 @@ +diff --git a/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.hpp b/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.hpp +index 9b26168a8..f6a80bf8d 100644 +--- a/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.hpp ++++ b/hotspot/src/share/vm/gc_implementation/g1/g1Allocator.hpp +@@ -45,6 +45,7 @@ protected: + public: + G1Allocator(G1CollectedHeap* heap) : + _g1h(heap), _summary_bytes_used(0) { } ++ virtual ~G1Allocator() { } + + // Node index of current thread. 
+ virtual uint current_node_index() const = 0; +@@ -126,7 +127,7 @@ protected: + + public: + G1DefaultAllocator(G1CollectedHeap* heap); +- ~G1DefaultAllocator(); ++ virtual ~G1DefaultAllocator(); + + uint current_node_index() const; + uint num_nodes() { return (uint)_num_alloc_regions; } +@@ -253,6 +254,7 @@ protected: + + public: + G1ParGCAllocator(G1CollectedHeap* g1h); ++ virtual ~G1ParGCAllocator() { } + + static G1ParGCAllocator* create_allocator(G1CollectedHeap* g1h); + +@@ -308,7 +310,7 @@ class G1DefaultParGCAllocator : public G1ParGCAllocator { + + public: + G1DefaultParGCAllocator(G1CollectedHeap* g1h); +- ~G1DefaultParGCAllocator(); ++ virtual ~G1DefaultParGCAllocator(); + + virtual G1ParGCAllocBuffer* alloc_buffer(InCSetState dest, AllocationContext_t context, uint node_index) { + assert(dest.is_valid(), diff --git a/openjdk-1.8.0.spec b/openjdk-1.8.0.spec index a818907..1770944 100644 --- a/openjdk-1.8.0.spec +++ b/openjdk-1.8.0.spec @@ -918,7 +918,7 @@ Provides: java-%{javaver}-%{origin}-accessibility%{?1} = %{epoch}:%{version}-%{r Name: java-%{javaver}-%{origin} Version: %{javaver}.%{updatever}.%{buildver} -Release: 12 +Release: 13 # java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons # and this change was brought into RHEL-4. java-1.5.0-ibm packages # also included the epoch in their virtual provides. This created a @@ -1108,6 +1108,7 @@ Patch194: modify_the_default_iteration_time_and_forks_in_the_JMH_of_KAEProvider. Patch195: support_CMS_parallel_inspection.patch Patch196: g1gc-numa-aware-Implementation.patch Patch197: implementation_of_Blas_hotspot_function_in_Intrinsics.patch +Patch198: fix_G1GC_memory_leak_in_numa.patch ############################################# # @@ -1564,6 +1565,7 @@ pushd %{top_level_dir_name} %patch195 -p1 %patch196 -p1 %patch197 -p1 +%patch198 -p1 popd # System library fixes @@ -2180,6 +2182,9 @@ require "copy_jdk_configs.lua" %endif %changelog +* Wed Jun 16 2021 kuenking111 - 1:1.8.0.292-b10.13 +- add fix_G1GC_memory_leak_in_numa.patch + * Sat Jun 12 2021 kuenking111 - 1:1.8.0.292-b10.12 - add implementation_of_Blas_hotspot_function_in_Intrinsics.patch -- Gitee From 20b163193944e373e4e0e9cb4927d373bdfd46d8 Mon Sep 17 00:00:00 2001 From: kuenking111 Date: Thu, 17 Jun 2021 19:02:54 +0800 Subject: [PATCH 5/6] I3W1BL: systemDictionary reslove class parser miss resourceMark --- openjdk-1.8.0.spec | 5 ++++- update-to-keep-same-with-master.patch | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/openjdk-1.8.0.spec b/openjdk-1.8.0.spec index 1770944..f63e9a9 100644 --- a/openjdk-1.8.0.spec +++ b/openjdk-1.8.0.spec @@ -918,7 +918,7 @@ Provides: java-%{javaver}-%{origin}-accessibility%{?1} = %{epoch}:%{version}-%{r Name: java-%{javaver}-%{origin} Version: %{javaver}.%{updatever}.%{buildver} -Release: 13 +Release: 14 # java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons # and this change was brought into RHEL-4. java-1.5.0-ibm packages # also included the epoch in their virtual provides. 
This created a @@ -2182,6 +2182,9 @@ require "copy_jdk_configs.lua" %endif %changelog +* Thu Jun 17 2021 kuenking111 - 1:1.8.0.292-b10.14 +- fix systemDictionary resolve_from_stream ResourceMark + * Wed Jun 16 2021 kuenking111 - 1:1.8.0.292-b10.13 - add fix_G1GC_memory_leak_in_numa.patch diff --git a/update-to-keep-same-with-master.patch b/update-to-keep-same-with-master.patch index aeae53f..032e067 100644 --- a/update-to-keep-same-with-master.patch +++ b/update-to-keep-same-with-master.patch @@ -23,7 +23,7 @@ index c3dec0a30..201dd9594 100644 *index = '\0'; // chop to just the package name while ((index = strchr(name, '/')) != NULL) { *index = '.'; // replace '/' with '.' in package name -@@ -1170,29 +1170,31 @@ Klass* SystemDictionary::resolve_from_stream(Symbol* class_name, +@@ -1170,29 +1170,32 @@ Klass* SystemDictionary::resolve_from_stream(Symbol* class_name, !class_loader.is_null() && parsed_name != NULL && parsed_name->utf8_length() >= (int)pkglen) { @@ -50,6 +50,7 @@ index c3dec0a30..201dd9594 100644 - Exceptions::_throw_msg(THREAD_AND_LOCATION, - vmSymbols::java_lang_SecurityException(), message); - } ++ ResourceMark rm(THREAD); + bool prohibited; + const jbyte* base = parsed_name->base(); + if ((base[0] | base[1] | base[2] | base[3] | base[4]) & 0x80) { -- Gitee From 1f4bec334afa64f094c5b4a12bb02043b0bb9454 Mon Sep 17 00:00:00 2001 From: kuenking111 Date: Mon, 28 Jun 2021 14:43:29 +0800 Subject: [PATCH 6/6] I3Y4ON: delete untrustworthy cacert soneraclass2ca --- ..._untrustworthy_cacert_soneraclass2ca.patch | 74 +++++++++++++++++++ openjdk-1.8.0.spec | 9 ++- 2 files changed, 81 insertions(+), 2 deletions(-) create mode 100755 delete_untrustworthy_cacert_soneraclass2ca.patch diff --git a/delete_untrustworthy_cacert_soneraclass2ca.patch b/delete_untrustworthy_cacert_soneraclass2ca.patch new file mode 100755 index 0000000..fe7f5c8 --- /dev/null +++ b/delete_untrustworthy_cacert_soneraclass2ca.patch @@ -0,0 +1,74 @@ +diff --git a/jdk/make/data/cacerts/soneraclass2ca b/jdk/make/data/cacerts/soneraclass2ca +deleted file mode 100644 +index 43faa5e2..00000000 +--- a/jdk/make/data/cacerts/soneraclass2ca ++++ /dev/null +@@ -1,26 +0,0 @@ +-Owner: CN=Sonera Class2 CA, O=Sonera, C=FI +-Issuer: CN=Sonera Class2 CA, O=Sonera, C=FI +-Serial number: 1d +-Valid from: Fri Apr 06 07:29:40 GMT 2001 until: Tue Apr 06 07:29:40 GMT 2021 +-Signature algorithm name: SHA1withRSA +-Subject Public Key Algorithm: 2048-bit RSA key +-Version: 3 +------BEGIN CERTIFICATE----- +-MIIDIDCCAgigAwIBAgIBHTANBgkqhkiG9w0BAQUFADA5MQswCQYDVQQGEwJGSTEP +-MA0GA1UEChMGU29uZXJhMRkwFwYDVQQDExBTb25lcmEgQ2xhc3MyIENBMB4XDTAx +-MDQwNjA3Mjk0MFoXDTIxMDQwNjA3Mjk0MFowOTELMAkGA1UEBhMCRkkxDzANBgNV +-BAoTBlNvbmVyYTEZMBcGA1UEAxMQU29uZXJhIENsYXNzMiBDQTCCASIwDQYJKoZI +-hvcNAQEBBQADggEPADCCAQoCggEBAJAXSjWdyvANlsdE+hY3/Ei9vX+ALTU74W+o +-Z6m/AxxNjG8yR9VBaKQTBME1DJqEQ/xcHf+Js+gXGM2RX/uJ4+q/Tl18GybTdXnt +-5oTjV+WtKcT0OijnpXuENmmz/V52vaMtmdOQTiMofRhj8VQ7Jp12W5dCsv+u8E7s +-3TmVToMGf+dJQMjFAbJUWmYdPfz56TwKnoG4cPABi+QjVHzIrviQHgCWctRUz2Ej +-vOr7nQKV0ba5cTppCD8PtOFCx4j1P5iop7oc4HFx71hXgVB6XGt0Rg6DA5jDjqhu +-8nYybieDwnPz3BjotJPqdURrBGAgcVeHnfO+oJAjPYok4doh28MCAwEAAaMzMDEw +-DwYDVR0TAQH/BAUwAwEB/zARBgNVHQ4ECgQISqCqWITTXjwwCwYDVR0PBAQDAgEG +-MA0GCSqGSIb3DQEBBQUAA4IBAQBazof5FnIVV0sd2ZvnoiYw7JNn39Yt0jSv9zil +-zqsWuasvfDXLrNAPtEwr/IDva4yRXzZ299uzGxnq9LIR/WFxRL8oszodv7ND6J+/ +-3DEIcbCdjdY0RzKQxmUk96BKfARzjzlvF4xytb1LyHr4e4PDKE6cCepnP7JnBBvD +-FNr450kkkdAdavphOe9r5yF1BgfYErQhIHBCcYHaPJo2vqZbDWpsmh+Re/n570K6 
+-Tk6ezAyNlNzZRZxe7EJQY670XcSxEtzKO6gunRRaBXW37Ndj4ro1tgQIkejanZz2 +-ZrUYrAqmVCY0M9IbwdR/GjqOC6oybtv8TyWf2TLHllpwrN9M +------END CERTIFICATE----- +diff --git a/jdk/test/sun/security/lib/cacerts/VerifyCACerts.java b/jdk/test/sun/security/lib/cacerts/VerifyCACerts.java +index 9053b796..d1a7879d 100644 +--- a/jdk/test/sun/security/lib/cacerts/VerifyCACerts.java ++++ b/jdk/test/sun/security/lib/cacerts/VerifyCACerts.java +@@ -53,12 +53,12 @@ public class VerifyCACerts { + + File.separator + "security" + File.separator + "cacerts"; + + // The numbers of certs now. +- private static final int COUNT = 90; ++ private static final int COUNT = 89; + + // SHA-256 of cacerts, can be generated with + // shasum -a 256 cacerts | sed -e 's/../&:/g' | tr '[:lower:]' '[:upper:]' | cut -c1-95 + private static final String CHECKSUM +- = "DC:22:7E:D7:F3:46:1F:8B:A8:4E:EE:C2:A8:4B:8E:26:89:4F:95:5C:71:A3:1B:5A:6E:A6:48:FD:CB:C9:F2:95"; ++ = "E6:F5:ED:92:CE:E2:35:5C:84:56:78:C7:72:29:29:A9:83:99:19:D9:54:F4:FF:7F:F7:D4:DB:2D:34:36:20:B5"; + + // map of cert alias to SHA-256 fingerprint + @SuppressWarnings("serial") +@@ -167,8 +167,6 @@ public class VerifyCACerts { + "3B:22:2E:56:67:11:E9:92:30:0D:C0:B1:5A:B9:47:3D:AF:DE:F8:C8:4D:0C:EF:7D:33:17:B4:C1:82:1D:14:36"); + put("swisssignsilverg2ca [jdk]", + "BE:6C:4D:A2:BB:B9:BA:59:B6:F3:93:97:68:37:42:46:C3:C0:05:99:3F:A9:8F:02:0D:1D:ED:BE:D4:8A:81:D5"); +- put("soneraclass2ca [jdk]", +- "79:08:B4:03:14:C1:38:10:0B:51:8D:07:35:80:7F:FB:FC:F8:51:8A:00:95:33:71:05:BA:38:6B:15:3D:D9:27"); + put("securetrustca [jdk]", + "F1:C1:B5:0A:E5:A2:0D:D8:03:0E:C9:F6:BC:24:82:3D:D3:67:B5:25:57:59:B4:E7:1B:61:FC:E9:F7:37:5D:73"); + put("xrampglobalca [jdk]", +@@ -245,12 +243,7 @@ public class VerifyCACerts { + // Exception list to 90 days expiry policy + // No error will be reported if certificate in this list expires + @SuppressWarnings("serial") +- private static final HashSet EXPIRY_EXC_ENTRIES = new HashSet() { +- { +- // Valid until: Tue Apr 06 15:29:40 HKT 2021 +- add("soneraclass2ca [jdk]"); +- } +- }; ++ private static final HashSet EXPIRY_EXC_ENTRIES = new HashSet(); + + // Ninety days in milliseconds + private static final long NINETY_DAYS = 7776000000L; diff --git a/openjdk-1.8.0.spec b/openjdk-1.8.0.spec index f63e9a9..d83a699 100644 --- a/openjdk-1.8.0.spec +++ b/openjdk-1.8.0.spec @@ -918,7 +918,7 @@ Provides: java-%{javaver}-%{origin}-accessibility%{?1} = %{epoch}:%{version}-%{r Name: java-%{javaver}-%{origin} Version: %{javaver}.%{updatever}.%{buildver} -Release: 14 +Release: 15 # java-1.5.0-ibm from jpackage.org set Epoch to 1 for unknown reasons # and this change was brought into RHEL-4. java-1.5.0-ibm packages # also included the epoch in their virtual provides. This created a @@ -1109,6 +1109,7 @@ Patch195: support_CMS_parallel_inspection.patch Patch196: g1gc-numa-aware-Implementation.patch Patch197: implementation_of_Blas_hotspot_function_in_Intrinsics.patch Patch198: fix_G1GC_memory_leak_in_numa.patch +Patch199: delete_untrustworthy_cacert_soneraclass2ca.patch ############################################# # @@ -1566,6 +1567,7 @@ pushd %{top_level_dir_name} %patch196 -p1 %patch197 -p1 %patch198 -p1 +%patch199 -p1 popd # System library fixes @@ -1639,7 +1641,7 @@ export ARCH_DATA_MODEL=64 # We use ourcppflags because the OpenJDK build seems to # pass EXTRA_CFLAGS to the HotSpot C++ compiler... 
-EXTRA_CFLAGS="%ourcppflags -Wno-error -fcommon" +EXTRA_CFLAGS="%ourcppflags -Wno-error -fcommon -fsigned-char" EXTRA_CPP_FLAGS="%ourcppflags -Wno-error" EXTRA_ASFLAGS="${EXTRA_CFLAGS} -Wa,--generate-missing-build-notes=yes" @@ -2182,6 +2184,9 @@ require "copy_jdk_configs.lua" %endif %changelog +* Mon Jun 28 2021 kuenking111 - 1:1.8.0.292-b10.15 +- fix delete_untrustworthy_cacert_soneraclass2ca.patch + * Thu Jun 17 2021 kuenking111 - 1:1.8.0.292-b10.14 - fix systemDictionary resolve_from_stream ResourceMark -- Gitee