diff --git a/threadcounter-threadpool-crc32-sqlbypass.patch b/threadcounter-threadpool-crc32-sqlbypass.patch new file mode 100644 index 0000000000000000000000000000000000000000..b181597f6048d8cbb3ab4daa5e87dc5c67dd3db1 --- /dev/null +++ b/threadcounter-threadpool-crc32-sqlbypass.patch @@ -0,0 +1,5702 @@ +From 7abc9f46613b5089c672d87d2c4c9b45bbbc0789 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?=E9=A6=99=E8=95=89=E5=82=A8=E8=93=84=E6=89=80?= + <727854256@qq.com> +Date: Tue, 8 Jul 2025 20:08:27 +0800 +Subject: [PATCH] threadcounter + +threadpool + +crc32 + +sqlbypass +--- + mysql-test/r/all_persisted_variables.result | 8 +- + mysql-test/r/mysqld--help-notwin.result | 4 + + mysql-test/suite/sys_vars/r/all_vars.result | 2 + + mysql-test/t/all_persisted_variables.test | 2 +- + plugin/thread_pool/CMakeLists.txt | 41 + + plugin/thread_pool/numa_affinity_manager.cc | 432 +++++ + plugin/thread_pool/numa_affinity_manager.h | 81 + + plugin/thread_pool/threadpool.h | 104 ++ + plugin/thread_pool/threadpool_common.cc | 981 ++++++++++ + plugin/thread_pool/threadpool_rwlock.h | 101 + + plugin/thread_pool/threadpool_unix.cc | 1843 +++++++++++++++++++ + plugin/thread_pool/threadpool_unix.h | 135 ++ + sql/CMakeLists.txt | 1 + + sql/event_scheduler.cc | 2 - + sql/mysqld.cc | 14 + + sql/mysqld.h | 2 + + sql/mysqld_thd_manager.cc | 26 + + sql/mysqld_thd_manager.h | 15 +- + sql/opt_range.cc | 280 +++ + sql/opt_range.h | 4 + + sql/sql_class.cc | 17 +- + sql/sql_executor.cc | 6 +- + sql/sql_executor.h | 5 + + sql/sql_lex.cc | 1 - + sql/sql_lex.h | 13 + + sql/sql_opt_exec_shared.h | 1 + + sql/sql_optimizer.cc | 36 +- + sql/sql_optimizer.h | 8 +- + sql/sql_parse.cc | 3 - + sql/sql_plan_cache.cc | 334 ++++ + sql/sql_plan_cache.h | 59 + + sql/sql_prepare.cc | 2 + + sql/sql_resolver.cc | 31 + + sql/sql_select.cc | 140 +- + sql/sql_union.cc | 31 +- + sql/sys_vars.cc | 7 + + storage/innobase/ut/crc32.cc | 38 +- + storage/perfschema/pfs_variable.cc | 69 + + unittest/gunit/thd_manager-t.cc | 8 - + 39 files changed, 4816 insertions(+), 71 deletions(-) + create mode 100644 plugin/thread_pool/CMakeLists.txt + create mode 100644 plugin/thread_pool/numa_affinity_manager.cc + create mode 100644 plugin/thread_pool/numa_affinity_manager.h + create mode 100644 plugin/thread_pool/threadpool.h + create mode 100644 plugin/thread_pool/threadpool_common.cc + create mode 100644 plugin/thread_pool/threadpool_rwlock.h + create mode 100644 plugin/thread_pool/threadpool_unix.cc + create mode 100644 plugin/thread_pool/threadpool_unix.h + create mode 100644 sql/sql_plan_cache.cc + create mode 100644 sql/sql_plan_cache.h + +diff --git a/mysql-test/r/all_persisted_variables.result b/mysql-test/r/all_persisted_variables.result +index 2d4eae332..2e5993ce5 100644 +--- a/mysql-test/r/all_persisted_variables.result ++++ b/mysql-test/r/all_persisted_variables.result +@@ -39,7 +39,7 @@ include/assert.inc [Expect 500+ variables in the table. Due to open Bugs, we are + + # Test SET PERSIST + +-include/assert.inc [Expect 412 persisted variables in the table.] ++include/assert.inc [Expect 413 persisted variables in the table.] + + ************************************************************ + * 3. Restart server, it must preserve the persisted variable +@@ -47,9 +47,9 @@ include/assert.inc [Expect 412 persisted variables in the table.] + ************************************************************ + # restart + +-include/assert.inc [Expect 412 persisted variables in persisted_variables table.] 
+-include/assert.inc [Expect 412 persisted variables shown as PERSISTED in variables_info table.] +-include/assert.inc [Expect 412 persisted variables with matching peristed and global values.] ++include/assert.inc [Expect 413 persisted variables in persisted_variables table.] ++include/assert.inc [Expect 413 persisted variables shown as PERSISTED in variables_info table.] ++include/assert.inc [Expect 413 persisted variables with matching peristed and global values.] + + ************************************************************ + * 4. Test RESET PERSIST IF EXISTS. Verify persisted variable +diff --git a/mysql-test/r/mysqld--help-notwin.result b/mysql-test/r/mysqld--help-notwin.result +index 231d543a2..6f41abe40 100644 +--- a/mysql-test/r/mysqld--help-notwin.result ++++ b/mysql-test/r/mysqld--help-notwin.result +@@ -996,6 +996,9 @@ The following options may be given as the first argument: + even if present. + (Defaults to on; use --skip-persisted-globals-load to disable.) + --pid-file=name Pid file used by safe_mysqld ++ --plan-cache Sys_plan_cache when first execute prepare stmt the remove ++ product execute plan time ++ (Defaults to on; use --skip-plan-cache to disable.) + --plugin-dir=name Directory for plugins + --plugin-load=name Optional semicolon-separated list of plugins to load, + where each plugin is identified as name=library, where +@@ -1743,6 +1746,7 @@ performance-schema-show-processlist FALSE + performance-schema-users-size -1 + persist-only-admin-x509-subject + persisted-globals-load TRUE ++plan-cache TRUE + port #### + port-open-timeout 0 + preload-buffer-size 32768 +diff --git a/mysql-test/suite/sys_vars/r/all_vars.result b/mysql-test/suite/sys_vars/r/all_vars.result +index 6b771ac40..39bef2be2 100644 +--- a/mysql-test/suite/sys_vars/r/all_vars.result ++++ b/mysql-test/suite/sys_vars/r/all_vars.result +@@ -48,6 +48,8 @@ partial_revokes + partial_revokes + password_require_current + password_require_current ++plan_cache ++plan_cache + regexp_stack_limit + regexp_stack_limit + regexp_time_limit +diff --git a/mysql-test/t/all_persisted_variables.test b/mysql-test/t/all_persisted_variables.test +index 00c707b52..158f6fa33 100644 +--- a/mysql-test/t/all_persisted_variables.test ++++ b/mysql-test/t/all_persisted_variables.test +@@ -41,7 +41,7 @@ call mtr.add_suppression("\\[Warning\\] .*MY-\\d+.* Changing innodb_extend_and_i + call mtr.add_suppression("Failed to initialize TLS for channel: mysql_main"); + + let $total_global_vars=`SELECT COUNT(*) FROM performance_schema.global_variables where variable_name NOT LIKE 'ndb_%'`; +-let $total_persistent_vars=412; ++let $total_persistent_vars=413; + + --echo *************************************************************** + --echo * 0. Verify that variables present in performance_schema.global +diff --git a/plugin/thread_pool/CMakeLists.txt b/plugin/thread_pool/CMakeLists.txt +new file mode 100644 +index 000000000..595ca67a9 +--- /dev/null ++++ b/plugin/thread_pool/CMakeLists.txt +@@ -0,0 +1,41 @@ ++# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. ++# Copyright (c) 2022 Huawei Technologies Co., Ltd. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; version 2 of the License. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with this program; if not, write to the Free Software
++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
++INCLUDE(${CMAKE_SOURCE_DIR}/cmake/mysql_version.cmake)
++
++IF(VERSION VERSION_GREATER_EQUAL "8.0.27")
++  MY_ADD_COMPILE_DEFINITIONS(
++    COMPILE_DEFINITIONS MYSQL_DYNAMIC_PLUGIN)
++ELSE()
++  ADD_COMPILE_DEFINITIONS(
++    COMPILE_DEFINITIONS MYSQL_DYNAMIC_PLUGIN)
++ENDIF()
++
++IF(VERSION VERSION_GREATER_EQUAL "8.0.33")
++  SET(RAPIDJSON_OPT "extra::rapidjson")
++ELSE()
++  SET(RAPIDJSON_OPT "")
++ENDIF()
++
++MYSQL_ADD_PLUGIN(thread_pool
++  numa_affinity_manager.cc
++  threadpool_common.cc
++  threadpool_unix.cc
++  LINK_LIBRARIES
++  ${RAPIDJSON_OPT}
++  numa
++  MODULE_ONLY
++  MODULE_OUTPUT_NAME "thread_pool"
++  )
+diff --git a/plugin/thread_pool/numa_affinity_manager.cc b/plugin/thread_pool/numa_affinity_manager.cc
+new file mode 100644
+index 000000000..acc99439c
+--- /dev/null
++++ b/plugin/thread_pool/numa_affinity_manager.cc
+@@ -0,0 +1,432 @@
++/* Copyright (C) 2012 Monty Program Ab
++Copyright (C) 2023 Huawei Technologies Co., Ltd
++
++This program is free software; you can redistribute it and/or modify
++it under the terms of the GNU General Public License as published by
++the Free Software Foundation; version 2 of the License.
++
++This program is distributed in the hope that it will be useful,
++but WITHOUT ANY WARRANTY; without even the implied warranty of
++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License
++along with this program; if not, write to the Free Software
++Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
++
++#include "numa_affinity_manager.h"
++
++#include <numa.h>
++#include <unistd.h>
++#include <iostream>
++#include <memory>
++#include <string>
++#include <thread>
++
++#include "threadpool.h"
++#include "sql/log.h"  // TODO: remove this dependency
++
++using namespace std;
++
++static const std::string THREAD_NAME_LOG_WRITER = "thread/innodb/log_writer_thread";
++static const std::string THREAD_NAME_LOG_FLUSHER = "thread/innodb/log_flusher_thread";
++static const std::string THREAD_NAME_LOG_WRITER_NTF = "thread/innodb/log_write_notifier_thread";
++static const std::string THREAD_NAME_LOG_FLUSHER_NTF = "thread/innodb/log_flush_notifier_thread";
++static const std::string THREAD_NAME_CHECK_POINTER = "thread/innodb/log_checkpointer_thread";
++static const std::string THREAD_NAME_PURGE = "thread/innodb/srv_purge_thread";
++
++numa_affinity_manager::numa_affinity_manager()
++{
++}
++
++numa_affinity_manager::~numa_affinity_manager()
++{
++}
++
++bool numa_affinity_manager::init() {
++  XLockGuard lk(lock);
++  initok = false;
++  if (!init_foreground_cpu_map()) {
++    return false;
++  }
++  bind_background_threads();
++  initok = true;
++  bind_background_ok = false;  // purge threads may not exist yet; try binding again later
++  print_state();
++  return true;
++}
++
++bool numa_affinity_manager::init_foreground_cpu_map() {
++  cpu_count = get_sys_cpu();
++  numa_count = get_sys_numa();
++  if (cpu_count <= 0 || numa_count <= 0 ||
++      cpu_count % numa_count != 0) {
++    return false;
++  }
++
++  int cpu_per_numa = cpu_count / numa_count;
++  int start = 0;
++  foreground_cpu_map.clear();
++  foreground_cpu_map_opts = parse_cpustring(threadpool_sched_affinity_foreground_thread);
++
++  // Every core must fall within the configured foreground-thread range;
++  // whether background threads are excluded from these cores is decided by
++  // the foreground-thread configuration.
++  for (int i = 0; i < 
numa_count; i++) { ++ auto msk = numa_allocate_cpumask(); ++ if (msk == nullptr) { ++ return false; ++ } ++ ++ bool msk_bitset = false; ++ for (int j = 0; j < cpu_per_numa; j++) { ++ if (foreground_cpu_map_opts == nullptr || ++ numa_bitmask_isbitset(foreground_cpu_map_opts.get(), start + j)) { ++ numa_bitmask_setbit(msk, start + j); ++ msk_bitset = true; ++ } ++ } ++ if (msk_bitset) { ++ foreground_cpu_map.emplace_back(msk, free_cpumask_func); ++ } else { ++ free_cpumask_func(msk); ++ } ++ ++ start += cpu_per_numa; ++ } ++ ++ return true; ++} ++ ++bool numa_affinity_manager::bind_foreground_thread(int group_id) { ++ if (!threadpool_sched_affinity) { ++ XLockGuard lk(lock); ++ my_thread_os_id_t tid = my_thread_os_id(); ++ foreground_threads[tid] = group_id; ++ return false; ++ } ++ ++ // for bind the purge thread ++ if (!bind_background_ok) { ++ XLockGuard lk(lock); ++ if (!bind_background_ok) { ++ bind_background_threads(); ++ bind_background_ok = true; ++ print_state(); ++ } ++ } ++ ++ bool ret = true; ++ XLockGuard lk(lock); ++ if (initok) { ++ my_thread_os_id_t tid = my_thread_os_id(); ++ bitmask *msk = foreground_cpu_map[group_id%foreground_cpu_map.size()].get(); ++ ret = bind_thread(tid, msk); ++ if (ret) { ++ foreground_threads[tid] = group_id; ++ } ++ } ++ return !ret; ++} ++ ++void numa_affinity_manager::update_bind_foreground_threads() { ++ XLockGuard lk(lock); ++ bind_foreground_threads(); ++ print_state(); ++} ++ ++void numa_affinity_manager::update_bind_background_threads() { ++ XLockGuard lk(lock); ++ bind_background_threads(); ++ print_state(); ++} ++ ++void numa_affinity_manager::update_bind_threads() { ++ XLockGuard lk(lock); ++ bind_foreground_threads(); ++ bind_background_threads(); ++ print_state(); ++} ++ ++void numa_affinity_manager::print_state() { ++#ifndef NDEBUG ++ // assert has own lock of s or x ++ std::cout << "initok: " << initok << std::endl; ++ std::cout << "cpu_count: " << cpu_count << std::endl; ++ std::cout << "numa_count: " << numa_count << std::endl; ++ ++ for (unsigned int i = 0; i < foreground_cpu_map.size(); i++) { ++ std::string name = "foreground_cpu_map[" + std::to_string(i) + "]"; ++ std::cout << name << ": " << cpumask_to_string(foreground_cpu_map[i].get()) << std::endl; ++ } ++ ++ for (auto &pi : background_threads) { ++ for (auto &it : pi.second) { ++ std::cout << "back-ground thread " << pi.first << " : " << it << std::endl; ++ } ++ } ++ ++ std::string log_writer_cpustring = threadpool_sched_affinity_log_writer == nullptr? "" : threadpool_sched_affinity_log_writer; ++ std::string log_flusher_cpustring = threadpool_sched_affinity_log_flusher == nullptr? "" : threadpool_sched_affinity_log_flusher; ++ std::string log_write_notifier_cpustring = threadpool_sched_affinity_log_write_notifier == nullptr? "" : threadpool_sched_affinity_log_write_notifier; ++ std::string log_flush_notifier_cpustring = threadpool_sched_affinity_log_flush_notifier == nullptr? "" : threadpool_sched_affinity_log_flush_notifier; ++ std::string log_checkpointer_cpustring = threadpool_sched_affinity_log_checkpointer == nullptr? "" : threadpool_sched_affinity_log_checkpointer; ++ std::string purge_coordinator_cpustring = threadpool_sched_affinity_purge_coordinator == nullptr? 
"" : threadpool_sched_affinity_purge_coordinator; ++ ++ std::cout << "thread_pool_sched_affinity_log_writer: " << log_writer_cpustring << std::endl; ++ std::cout << "thread_pool_sched_affinity_log_flusher: " << log_flusher_cpustring << std::endl; ++ std::cout << "thread_pool_sched_affinity_log_write_notifier: " << log_write_notifier_cpustring << std::endl; ++ std::cout << "thread_pool_sched_affinity_log_flush_notifier: " << log_flush_notifier_cpustring << std::endl; ++ std::cout << "thread_pool_sched_affinity_log_checkpointer: " << log_checkpointer_cpustring << std::endl; ++ std::cout << "thread_pool_sched_affinity_purge_coordinator: " << purge_coordinator_cpustring << std::endl; ++#endif ++} ++ ++void numa_affinity_manager::free_cpumask_func(bitmask *ptr) { ++ if (ptr != nullptr) { ++ numa_bitmask_free(ptr); ++ } ++} ++ ++int numa_affinity_manager::get_sys_cpu() { ++ return sysconf(_SC_NPROCESSORS_CONF); ++} ++ ++int numa_affinity_manager::get_sys_cpu_only() { ++ return sysconf(_SC_NPROCESSORS_ONLN); ++} ++ ++int numa_affinity_manager::get_sys_numa() { ++ return numa_num_configured_nodes(); ++} ++ ++bool numa_affinity_manager::bind_thread(my_thread_os_id_t tid, bitmask *msk) { ++ bool ret = false; ++ int i = 0; ++ for (i = 0; i < 10; i++) { ++ if (numa_sched_setaffinity(tid, msk) == 0) { ++ ret = true; ++ break; ++ } ++ ++ std::this_thread::yield(); ++ } ++ std::cout << "bind thread " << (ret? "succeed":"failed") << ": " ++ << tid << " - " << cpumask_to_string(msk) << std::endl; ++ return ret; ++} ++ ++bool numa_affinity_manager::check_cpustring(const char *opt) { ++ if (opt == nullptr) { ++ return true; ++ } ++ std::string out; ++ if (!normalize_cpustring(opt, out)) { ++ return false; ++ } ++ std::shared_ptr ret(numa_parse_cpustring(out.c_str())); ++ return ret.get() != nullptr; ++} ++ ++const std::string numa_affinity_manager::cpumask_to_string(bitmask *msk) { ++ std::string ret; ++ for (unsigned int i = 0; i < msk->size; i++) { ++ if (numa_bitmask_isbitset(msk, i)) { ++ ret += std::to_string(i) + " "; ++ } ++ } ++ return ret; ++} ++ ++bool numa_affinity_manager::normalize_cpustring(const std::string &cpu_string, std::string &out) { ++ std::string normalized_cpu_string; ++ bool invalid_cpu_string = false; ++ const int INVALID_CORE_ID = -1; ++ int core_id = INVALID_CORE_ID; ++ for (auto c : cpu_string) { ++ switch (c) { ++ case ' ': ++ break; ++ case '-': ++ case ',': ++ if (core_id == INVALID_CORE_ID) { ++ invalid_cpu_string = true; ++ } else { ++ normalized_cpu_string += std::to_string(core_id); ++ normalized_cpu_string += c; ++ core_id = INVALID_CORE_ID; ++ } ++ break; ++ case '0' ... 
'9': ++ if (core_id == INVALID_CORE_ID) { ++ core_id = (c - '0'); ++ } else { ++ core_id = core_id * 10 + (c - '0'); ++ } ++ break; ++ default: ++ invalid_cpu_string = true; ++ break; ++ } ++ if (invalid_cpu_string) { ++ break; ++ } ++ } ++ if (core_id != INVALID_CORE_ID) { ++ normalized_cpu_string += std::to_string(core_id); ++ } ++ if (!normalized_cpu_string.empty() && ++ (*normalized_cpu_string.rbegin() == '-' || ++ *normalized_cpu_string.rbegin() == ',')) { ++ invalid_cpu_string = true; ++ } ++ if (invalid_cpu_string) { ++ out = ""; ++ return false; ++ } ++ out = normalized_cpu_string; ++ return true; ++} ++ ++std::shared_ptr numa_affinity_manager::parse_cpustring(const char *opt) { ++ std::shared_ptr ret(nullptr, free_cpumask_func); ++ if (opt == nullptr || (std::string(opt)).empty()) { ++ return ret; ++ } ++ ++ std::string out; ++ if (normalize_cpustring(opt, out)) { ++ ret.reset(numa_parse_cpustring(out.c_str()), free_cpumask_func); ++ } ++ return ret; ++} ++ ++void numa_affinity_manager::apply_background_options() { ++ for (auto &pi : background_threads) { ++ auto cpu_map = background_cpu_map_opts.find(pi.first); ++ if (cpu_map == background_cpu_map_opts.end() || cpu_map->second == nullptr) { ++ continue; ++ } ++ for (auto it : pi.second) { ++ bind_thread(it, cpu_map->second.get()); ++ } ++ } ++} ++ ++void numa_affinity_manager::fetch_background_options() { ++ background_cpu_map_opts.clear(); ++ background_cpu_map_opts[THREAD_NAME_LOG_WRITER] = parse_cpustring(threadpool_sched_affinity_log_writer); ++ background_cpu_map_opts[THREAD_NAME_LOG_FLUSHER] = parse_cpustring(threadpool_sched_affinity_log_flusher); ++ background_cpu_map_opts[THREAD_NAME_LOG_WRITER_NTF] = parse_cpustring(threadpool_sched_affinity_log_write_notifier); ++ background_cpu_map_opts[THREAD_NAME_LOG_FLUSHER_NTF] = parse_cpustring(threadpool_sched_affinity_log_flush_notifier); ++ background_cpu_map_opts[THREAD_NAME_CHECK_POINTER] = parse_cpustring(threadpool_sched_affinity_log_checkpointer); ++ background_cpu_map_opts[THREAD_NAME_PURGE] = parse_cpustring(threadpool_sched_affinity_purge_coordinator); ++} ++ ++void numa_affinity_manager::fetch_background_threads() { ++ PFS_simple_index pos(0); /** Current position. */ ++ PFS_simple_index next_pos(0); /** Next position. 
*/ ++ pfs_optimistic_state lock; ++ pfs_optimistic_state session_lock; ++ ++ background_threads.clear(); ++ background_threads[THREAD_NAME_LOG_WRITER] = std::vector(); ++ background_threads[THREAD_NAME_LOG_FLUSHER] = std::vector(); ++ background_threads[THREAD_NAME_LOG_WRITER_NTF] = std::vector(); ++ background_threads[THREAD_NAME_LOG_FLUSHER_NTF] = std::vector(); ++ background_threads[THREAD_NAME_CHECK_POINTER] = std::vector(); ++ background_threads[THREAD_NAME_PURGE] = std::vector(); ++ ++ while (1) { ++ pos.set_at(&next_pos); ++ PFS_thread_iterator it = global_thread_container.iterate(pos.m_index); ++ PFS_thread *pfs = it.scan_next(&pos.m_index); ++ ++ if (pfs == nullptr) { ++ break; ++ } ++ ++ next_pos.set_after(&pos); ++ ++ pfs->m_lock.begin_optimistic_lock(&lock); /* Protect this reader against thread termination */ ++ pfs->m_session_lock.begin_optimistic_lock(&session_lock); /* Protect this reader against session attribute changes */ ++ ++ PFS_thread_class *safe_class = sanitize_thread_class(pfs->m_class); ++ if (unlikely(safe_class == nullptr)) { ++ pfs->m_session_lock.end_optimistic_lock(&session_lock); ++ pfs->m_lock.end_optimistic_lock(&lock); ++ // HA_ERR_RECORD_DELETED ++ continue; ++ } ++#if (MYSQL_VERSION_ID>80025) ++ const char *nameInstrument = safe_class->m_name.str(); ++#else ++ const char *nameInstrument = safe_class->m_name; ++#endif ++ std::cout << nameInstrument << " " << pfs->m_thread_os_id << std::endl; ++ auto itp = background_threads.find(nameInstrument); ++ if (itp != background_threads.end()) { ++ itp->second.push_back(pfs->m_thread_os_id); ++ } ++ ++ pfs->m_session_lock.end_optimistic_lock(&session_lock); ++ pfs->m_lock.end_optimistic_lock(&lock); ++ } ++} ++ ++void numa_affinity_manager::bind_foreground_threads() ++{ ++ init_foreground_cpu_map(); ++ if (threadpool_sched_affinity) { ++ for (auto it = foreground_threads.begin(); it != foreground_threads.end(); it++) { ++ auto tid = it->first; ++ int group_id = it->second; ++ bitmask *msk = foreground_cpu_map[group_id%foreground_cpu_map.size()].get(); ++ bind_thread(tid, msk); ++ } ++ } ++} ++ ++void numa_affinity_manager::bind_background_threads() ++{ ++ fetch_background_threads(); ++ fetch_background_options(); ++ if (threadpool_sched_affinity) { ++ apply_background_options(); ++ } ++} ++ ++void numa_affinity_manager::remove_foreground_thread() ++{ ++ my_thread_os_id_t tid = my_thread_os_id(); ++ XLockGuard lk(lock); ++ foreground_threads.erase(tid); ++} ++ ++void numa_affinity_manager::unbind_foreground_threads() ++{ ++ auto msk = numa_bitmask_alloc(cpu_count); ++ numa_bitmask_setall(msk); ++ for (auto it = foreground_threads.begin(); it != foreground_threads.end(); it++) { ++ auto tid = it->first; ++ bind_thread(tid, msk); ++ } ++ free_cpumask_func(msk); ++} ++ ++void numa_affinity_manager::unbind_background_threads() ++{ ++ auto msk = numa_bitmask_alloc(cpu_count); ++ numa_bitmask_setall(msk); ++ for (auto &it : background_threads) { ++ for (auto tid : it.second) { ++ bind_thread(tid, msk); ++ } ++ } ++ free_cpumask_func(msk); ++} ++ ++void numa_affinity_manager::unbind_threads() ++{ ++ SLockGuard lk(lock); ++ unbind_foreground_threads(); ++ unbind_background_threads(); ++} +\ No newline at end of file +diff --git a/plugin/thread_pool/numa_affinity_manager.h b/plugin/thread_pool/numa_affinity_manager.h +new file mode 100644 +index 000000000..be5f005fe +--- /dev/null ++++ b/plugin/thread_pool/numa_affinity_manager.h +@@ -0,0 +1,81 @@ ++/* Copyright (C) 2012 Monty Program Ab ++ Copyright (C) 2022 Huawei 
Technologies Co., Ltd ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; version 2 of the License. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ ++#ifndef NUMA_AFFINITY_MANAGER_H_ ++#define NUMA_AFFINITY_MANAGER_H_ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "storage/perfschema/pfs_buffer_container.h" ++#include "storage/perfschema/pfs_engine_table.h" ++#include "threadpool_rwlock.h" ++ ++class numa_affinity_manager ++{ ++public: ++ numa_affinity_manager(); ++ virtual ~numa_affinity_manager(); ++ ++ bool init(); ++ bool init_foreground_cpu_map(); ++ bool bind_foreground_thread(int group_id); ++ void update_bind_threads(); ++ void update_bind_foreground_threads(); ++ void update_bind_background_threads(); ++ void unbind_threads(); ++ void remove_foreground_thread(); ++ void unbind_foreground_threads(); ++ void unbind_background_threads(); ++ void print_state(); ++ ++ static void free_cpumask_func(bitmask *ptr); ++ static int get_sys_cpu(); ++ static int get_sys_cpu_only(); ++ static int get_sys_numa(); ++ static bool bind_thread(my_thread_os_id_t id, bitmask *msk); ++ static bool unbind_thread(my_thread_os_id_t id); ++ static bool check_cpustring(const char *opt); ++ static const std::string cpumask_to_string(bitmask *msk); ++ static bool normalize_cpustring(const std::string &cpu_string, std::string &out); ++ static std::shared_ptr parse_cpustring(const char *opt); ++ ++protected: ++ void apply_background_options(); ++ void fetch_background_options(); ++ void fetch_background_threads(); ++ void bind_foreground_threads(); ++ void bind_background_threads(); ++ ++ ++private: ++ thread_pool_rwlock_t lock; ++ bool initok{false}; ++ int cpu_count{0}; ++ int numa_count{0}; ++ std::vector > foreground_cpu_map; ++ std::shared_ptr foreground_cpu_map_opts; ++ std::map > background_cpu_map_opts; ++ bool bind_background_ok{false}; ++ std::map > background_threads; ++ std::unordered_map foreground_threads; ++}; ++ ++#endif // NUMA_AFFINITY_MANAGER_H_ +diff --git a/plugin/thread_pool/threadpool.h b/plugin/thread_pool/threadpool.h +new file mode 100644 +index 000000000..1647f8c97 +--- /dev/null ++++ b/plugin/thread_pool/threadpool.h +@@ -0,0 +1,104 @@ ++/* Copyright (C) 2012 Monty Program Ab ++ Copyright (C) 2022 Huawei Technologies Co., Ltd ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; version 2 of the License. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. 
++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ ++#ifndef THREADPOOL_H_ ++#define THREADPOOL_H_ ++ ++#include "sql/sql_class.h" ++#include "sql/mysqld_thd_manager.h" ++#include "sql/conn_handler/connection_handler_manager.h" ++#include "sql/conn_handler/channel_info.h" ++#include "mysql_version.h" ++#include "threadpool_rwlock.h" ++ ++struct SHOW_VAR; ++ ++#define MAX_THREAD_GROUPS 1024 ++#define MAX_CONNECTIONS 100000 ++ ++ ++enum tp_high_prio_mode_t { ++ TP_HIGH_PRIO_MODE_TRANSACTIONS, ++ TP_HIGH_PRIO_MODE_STATEMENTS, ++ TP_HIGH_PRIO_MODE_NONE ++}; ++ ++/* Threadpool parameters */ ++extern uint threadpool_idle_timeout; /* Shutdown idle worker threads after this timeout */ ++extern bool threadpool_dedicated_listener; /* Control whether listener be dedicated */ ++extern uint threadpool_size; /* Number of parallel executing threads */ ++extern uint threadpool_max_threads; ++extern uint threadpool_stall_limit; /* time interval in 10 ms units for stall checks*/ ++extern uint threadpool_oversubscribe; /* Maximum active threads in group */ ++extern uint threadpool_toobusy; /* Maximum active and waiting threads in group */ ++extern bool threadpool_connection_balance; /* Control whether conncetions migrating to another thread group so that they are evenly distributed */ ++ ++extern bool threadpool_sched_affinity; /* Control whether thread group scheduling affinity */ ++extern char *threadpool_sched_affinity_foreground_thread; ++extern char *threadpool_sched_affinity_log_writer; ++extern char *threadpool_sched_affinity_log_flusher; ++extern char *threadpool_sched_affinity_log_write_notifier; ++extern char *threadpool_sched_affinity_log_flush_notifier; ++extern char *threadpool_sched_affinity_log_checkpointer; ++extern char *threadpool_sched_affinity_purge_coordinator; ++ ++/* Possible values for thread_pool_high_prio_mode */ ++extern const char *threadpool_high_prio_mode_names[]; ++ ++/* Common thread pool routines, suitable for different implementations */ ++extern void threadpool_remove_connection(THD *thd); ++extern int threadpool_process_request(THD *thd); ++extern int threadpool_add_connection(THD *thd); ++ ++/* ++ Functions used by scheduler. ++ OS-specific implementations are in ++ threadpool_unix.cc or threadpool_win.cc ++*/ ++extern bool tp_init(); ++extern void tp_wait_begin(THD *, int); ++extern void tp_wait_end(THD *); ++extern void tp_post_kill_notification(THD *thd) noexcept; ++extern bool tp_add_connection(Channel_info *); ++extern void tp_end(void); ++extern bool thread_attach(THD *thd); ++extern void tp_deinit(); ++ ++extern THD_event_functions tp_event_functions; ++ ++ ++/* ++ Threadpool statistics ++*/ ++struct TP_STATISTICS { ++ /* Current number of worker thread. 
*/
++  std::atomic<int> num_worker_threads;
++};
++
++extern TP_STATISTICS tp_stats;
++
++/* Functions to set threadpool parameters */
++extern void tp_set_threadpool_size(uint val) noexcept;
++extern void tp_set_threadpool_stall_limit(uint val) noexcept;
++
++extern uint tp_get_thdvar_high_prio_tickets(THD *thd);
++extern uint tp_get_thdvar_high_prio_mode(THD *thd);
++
++extern st_plugin_int *gPluginPtr;
++extern thread_pool_rwlock_t gPluginLock;
++extern bool gPluginUninstalling;
++
++#endif  // THREADPOOL_H_
++
+diff --git a/plugin/thread_pool/threadpool_common.cc b/plugin/thread_pool/threadpool_common.cc
+new file mode 100644
+index 000000000..57d9778c2
+--- /dev/null
++++ b/plugin/thread_pool/threadpool_common.cc
+@@ -0,0 +1,981 @@
++/* Copyright (C) 2012 Monty Program Ab
++   Copyright (C) 2022 Huawei Technologies Co., Ltd
++
++   This program is free software; you can redistribute it and/or modify
++   it under the terms of the GNU General Public License as published by
++   the Free Software Foundation; version 2 of the License.
++
++   This program is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++   GNU General Public License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with this program; if not, write to the Free Software
++   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
++
++#include <string>
++#include <unistd.h>
++
++#include "threadpool.h"
++#include "threadpool_unix.h"
++#include "my_thread_local.h"
++#include "my_sys.h"
++#include "mysql/plugin.h"
++#include "mysql/psi/mysql_idle.h"
++#include "mysql/thread_pool_priv.h"
++#include "sql/debug_sync.h"
++#include "sql/mysqld.h"
++#include "sql/sql_class.h"
++#include "sql/sql_connect.h"
++#include "sql/protocol_classic.h"
++#include "sql/sql_parse.h"
++#include "sql/sql_table.h"
++#include "sql/field.h"
++#include "sql/sql_show.h"
++#include "sql/sql_class.h"
++
++#define MYSQL_SERVER 1
++
++/* Threadpool parameters */
++uint threadpool_idle_timeout;
++bool threadpool_dedicated_listener;
++uint threadpool_size;
++uint threadpool_stall_limit;
++uint threadpool_max_threads;
++uint threadpool_oversubscribe;
++uint threadpool_toobusy;
++bool threadpool_connection_balance;
++
++bool threadpool_sched_affinity = false;
++char *threadpool_sched_affinity_foreground_thread = nullptr;
++char *threadpool_sched_affinity_log_writer = nullptr;
++char *threadpool_sched_affinity_log_flusher = nullptr;
++char *threadpool_sched_affinity_log_write_notifier = nullptr;
++char *threadpool_sched_affinity_log_flush_notifier = nullptr;
++char *threadpool_sched_affinity_log_checkpointer = nullptr;
++char *threadpool_sched_affinity_purge_coordinator = nullptr;
++
++extern thread_pool_rwlock_t change_group_rwlock;
++
++/* Stats */
++TP_STATISTICS tp_stats;
++
++/*
++  Worker thread contexts, and THD contexts.
++  =========================================
++
++  Both worker threads and connections have their own sets of thread-local
++  variables. At the moment these are mysys_var (which carries specific data
++  for dbug, my_error and similar goodies) and the PSI per-client structure.
++
++  Whenever a query is executed, the following needs to be done:
++
++  1. Save worker thread context.
++  2. Change TLS variables to connection-specific ones using thread_attach(THD*).
++     This function does some additional work.
++  3. Process the query.
++  4. Restore worker thread context.
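++
++  (Steps 1 and 4 are what the Worker_thread_context constructor and
++  destructor below implement.)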
++ ++ Connection login and termination follows similar schema w.r.t saving and ++ restoring contexts. ++ ++ For both worker thread, and for the connection, mysys variables are created ++ using my_thread_init() and freed with my_thread_end(). ++ ++*/ ++class Worker_thread_context { ++#ifdef HAVE_PSI_THREAD_INTERFACE ++ PSI_thread *const psi_thread; ++#endif ++#ifndef NDEBUG ++ const my_thread_id thread_id; ++#endif ++ public: ++ Worker_thread_context() noexcept ++ : ++#ifdef HAVE_PSI_THREAD_INTERFACE ++ psi_thread(PSI_THREAD_CALL(get_thread)()) ++#endif ++#ifndef NDEBUG ++ , ++ thread_id(my_thread_var_id()) ++#endif ++ { ++ } ++ ++ ~Worker_thread_context() noexcept { ++#ifdef HAVE_PSI_THREAD_INTERFACE ++ PSI_THREAD_CALL(set_thread)(psi_thread); ++#endif ++#ifndef NDEBUG ++ set_my_thread_var_id(thread_id); ++#endif ++ THR_MALLOC = nullptr; ++ } ++}; ++ ++/* ++ Attach/associate the connection with the OS thread, ++*/ ++bool thread_attach(THD *thd) { ++#ifndef NDEBUG ++ set_my_thread_var_id(thd->thread_id()); ++#endif ++ thd->thread_stack = (char *)&thd; ++ thd->store_globals(); ++#ifdef HAVE_PSI_THREAD_INTERFACE ++ PSI_THREAD_CALL(set_thread)(thd->get_psi()); ++#endif ++ mysql_socket_set_thread_owner( ++ thd->get_protocol_classic()->get_vio()->mysql_socket); ++ return 0; ++} ++ ++#ifdef HAVE_PSI_STATEMENT_INTERFACE ++extern PSI_statement_info stmt_info_new_packet; ++#endif ++ ++static void threadpool_net_before_header_psi_noop(NET * /* net */, ++ void * /* user_data */, ++ size_t /* count */) {} ++ ++static void threadpool_init_net_server_extension(THD *thd) { ++#ifdef HAVE_PSI_INTERFACE ++ // socket_connection.cc:init_net_server_extension should have been called ++ // already for us. We only need to overwrite the "before" callback ++ assert(thd->m_net_server_extension.m_user_data == thd); ++ thd->m_net_server_extension.m_before_header = ++ threadpool_net_before_header_psi_noop; ++#else ++ assert(thd->get_protocol_classic()->get_net()->extension == NULL); ++#endif ++} ++ ++int threadpool_add_connection(THD *thd) { ++ int retval = 1; ++ Worker_thread_context worker_context; ++ ++ my_thread_init(); ++ ++ /* Create new PSI thread for use with the THD. */ ++#ifdef HAVE_PSI_THREAD_INTERFACE ++#if (MYSQL_VERSION_ID>80026) ++ thd->set_psi(PSI_THREAD_CALL(new_thread)(key_thread_one_connection, 0, thd, ++ thd->thread_id())); ++#else ++ thd->set_psi(PSI_THREAD_CALL(new_thread)(key_thread_one_connection, thd, ++ thd->thread_id())); ++#endif ++#endif ++ ++ /* Login. */ ++ thread_attach(thd); ++ thd->start_utime = my_micro_time(); ++ thd->store_globals(); ++ ++ if (thd_prepare_connection(thd)) { ++ goto end; ++ } ++ ++ /* ++ Check if THD is ok, as prepare_new_connection_state() ++ can fail, for example if init command failed. 
++  */
++  if (thd_connection_alive(thd)) {
++    retval = 0;
++    thd_set_net_read_write(thd, 1);
++    MYSQL_SOCKET_SET_STATE(thd->get_protocol_classic()->get_vio()->mysql_socket,
++                           PSI_SOCKET_STATE_IDLE);
++    thd->m_server_idle = true;
++    threadpool_init_net_server_extension(thd);
++  }
++
++end:
++  if (retval) {
++    Connection_handler_manager *handler_manager =
++        Connection_handler_manager::get_instance();
++    handler_manager->inc_aborted_connects();
++  }
++  return retval;
++}
++
++
++static Connection_handler_functions tp_chf = {
++  0,
++  tp_add_connection,
++  tp_end
++};
++
++THD_event_functions tp_event_functions = {
++  tp_wait_begin,
++  tp_wait_end,
++  tp_post_kill_notification
++};
++
++
++void threadpool_remove_connection(THD *thd) {
++  Worker_thread_context worker_context;
++
++  thread_attach(thd);
++  thd_set_net_read_write(thd, 0);
++
++  end_connection(thd);
++  close_connection(thd, 0);
++
++  thd->release_resources();
++
++#ifdef HAVE_PSI_THREAD_INTERFACE
++  PSI_THREAD_CALL(delete_thread)(thd->get_psi());
++#endif
++
++  Global_THD_manager::get_instance()->remove_thd(thd);
++  Connection_handler_manager::dec_connection_count();
++  delete thd;
++}
++
++/**
++  Process a single client request or a single batch.
++*/
++int threadpool_process_request(THD *thd) {
++  int retval = 0;
++  Worker_thread_context worker_context;
++
++  thread_attach(thd);
++
++  if (thd->killed == THD::KILL_CONNECTION) {
++    /*
++      killed flag was set by timeout handler
++      or KILL command. Return error.
++    */
++    retval = 1;
++    goto end;
++  }
++
++  /*
++    In the loop below, the flow is essentially a copy of the
++    thread-per-connection logic; see do_handle_one_connection() in
++    sql_connect.cc.
++
++    The goal is to execute a single query, thus the loop is normally executed
++    only once. However for SSL connections, it can be executed multiple times
++    (SSL can preread and cache incoming data, and vio->has_data() checks if it
++    was the case).
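++    In that case the buffered statements are handled by further iterations
++    of this loop, rather than by returning the socket to the listener's
++    poll set.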
++ */ ++ for (;;) { ++ Vio *vio; ++ thd_set_net_read_write(thd, 0); ++ ++ if ((retval = do_command(thd)) != 0) goto end; ++ ++ if (!thd_connection_alive(thd)) { ++ retval = 1; ++ goto end; ++ } ++ ++ vio = thd->get_protocol_classic()->get_vio(); ++ if (!vio->has_data(vio)) { ++ /* More info on this debug sync is in sql_parse.cc*/ ++ DEBUG_SYNC(thd, "before_do_command_net_read"); ++ thd_set_net_read_write(thd, 1); ++ goto end; ++ } ++ if (!thd->m_server_idle) { ++ MYSQL_SOCKET_SET_STATE(vio->mysql_socket, PSI_SOCKET_STATE_IDLE); ++ MYSQL_START_IDLE_WAIT(thd->m_idle_psi, &thd->m_idle_state); ++ thd->m_server_idle = true; ++ } ++ } ++ ++end: ++ if (!retval && !thd->m_server_idle) { ++ MYSQL_SOCKET_SET_STATE(thd->get_protocol_classic()->get_vio()->mysql_socket, ++ PSI_SOCKET_STATE_IDLE); ++ MYSQL_START_IDLE_WAIT(thd->m_idle_psi, &thd->m_idle_state); ++ thd->m_server_idle = true; ++ } ++ ++ return retval; ++} ++ ++static void fix_threadpool_size(THD*, ++ struct SYS_VAR *, void*, const void* value) ++{ ++ threadpool_size = *static_cast(value); ++ tp_set_threadpool_size(threadpool_size); ++} ++ ++static void fix_threadpool_stall_limit(THD*, struct SYS_VAR *, void*, const void* value) ++{ ++ threadpool_stall_limit = *static_cast(value); ++ tp_set_threadpool_stall_limit(threadpool_stall_limit); ++} ++ ++static void fix_threadpool_connection_balance(THD*, struct SYS_VAR *, void*, const void* value) ++{ ++ change_group_rwlock.xlock(); ++ threadpool_connection_balance = *static_cast(value); ++ change_group_rwlock.unlock(); ++} ++ ++static inline int my_getncpus() noexcept { ++#ifdef _SC_NPROCESSORS_ONLN ++ return sysconf(_SC_NPROCESSORS_ONLN); ++#else ++ return 2; /* The value returned by the old my_getncpus implementation */ ++#endif ++} ++ ++static MYSQL_SYSVAR_UINT(idle_timeout, threadpool_idle_timeout, ++ PLUGIN_VAR_RQCMDARG, ++ "Timeout in seconds for an idle thread in the thread pool." ++ "Worker thread will be shut down after timeout", ++ NULL, NULL, 60, 1, UINT_MAX, 1); ++ ++static MYSQL_SYSVAR_UINT(oversubscribe, threadpool_oversubscribe, ++ PLUGIN_VAR_RQCMDARG, ++ "How many additional active worker threads in a group are allowed.", ++ NULL, NULL, 3, 1, 1000, 1); ++ ++static MYSQL_SYSVAR_UINT(toobusy, threadpool_toobusy, ++ PLUGIN_VAR_RQCMDARG, ++ "How many additional active and waiting worker threads in a group are allowed.", ++ NULL, NULL, 13, 1, 1000, 1); ++ ++static MYSQL_SYSVAR_BOOL(dedicated_listener, threadpool_dedicated_listener, ++ PLUGIN_VAR_RQCMDARG, ++ "Control whether listener be dedicated", nullptr, ++ nullptr, false); ++ ++static MYSQL_SYSVAR_UINT(size, threadpool_size, ++ PLUGIN_VAR_RQCMDARG, ++ "Number of thread groups in the pool. " ++ "This parameter is roughly equivalent to maximum number of concurrently " ++ "executing threads (threads in a waiting state do not count as executing).", ++ NULL, fix_threadpool_size, (uint)my_getncpus(), 1, MAX_THREAD_GROUPS, 1); ++ ++static MYSQL_SYSVAR_UINT(stall_limit, threadpool_stall_limit, ++ PLUGIN_VAR_RQCMDARG, ++ "Maximum query execution time in milliseconds," ++ "before an executing non-yielding thread is considered stalled." 
++ "If a worker thread is stalled, additional worker thread " ++ "may be created to handle remaining clients.", ++ NULL, fix_threadpool_stall_limit, 500, 10, UINT_MAX, 1); ++ ++static MYSQL_SYSVAR_UINT(max_threads, threadpool_max_threads, ++ PLUGIN_VAR_RQCMDARG, ++ "Maximum allowed number of worker threads in the thread pool", ++ NULL, NULL, MAX_CONNECTIONS, 1, MAX_CONNECTIONS, 1); ++ ++static MYSQL_SYSVAR_BOOL(connection_balance, threadpool_connection_balance, ++PLUGIN_VAR_RQCMDARG, ++"Control whether thread group migrating connections" ++"so that they are evenly distributed.", nullptr, ++fix_threadpool_connection_balance, false); ++ ++static int check_fix_sched_affinity_cpustring(MYSQL_THD thd, SYS_VAR *, void *save, ++ struct st_mysql_value *value) { ++ std::string tmp(1024, '\0'); ++ char *buff = const_cast(tmp.data()); ++ const char *str = nullptr; ++ (*(const char **)save) = nullptr; ++ int length = tmp.size(); ++ if ((str = value->val_str(value, buff, &length))) { ++ str = thd->strmake(str, length); ++ } ++ ++ // char *c = var->save_result.string_value.str; ++ if (str != nullptr && !numa_affinity_manager::check_cpustring(str)) { ++ // my_error("Invalid cpu string %s.", MYF(0), c); ++ return 1; ++ } ++ ++ *(const char **)save = str; ++ return 0; ++} ++ ++static void fix_sched_affinity_background_cpustring(MYSQL_THD, SYS_VAR *, void *var_ptr, const void *save) { ++ const char *new_option_val = *static_cast(save); ++ *static_cast(var_ptr) = new_option_val; ++ group_affinity.update_bind_background_threads(); ++} ++ ++static void fix_sched_affinity_foreground_cpustring(MYSQL_THD, SYS_VAR *, void *var_ptr, const void *save) { ++ const char *new_option_val = *static_cast(save); ++ *static_cast(var_ptr) = new_option_val; ++ group_affinity.update_bind_foreground_threads(); ++} ++ ++static void fix_sched_affinity(MYSQL_THD, SYS_VAR *, void *var_ptr, const void *save) { ++ bool newval = *static_cast(save); ++ *static_cast(var_ptr) = newval; ++ if (threadpool_sched_affinity) { ++ group_affinity.update_bind_threads(); ++ } else { ++ group_affinity.unbind_threads(); ++ } ++} ++ ++static MYSQL_SYSVAR_BOOL(sched_affinity, threadpool_sched_affinity, ++ PLUGIN_VAR_RQCMDARG, ++ "Control whether thread group scheduling affinity.", ++ nullptr, // check func ++ fix_sched_affinity, // update func ++ false); ++ ++static MYSQL_SYSVAR_STR(sched_affinity_foreground_thread, threadpool_sched_affinity_foreground_thread, ++ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, ++ "The set of cpus which foreground threads will run on.", ++ check_fix_sched_affinity_cpustring, // check func ++ fix_sched_affinity_foreground_cpustring, // update func ++ nullptr); // default value ++ ++static MYSQL_SYSVAR_STR(sched_affinity_log_writer, threadpool_sched_affinity_log_writer, ++ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, ++ "log_writer", ++ check_fix_sched_affinity_cpustring, // check func ++ fix_sched_affinity_background_cpustring, // update func ++ nullptr); // default value ++ ++static MYSQL_SYSVAR_STR(sched_affinity_log_flusher, threadpool_sched_affinity_log_flusher, ++ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, ++ "log_flusher", ++ check_fix_sched_affinity_cpustring, // check func ++ fix_sched_affinity_background_cpustring, // update func ++ nullptr); // default value ++ ++static MYSQL_SYSVAR_STR(sched_affinity_log_write_notifier, threadpool_sched_affinity_log_write_notifier, ++ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, ++ "log_write_notifier", ++ check_fix_sched_affinity_cpustring, // check func ++ 
fix_sched_affinity_background_cpustring, // update func ++ nullptr); // default value ++ ++static MYSQL_SYSVAR_STR(sched_affinity_log_flush_notifier, threadpool_sched_affinity_log_flush_notifier, ++ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, ++ "log_flush_notifier", ++ check_fix_sched_affinity_cpustring, // check func ++ fix_sched_affinity_background_cpustring, // update func ++ nullptr); // default value ++ ++static MYSQL_SYSVAR_STR(sched_affinity_log_checkpointer, threadpool_sched_affinity_log_checkpointer, ++ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, ++ "log_checkpointer", ++ check_fix_sched_affinity_cpustring, // check func ++ fix_sched_affinity_background_cpustring, // update func ++ nullptr); // default value ++ ++static MYSQL_SYSVAR_STR(sched_affinity_purge_coordinator, threadpool_sched_affinity_purge_coordinator, ++ PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, ++ "purge_coordinator", ++ check_fix_sched_affinity_cpustring, // check func ++ fix_sched_affinity_background_cpustring, // update func ++ nullptr); // default value ++ ++ ++static bool tptarget(void) ++{ ++ unsigned long long cpuId; ++ __asm__ volatile("mrs %0, MIDR_EL1":"=r"(cpuId)); ++ return (((cpuId >> 0x18) & 0xFF) == 0x48); ++} ++ ++thread_pool_rwlock_t gPluginLock; ++st_plugin_int *gPluginPtr = nullptr; ++bool gPluginUninstalling = false; ++ ++static int threadpool_plugin_init(void *plugin) ++{ ++ DBUG_ENTER("threadpool_plugin_init"); ++ ++ if (!tptarget()) { ++ DBUG_RETURN(-1); ++ } ++ ++ gPluginPtr = static_cast(plugin); ++ gPluginUninstalling = false; ++ tp_init(); ++ my_connection_handler_set(&tp_chf, &tp_event_functions); ++ DBUG_RETURN(0); ++} ++ ++static int threadpool_plugin_check_uninstall(void *) { ++ DBUG_ENTER("threadpool_plugin_check_uninstall"); ++ XLockGuard lk(gPluginLock); ++ if (!gPluginUninstalling) { ++ gPluginUninstalling = true; ++ my_connection_handler_reset(); ++ } ++ DBUG_RETURN(0); ++} ++ ++static int threadpool_plugin_deinit(void *) ++{ ++ DBUG_ENTER("threadpool_plugin_deinit"); ++ XLockGuard lk(gPluginLock); ++ if (!gPluginUninstalling) { ++ gPluginUninstalling = true; ++ my_connection_handler_reset(); ++ } ++ tp_deinit(); ++ DBUG_RETURN(0); ++} ++ ++static MYSQL_THDVAR_UINT(high_prio_tickets, ++ PLUGIN_VAR_RQCMDARG, ++ "Number of tickets to enter the high priority event queue for each " ++ "transaction.", ++ NULL, NULL, UINT_MAX, 0, UINT_MAX, 1); ++ ++const char *threadpool_high_prio_mode_names[] = {"transactions", "statements", ++ "none", NullS}; ++TYPELIB threadpool_high_prio_mode_typelib = { ++ array_elements(threadpool_high_prio_mode_names) - 1, "", ++ threadpool_high_prio_mode_names, NULL ++}; ++ ++static MYSQL_THDVAR_ENUM(high_prio_mode, ++ PLUGIN_VAR_RQCMDARG, ++ "High priority queue mode: one of 'transactions', 'statements' or 'none'. " ++ "In the 'transactions' mode the thread pool uses both high- and low-priority " ++ "queues depending on whether an event is generated by an already started " ++ "transaction and whether it has any high priority tickets (see " ++ "thread_pool_high_prio_tickets). In the 'statements' mode all events (i.e. " ++ "individual statements) always go to the high priority queue, regardless of " ++ "the current transaction state and high priority tickets. " ++ "'none' is the opposite of 'statements', i.e. 
disables the high priority queue " ++ "completely.", ++ NULL, NULL, TP_HIGH_PRIO_MODE_TRANSACTIONS, &threadpool_high_prio_mode_typelib); ++ ++static uint &idle_timeout = threadpool_idle_timeout; ++static bool &dedicated_listener = threadpool_dedicated_listener; ++static uint &size = threadpool_size; ++static uint &stall_limit = threadpool_stall_limit; ++static uint &max_threads = threadpool_max_threads; ++static uint &oversubscribe = threadpool_oversubscribe; ++static uint &toobusy = threadpool_toobusy; ++static bool &connection_balance = threadpool_connection_balance; ++ ++static auto &sched_affinity = threadpool_sched_affinity; ++static auto &sched_affinity_foreground_thread = threadpool_sched_affinity_foreground_thread; ++static auto &sched_affinity_log_writer = threadpool_sched_affinity_log_writer; ++static auto &sched_affinity_log_flusher = threadpool_sched_affinity_log_flusher; ++static auto &sched_affinity_log_write_notifier = threadpool_sched_affinity_log_write_notifier; ++static auto &sched_affinity_log_flush_notifier = threadpool_sched_affinity_log_flush_notifier; ++static auto &sched_affinity_log_checkpointer = threadpool_sched_affinity_log_checkpointer; ++static auto &sched_affinity_purge_coordinator = threadpool_sched_affinity_purge_coordinator; ++ ++SYS_VAR *system_variables[] = { ++ MYSQL_SYSVAR(idle_timeout), ++ MYSQL_SYSVAR(dedicated_listener), ++ MYSQL_SYSVAR(size), ++ MYSQL_SYSVAR(max_threads), ++ MYSQL_SYSVAR(stall_limit), ++ MYSQL_SYSVAR(oversubscribe), ++ MYSQL_SYSVAR(toobusy), ++ MYSQL_SYSVAR(high_prio_tickets), ++ MYSQL_SYSVAR(connection_balance), ++ MYSQL_SYSVAR(high_prio_mode), ++ MYSQL_SYSVAR(sched_affinity), ++ MYSQL_SYSVAR(sched_affinity_foreground_thread), ++ MYSQL_SYSVAR(sched_affinity_log_writer), ++ MYSQL_SYSVAR(sched_affinity_log_flusher), ++ MYSQL_SYSVAR(sched_affinity_log_write_notifier), ++ MYSQL_SYSVAR(sched_affinity_log_flush_notifier), ++ MYSQL_SYSVAR(sched_affinity_log_checkpointer), ++ MYSQL_SYSVAR(sched_affinity_purge_coordinator), ++ NULL ++}; ++ ++namespace Show { ++ ++static ST_FIELD_INFO groups_fields_info[] = ++{ ++ {"GROUP_ID", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"CONNECTIONS", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"THREADS", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"ACTIVE_THREADS", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"STANDBY_THREADS", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"QUEUE_LENGTH", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"HAS_LISTENER", 1, MYSQL_TYPE_TINY, 0, 0, 0, 0}, ++ {"IS_STALLED", 1, MYSQL_TYPE_TINY, 0, 0, 0, 0}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0, 0} ++}; ++ ++} // namespace Show ++ ++ ++static int groups_fill_table(THD* thd, ++#if (MYSQL_VERSION_ID>80031) ++ Table_ref* tables, ++#else ++ TABLE_LIST* tables, ++#endif ++Item*) ++{ ++ if (!all_groups) ++ return 0; ++ ++ TABLE* table = tables->table; ++ for (uint i = 0; i < MAX_THREAD_GROUPS && all_groups[i].pollfd != -1; i++) ++ { ++ thread_group_t* group = &all_groups[i]; ++ ++ mysql_mutex_lock(&group->mutex); ++ ++ /* ID */ ++ table->field[0]->store(i, true); ++ change_group_rwlock.slock(); ++ /* CONNECTION_COUNT */ ++ table->field[1]->store(group->connection_count, true); ++ change_group_rwlock.unlock(); ++ /* THREAD_COUNT */ ++ table->field[2]->store(group->thread_count, true); ++ /* ACTIVE_THREAD_COUNT */ ++ table->field[3]->store(group->active_thread_count, true); ++ /* STANDBY_THREAD_COUNT */ ++ table->field[4]->store(group->waiting_thread_count, true); ++ /* QUEUE LENGTH */ ++ uint queue_len = group->high_prio_queue.elements() ++ + group->queue.elements(); ++ 
table->field[5]->store(queue_len, true); ++ /* HAS_LISTENER */ ++ table->field[6]->store((longlong)(group->listener != 0), true); ++ /* IS_STALLED */ ++ table->field[7]->store(group->stalled, true); ++ ++ mysql_mutex_unlock(&group->mutex); ++ ++ if (schema_table_store_record(thd, table)) ++ return 1; ++ } ++ return 0; ++} ++ ++ ++static int groups_init(void* p) ++{ ++ if (!tptarget()) { ++ return -1; ++ } ++ ++ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*)p; ++ schema->fields_info = Show::groups_fields_info; ++ schema->fill_table = groups_fill_table; ++ return 0; ++} ++ ++ ++namespace Show { ++ ++static ST_FIELD_INFO queues_field_info[] = ++{ ++ {"GROUP_ID", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"POSITION", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"PRIORITY", 1, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"CONNECTION_ID", 19, MYSQL_TYPE_LONGLONG, 0, MY_I_S_UNSIGNED, 0, 0}, ++ {"QUEUEING_TIME_MICROSECONDS", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0, 0} ++}; ++ ++} // namespace Show ++ ++typedef connection_queue_t::Iterator connection_queue_iterator; ++ ++static int queues_fill_table(THD* thd, ++#if (MYSQL_VERSION_ID>80031) ++ Table_ref* tables, ++#else ++ TABLE_LIST* tables, ++#endif ++Item*) ++{ ++ if (!all_groups) ++ return 0; ++ ++ TABLE* table = tables->table; ++ for (uint group_id = 0; ++ group_id < MAX_THREAD_GROUPS && all_groups[group_id].pollfd != -1; ++ group_id++) ++ { ++ thread_group_t* group = &all_groups[group_id]; ++ ++ mysql_mutex_lock(&group->mutex); ++ bool err = false; ++ int pos = 0; ++ ulonglong now = my_microsecond_getsystime(); ++ connection_queue_t queues[NQUEUES] = {group->high_prio_queue, group->queue}; ++ for (uint prio = 0; prio < NQUEUES && !err; prio++) ++ { ++ connection_queue_iterator it(queues[prio]); ++ connection_t* c; ++ while ((c = it++) != nullptr) ++ { ++ /* GROUP_ID */ ++ table->field[0]->store(group_id, true); ++ /* POSITION */ ++ table->field[1]->store(pos++, true); ++ /* PRIORITY */ ++ table->field[2]->store(prio, true); ++ /* CONNECTION_ID */ ++ if (c->thd != nullptr) { ++ table->field[3]->store(c->thd->thread_id(), true); ++ } else { ++ table->field[3]->store(0, true); ++ } ++ /* QUEUEING_TIME */ ++ table->field[4]->store(now - c->enqueue_time, true); ++ ++ err = schema_table_store_record(thd, table); ++ if (err) ++ break; ++ } ++ } ++ mysql_mutex_unlock(&group->mutex); ++ if (err) ++ return 1; ++ } ++ return 0; ++} ++ ++static int queues_init(void* p) ++{ ++ if (!tptarget()) { ++ return -1; ++ } ++ ++ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*)p; ++ schema->fields_info = Show::queues_field_info; ++ schema->fill_table = queues_fill_table; ++ return 0; ++} ++ ++namespace Show { ++ ++static ST_FIELD_INFO stats_fields_info[] = ++{ ++ {"GROUP_ID", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"THREAD_CREATIONS", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"THREAD_CREATIONS_DUE_TO_STALL", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"WAKES", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"WAKES_DUE_TO_STALL", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"THROTTLES", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"STALLS", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"POLLS_BY_LISTENER", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"POLLS_BY_WORKER", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"DEQUEUES_BY_LISTENER", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"DEQUEUES_BY_WORKER", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0, 0} ++}; ++ ++} // namespace Show ++ ++ ++static int stats_fill_table(THD* thd, ++#if (MYSQL_VERSION_ID>80031) ++ 
Table_ref* tables,
++#else
++                            TABLE_LIST* tables,
++#endif
++Item*)
++{
++  if (!all_groups)
++    return 0;
++
++  TABLE* table = tables->table;
++  for (uint i = 0; i < MAX_THREAD_GROUPS && all_groups[i].pollfd != -1; i++)
++  {
++    table->field[0]->store(i, true);
++    thread_group_t* group = &all_groups[i];
++
++    mysql_mutex_lock(&group->mutex);
++    thread_group_counters_t* counters = &group->counters;
++    table->field[1]->store(counters->thread_creations, true);
++    table->field[2]->store(counters->thread_creations_due_to_stall, true);
++    table->field[3]->store(counters->wakes, true);
++    table->field[4]->store(counters->wakes_due_to_stall, true);
++    table->field[5]->store(counters->throttles, true);
++    table->field[6]->store(counters->stalls, true);
++    table->field[7]->store(counters->polls[LISTENER], true);
++    table->field[8]->store(counters->polls[WORKER], true);
++    table->field[9]->store(counters->dequeues[LISTENER], true);
++    table->field[10]->store(counters->dequeues[WORKER], true);
++    mysql_mutex_unlock(&group->mutex);
++    if (schema_table_store_record(thd, table))
++      return 1;
++  }
++  return 0;
++}
++
++static int stats_init(void* p)
++{
++  if (!tptarget()) {
++    return -1;
++  }
++
++  ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*)p;
++  schema->fields_info = Show::stats_fields_info;
++  schema->fill_table = stats_fill_table;
++  return 0;
++}
++
++
++namespace Show {
++
++static ST_FIELD_INFO waits_fields_info[] =
++{
++  {"REASON", 16, MYSQL_TYPE_STRING, 0, 0, 0, 0},
++  {"COUNT", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0},
++  {0, 0, MYSQL_TYPE_STRING, 0, 0, 0, 0}
++};
++
++} // namespace Show
++
++/* See thd_wait_type enum for explanation */
++static const LEX_CSTRING wait_reasons[THD_WAIT_LAST] =
++{
++  {STRING_WITH_LEN("UNKNOWN")},
++  {STRING_WITH_LEN("SLEEP")},
++  {STRING_WITH_LEN("DISKIO")},
++  {STRING_WITH_LEN("ROW_LOCK")},
++  {STRING_WITH_LEN("GLOBAL_LOCK")},
++  {STRING_WITH_LEN("META_DATA_LOCK")},
++  {STRING_WITH_LEN("TABLE_LOCK")},
++  {STRING_WITH_LEN("USER_LOCK")},
++  {STRING_WITH_LEN("BINLOG")},
++  {STRING_WITH_LEN("GROUP_COMMIT")},
++  {STRING_WITH_LEN("SYNC")}
++};
++
++extern std::atomic<ulonglong> tp_waits[THD_WAIT_LAST];
++
++static int waits_fill_table(THD* thd,
++#if (MYSQL_VERSION_ID>80031)
++                            Table_ref* tables,
++#else
++                            TABLE_LIST* tables,
++#endif
++Item*)
++{
++  if (!all_groups)
++    return 0;
++
++  TABLE* table = tables->table;
++  for (unsigned int i = 0; i < THD_WAIT_LAST; i++)
++  {
++    table->field[0]->store(wait_reasons[i].str, wait_reasons[i].length, system_charset_info);
++    table->field[1]->store(tp_waits[i], true);
++    if (schema_table_store_record(thd, table))
++      return 1;
++  }
++  return 0;
++}
++
++static int waits_init(void* p)
++{
++  if (!tptarget()) {
++    return -1;
++  }
++
++  ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*)p;
++  schema->fields_info = Show::waits_fields_info;
++  schema->fill_table = waits_fill_table;
++  return 0;
++}
++
++struct st_mysql_daemon thread_pool_plugin =
++{ MYSQL_DAEMON_INTERFACE_VERSION };
++
++static struct st_mysql_information_schema plugin_descriptor =
++{ MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION };
++
++mysql_declare_plugin(thread_pool)
++{
++  MYSQL_DAEMON_PLUGIN,
++  &thread_pool_plugin,
++  "thread_pool",
++  "TEST_TEST",
++  "thread pool plugin extracted from percona server",
++  PLUGIN_LICENSE_GPL,
++  threadpool_plugin_init,            /* Plugin Init */
++  threadpool_plugin_check_uninstall, /* Plugin Check uninstall */
++  threadpool_plugin_deinit,          /* Plugin Deinit */
++  0x0100 /* 1.0 */,
++  nullptr,          /* status variables */
++  system_variables, /* system variables */
++ nullptr, /* config options */ ++ 0, /* flags */ ++}, ++{ ++ MYSQL_INFORMATION_SCHEMA_PLUGIN, ++ &plugin_descriptor, ++ "THREAD_POOL_GROUPS", ++ "Vladislav Vaintroub", ++ "Provides information about threadpool groups.", ++ PLUGIN_LICENSE_GPL, ++ groups_init, ++ nullptr, ++ nullptr, ++ 0x0100, ++ nullptr, ++ nullptr, ++ nullptr, ++ 0, ++}, ++{ ++ MYSQL_INFORMATION_SCHEMA_PLUGIN, ++ &plugin_descriptor, ++ "THREAD_POOL_QUEUES", ++ "Vladislav Vaintroub", ++ "Provides information about threadpool queues.", ++ PLUGIN_LICENSE_GPL, ++ queues_init, ++ nullptr, ++ nullptr, ++ 0x0100, ++ nullptr, ++ nullptr, ++ nullptr, ++ 0, ++}, ++{ ++ MYSQL_INFORMATION_SCHEMA_PLUGIN, ++ &plugin_descriptor, ++ "THREAD_POOL_STATS", ++ "Vladislav Vaintroub", ++ "Provides performance counter information for threadpool.", ++ PLUGIN_LICENSE_GPL, ++ stats_init, ++ nullptr, ++ nullptr, ++ 0x0100, ++ nullptr, ++ nullptr, ++ nullptr, ++ 0, ++}, ++{ ++ MYSQL_INFORMATION_SCHEMA_PLUGIN, ++ &plugin_descriptor, ++ "THREAD_POOL_WAITS", ++ "Vladislav Vaintroub", ++ "Provides wait counters for threadpool.", ++ PLUGIN_LICENSE_GPL, ++ waits_init, ++ nullptr, ++ nullptr, ++ 0x0100, ++ nullptr, ++ nullptr, ++ nullptr, ++ 0, ++} ++mysql_declare_plugin_end; ++ ++uint tp_get_thdvar_high_prio_tickets(THD *thd) { ++ return THDVAR(thd, high_prio_tickets); ++} ++ ++uint tp_get_thdvar_high_prio_mode(THD *thd) { ++ return THDVAR(thd, high_prio_mode); ++} ++ +diff --git a/plugin/thread_pool/threadpool_rwlock.h b/plugin/thread_pool/threadpool_rwlock.h +new file mode 100644 +index 000000000..e07b37983 +--- /dev/null ++++ b/plugin/thread_pool/threadpool_rwlock.h +@@ -0,0 +1,101 @@ ++/* Copyright (C) 2012 Monty Program Ab ++ Copyright (C) 2022 Huawei Technologies Co., Ltd ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; version 2 of the License. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. 
++
++  You should have received a copy of the GNU General Public License
++  along with this program; if not, write to the Free Software
++  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
++#ifndef THREADPOOL_RWLOCK_H_
++#define THREADPOOL_RWLOCK_H_
++
++#include <pthread.h>
++
++#include <thread>  /* std::this_thread::yield() */
++
++class thread_pool_rwlock_t {
++public:
++  thread_pool_rwlock_t()
++  {
++    pthread_rwlock_init(&lk, nullptr);
++  }
++
++  ~thread_pool_rwlock_t()
++  {
++    pthread_rwlock_destroy(&lk);
++  }
++
++  /* Take the shared (read) lock; yield and retry on failure. */
++  void slock()
++  {
++    while (pthread_rwlock_rdlock(&lk) != 0) {
++      std::this_thread::yield();
++    }
++  }
++
++  void unslock()
++  {
++    while (pthread_rwlock_unlock(&lk) != 0) {
++      std::this_thread::yield();
++    }
++  }
++
++  /* Take the exclusive (write) lock; yield and retry on failure. */
++  void xlock()
++  {
++    while (pthread_rwlock_wrlock(&lk) != 0) {
++      std::this_thread::yield();
++    }
++  }
++
++  void unxlock()
++  {
++    while (pthread_rwlock_unlock(&lk) != 0) {
++      std::this_thread::yield();
++    }
++  }
++
++  void lock()
++  {
++    xlock();
++  }
++
++  void unlock()
++  {
++    unxlock();
++  }
++
++private:
++  pthread_rwlock_t lk;
++};
++
++class SLockGuard {
++public:
++  SLockGuard(thread_pool_rwlock_t &lk): lck(lk) {
++    lck.slock();
++  }
++
++  ~SLockGuard() {
++    lck.unslock();
++  }
++
++  thread_pool_rwlock_t &lck;
++};
++
++class XLockGuard {
++public:
++  XLockGuard(thread_pool_rwlock_t &lk): lck(lk) {
++    lck.xlock();
++  }
++
++  ~XLockGuard() {
++    lck.unxlock();
++  }
++
++  thread_pool_rwlock_t &lck;
++};
++
++#endif //THREADPOOL_RWLOCK_H_
+diff --git a/plugin/thread_pool/threadpool_unix.cc b/plugin/thread_pool/threadpool_unix.cc
+new file mode 100644
+index 000000000..db980ef1e
+--- /dev/null
++++ b/plugin/thread_pool/threadpool_unix.cc
+@@ -0,0 +1,1843 @@
++/* Copyright (C) 2012 Monty Program Ab
++   Copyright (C) 2022 Huawei Technologies Co., Ltd
++
++   This program is free software; you can redistribute it and/or modify
++   it under the terms of the GNU General Public License as published by
++   the Free Software Foundation; version 2 of the License.
++
++   This program is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++   GNU General Public License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with this program; if not, write to the Free Software
++   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
++
++#include "threadpool_unix.h"
++
++#include <mutex>
++#include <set>
++#include <thread>
++
++#include "threadpool.h"
++#include "numa_affinity_manager.h"
++#include "sql/debug_sync.h"
++#include "sql/log.h"
++#include "sql/protocol_classic.h"
++#include "sql/set_var.h"
++#include "my_sys.h"
++#include "my_systime.h"
++#include "mysql/thread_pool_priv.h"  // thd_is_transaction_active()
++#include "mysql/plugin.h"
++
++#define MYSQL_SERVER 1
++
++/** Maximum number of native events a listener can read in one go */
++#define MAX_EVENTS 1024
++
++/** Define if wait_begin() should create threads if necessary without waiting
++for stall detection to kick in */
++#define THREADPOOL_CREATE_THREADS_ON_WAIT
++
++/** Indicates that the threadpool was initialized */
++static bool threadpool_started = false;
++
++thread_pool_rwlock_t change_group_rwlock;
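++
++/*
++  Usage sketch for change_group_rwlock (illustrative only; it mirrors
++  check_change_group() and change_group() defined later in this file):
++  readers that merely inspect a connection's group placement take the
++  shared lock, while actual migration takes the exclusive lock.
++
++    change_group_rwlock.slock();    // shared: read group membership
++    ...
++    change_group_rwlock.unslock();
++
++    change_group_rwlock.xlock();    // exclusive: move the connection
++    ...
++    change_group_rwlock.unxlock();
++*/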
++
++/*
++  Define PSI Keys for performance schema.
++  We have a mutex per group, worker threads, condition per worker thread,
++  and timer thread with its own mutex and condition.
++*/
++
++#ifdef HAVE_PSI_INTERFACE
++static PSI_mutex_key key_group_mutex;
++static PSI_mutex_key key_timer_mutex;
++static PSI_mutex_info mutex_list[] = {
++    {&key_group_mutex, "group_mutex", 0, 0, PSI_DOCUMENT_ME},
++    {&key_timer_mutex, "timer_mutex", PSI_FLAG_SINGLETON, 0, PSI_DOCUMENT_ME}};
++
++static PSI_cond_key key_worker_cond;
++static PSI_cond_key key_timer_cond;
++static PSI_cond_info cond_list[] = {
++    {&key_worker_cond, "worker_cond", 0, 0, PSI_DOCUMENT_ME},
++    {&key_timer_cond, "timer_cond", PSI_FLAG_SINGLETON, 0, PSI_DOCUMENT_ME}};
++
++static PSI_thread_key key_worker_thread;
++static PSI_thread_key key_timer_thread;
++
++#if (MYSQL_VERSION_ID>80026)
++static PSI_thread_info thread_list[] = {
++    {&key_worker_thread, "worker_thread", "worker_th", 0, 0, PSI_DOCUMENT_ME},
++    {&key_timer_thread, "timer_thread", "timer_th", PSI_FLAG_SINGLETON, 0,
++     PSI_DOCUMENT_ME}};
++#else
++static PSI_thread_info thread_list[] = {
++    {&key_worker_thread, "worker_thread", 0, 0, PSI_DOCUMENT_ME},
++    {&key_timer_thread, "timer_thread", PSI_FLAG_SINGLETON, 0,
++     PSI_DOCUMENT_ME}};
++#endif
++
++#endif  // HAVE_PSI_INTERFACE
++
++thread_group_t all_groups[MAX_THREAD_GROUPS];
++numa_affinity_manager group_affinity;
++
++static uint group_count;
++
++/**
++  Used for printing the "pool blocked" message, see
++  print_pool_blocked_message();
++*/
++static ulonglong pool_block_start;
++
++/* Global timer for all groups */
++struct pool_timer_t {
++  mysql_mutex_t mutex;
++  mysql_cond_t cond;
++  std::atomic<uint64> current_microtime;
++  std::atomic<uint64> next_timeout_check;
++  int tick_interval;
++  bool shutdown;
++};
++
++static pool_timer_t pool_timer;
++
++static void queue_put(thread_group_t *thread_group, connection_t *connection);
++static int wake_thread(thread_group_t *thread_group,
++                       bool due_to_stall) noexcept;
++static void handle_event(connection_t *connection);
++static int wake_or_create_thread(thread_group_t *thread_group,
++                                 bool due_to_stall = false);
++static int create_worker(thread_group_t *thread_group, bool due_to_stall) noexcept;
++static void *admin_port_worker_main(void *param);
++static void *worker_main(void *param);
++static void check_stall(thread_group_t *thread_group);
++static void connection_abort(connection_t *connection);
++static void set_next_timeout_check(ulonglong abstime);
++static void print_pool_blocked_message(bool) noexcept;
++
++static void tp_lock_plugin_mutex() {
++  mysql_mutex_lock(&LOCK_plugin);
++}
++
++static void tp_unlock_plugin_mutex() {
++  mysql_mutex_unlock(&LOCK_plugin);
++}
++
++static void tp_inc_ref_count() {
++  tp_lock_plugin_mutex();
++  assert(gPluginPtr != nullptr);
++  gPluginPtr->ref_count++;
++  tp_unlock_plugin_mutex();
++}
++
++static void tp_dec_ref_count() {
++  tp_lock_plugin_mutex();
++  assert(gPluginPtr != nullptr);
++  gPluginPtr->ref_count--;
++  tp_unlock_plugin_mutex();
++}
++
++class ThreadPoolConnSet {
++public:
++  ThreadPoolConnSet() {}
++  virtual ~ThreadPoolConnSet() {}
++
++  bool empty() {
++    bool ret = false;
++    mtx.lock();
++    ret = conns.empty();
++    mtx.unlock();
++    return ret;
++  }
++
++  void insert(connection_t *c) {
++    tp_inc_ref_count();
++    mtx.lock();
++    conns.insert(c);
++    mtx.unlock();
++  }
++
++  void erase(connection_t *c) {
++    mtx.lock();
++    conns.erase(c);
++    mtx.unlock();
++    tp_dec_ref_count();
++  }
++
++public:
++  std::set<connection_t *> conns;
++  std::mutex mtx;
++};
++
++ThreadPoolConnSet threadpool_thds;
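++
++/*
++  Lifecycle sketch for threadpool_thds (illustrative): every live connection
++  pins the plugin through gPluginPtr->ref_count, so UNINSTALL PLUGIN cannot
++  unload the code while a worker might still execute it.
++
++    tp_add_connection()  -> threadpool_thds.insert(c)   // ref_count++
++    connection_abort()   -> threadpool_thds.erase(c)    // ref_count--
++    tp_end_func()        -> waits until threadpool_thds.empty()
++*/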
++
++int vio_cancel(Vio *vio, int how)
++{
++  int r= 0;
++  DBUG_ENTER("vio_cancel");
++
++  if (vio->inactive == false)
++  {
++    assert(vio->type == VIO_TYPE_TCPIP ||
++           vio->type == VIO_TYPE_SOCKET ||
++           vio->type == VIO_TYPE_SSL);
++
++    assert(mysql_socket_getfd(vio->mysql_socket) >= 0);
++    if (mysql_socket_shutdown(vio->mysql_socket, how))
++      r= -1;
++  }
++
++  DBUG_RETURN(r);
++}
++
++/**
++  Asynchronous network IO.
++
++  We use a native edge-triggered network IO multiplexing facility.
++  This maps to different APIs on different Unixes.
++
++  Currently supported are Linux with epoll, Solaris with event ports,
++  and OSX and BSD with kevent. All those APIs are used with one-shot flags
++  (the event is signalled once the client has written something into the
++  socket; the socket is then removed from the "poll-set" until the command
++  is finished, and we need to re-arm/re-register the socket).
++
++  No implementation for poll/select/AIO is currently provided.
++
++  The API closely resembles all of the above mentioned platform APIs
++  and consists of the following functions.
++
++  - io_poll_create()
++    Creates an io_poll descriptor.
++    On Linux: epoll_create()
++
++  - io_poll_associate_fd(int poll_fd, int fd, void *data)
++    Associate a file descriptor with the io poll descriptor.
++    On Linux: epoll_ctl(..EPOLL_CTL_ADD)
++
++  - io_poll_disassociate_fd(int pollfd, int fd)
++    Disassociate a file descriptor from the io poll descriptor.
++    On Linux: epoll_ctl(..EPOLL_CTL_DEL)
++
++  - io_poll_start_read(int poll_fd, int fd, void *data)
++    The same as io_poll_associate_fd(), but cannot be used before
++    io_poll_associate_fd() was called.
++    On Linux: epoll_ctl(..EPOLL_CTL_MOD)
++
++  - io_poll_wait(int pollfd, native_event *native_events, int maxevents,
++    int timeout_ms)
++
++    Wait until one or more descriptors added with io_poll_associate_fd()
++    or io_poll_start_read() become readable. Data associated with the
++    descriptors can be retrieved from the native_events array using the
++    native_event_get_userdata() function.
++
++    On Linux: epoll_wait()
++*/
++
++#if defined(__linux__)
++#ifndef EPOLLRDHUP
++/* Early 2.6 kernels did not have EPOLLRDHUP */
++#define EPOLLRDHUP 0
++#endif
++static int io_poll_create() noexcept { return epoll_create(1); }
++
++static int io_poll_associate_fd(int pollfd, int fd, void *data) noexcept {
++  struct epoll_event ev;
++  ev.data.u64 = 0; /* Keep valgrind happy */
++  ev.data.ptr = data;
++  ev.events = EPOLLIN | EPOLLET | EPOLLERR | EPOLLRDHUP | EPOLLONESHOT;
++  return epoll_ctl(pollfd, EPOLL_CTL_ADD, fd, &ev);
++}
++
++static int io_poll_start_read(int pollfd, int fd, void *data) noexcept {
++  struct epoll_event ev;
++  ev.data.u64 = 0; /* Keep valgrind happy */
++  ev.data.ptr = data;
++  ev.events = EPOLLIN | EPOLLET | EPOLLERR | EPOLLRDHUP | EPOLLONESHOT;
++  return epoll_ctl(pollfd, EPOLL_CTL_MOD, fd, &ev);
++}
++
++static int io_poll_disassociate_fd(int pollfd, int fd) noexcept {
++  struct epoll_event ev;
++  return epoll_ctl(pollfd, EPOLL_CTL_DEL, fd, &ev);
++}
++
++/*
++  Wrapper around epoll_wait.
++  NOTE - in case of EINTR, it restarts with the original timeout.
Since we use ++ either infinite or 0 timeouts, this is not critical ++*/ ++static int io_poll_wait(int pollfd, native_event *native_events, int maxevents, ++ int timeout_ms) noexcept { ++ int ret; ++ do { ++ ret = epoll_wait(pollfd, native_events, maxevents, timeout_ms); ++ } while (ret == -1 && errno == EINTR); ++ return ret; ++} ++ ++static void *native_event_get_userdata(native_event *event) noexcept { ++ return event->data.ptr; ++} ++ ++#elif defined(__FreeBSD__) || defined(__APPLE__) ++static int io_poll_create() noexcept { return kqueue(); } ++ ++static int io_poll_start_read(int pollfd, int fd, void *data) noexcept { ++ struct kevent ke; ++ EV_SET(&ke, fd, EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, data); ++ return kevent(pollfd, &ke, 1, 0, 0, 0); ++} ++ ++static int io_poll_associate_fd(int pollfd, int fd, void *data) noexcept { ++ struct kevent ke; ++ EV_SET(&ke, fd, EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, data); ++ return io_poll_start_read(pollfd, fd, data); ++} ++ ++static int io_poll_disassociate_fd(int pollfd, int fd) noexcept { ++ struct kevent ke; ++ EV_SET(&ke, fd, EVFILT_READ, EV_DELETE, 0, 0, nullptr); ++ return kevent(pollfd, &ke, 1, 0, 0, 0); ++} ++ ++static int io_poll_wait(int pollfd, struct kevent *events, int maxevents, ++ int timeout_ms) noexcept { ++ struct timespec ts; ++ int ret; ++ if (timeout_ms >= 0) { ++ ts.tv_sec = timeout_ms / 1000; ++ ts.tv_nsec = (timeout_ms % 1000) * 1000000; ++ } ++ do { ++ ret = kevent(pollfd, 0, 0, events, maxevents, ++ (timeout_ms >= 0) ? &ts : nullptr); ++ } while (ret == -1 && errno == EINTR); ++ return ret; ++} ++ ++static void *native_event_get_userdata(native_event *event) noexcept { ++ return event->udata; ++} ++#else ++#error not ported yet to this OS ++#endif ++ ++namespace { ++ ++/* ++ Prevent too many active threads executing at the same time, if the workload is ++ not CPU bound. ++*/ ++inline bool too_many_active_threads( ++ const thread_group_t &thread_group) noexcept { ++ return (thread_group.active_thread_count >= ++ 1 + (int)threadpool_oversubscribe && ++ !thread_group.stalled); ++} ++ ++/* ++ Limit the number of 'busy' threads by 1 + threadpool_toobusy. A thread ++ is busy if it is in either the active state or the waiting state (i.e. between ++ thd_wait_begin() / thd_wait_end() calls). ++*/ ++inline bool too_many_busy_threads(const thread_group_t &thread_group) noexcept { ++ return (thread_group.active_thread_count + thread_group.waiting_thread_count > ++ 1 + (int)threadpool_toobusy); ++} ++ ++inline bool too_many_connection(const thread_group_t &thread_group) noexcept { ++ return (thread_group.connection_count > (int)threadpool_toobusy - 1); ++} ++ ++/* ++ Checks if a given connection is eligible to enter the high priority queue ++ based on its current thread_pool_high_prio_mode value, available high ++ priority tickets and transactional state and whether any locks are held. 
++*/ ++inline bool connection_is_high_prio(const connection_t &c) noexcept { ++ const ulong mode = tp_get_thdvar_high_prio_mode(c.thd); ++ ++ return (mode == TP_HIGH_PRIO_MODE_STATEMENTS) || ++ (mode == TP_HIGH_PRIO_MODE_TRANSACTIONS && c.tickets > 0 && ++ (thd_is_transaction_active(c.thd) || ++ c.thd->variables.option_bits & OPTION_TABLE_LOCK || ++ c.thd->locked_tables_mode != LTM_NONE || ++ c.thd->mdl_context.has_locks() || ++ c.thd->global_read_lock.is_acquired() || ++ c.thd->mdl_context.has_locks(MDL_key::USER_LEVEL_LOCK) || ++ c.thd->mdl_context.has_locks(MDL_key::LOCKING_SERVICE))); ++} ++ ++int change_group(connection_t *c, thread_group_t *group, thread_group_t *to_group) { ++ assert(c->thread_group == group); ++ ++ /* Remove connection from the old group. */ ++ if (c->bound_to_poll_descriptor) { ++ Vio *const vio = c->thd->get_protocol_classic()->get_vio(); ++ const int fd = mysql_socket_getfd(vio->mysql_socket); ++ mysql_mutex_lock(&group->mutex); ++ io_poll_disassociate_fd(group->pollfd, fd); ++ c->bound_to_poll_descriptor = false; ++ mysql_mutex_unlock(&group->mutex); ++ } ++ c->thread_group->connection_count--; ++ ++ /* Add connection to the new group. */ ++ c->thread_group = to_group; ++ to_group->connection_count++; ++ /* Ensure that there is a listener in the new group. */ ++ int ret = 0; ++ mysql_mutex_lock(&to_group->mutex); ++ if (!to_group->thread_count) ret = create_worker(to_group, false); ++ mysql_mutex_unlock(&to_group->mutex); ++ ++ return ret; ++} ++ ++int get_avg_conn_cnt() { ++ int total_conn_cnt = 0; ++ ++ for (uint i = 0; i < group_count; i++) { ++ total_conn_cnt += all_groups[i].connection_count; ++ } ++ return ceil((total_conn_cnt + 0.0) / group_count); ++} ++ ++thread_group_t *get_change_group_to(connection_t *connection) { ++ int avg_conn_cnt = get_avg_conn_cnt(); ++ thread_group_t *group = connection->thread_group; ++ ++ thread_group_t *to_group = &all_groups[(connection->thd->thread_id()) % group_count]; ++ if (to_group->connection_count <= avg_conn_cnt || !threadpool_connection_balance) { ++ return to_group; ++ } ++ ++ for (uint i = 0; i < group_count; i++) { ++ if (group == &all_groups[i]) { ++ continue; ++ } ++ if (all_groups[i].connection_count < avg_conn_cnt || i == group_count - 1) { ++ return &all_groups[i]; ++ } ++ } ++ return &all_groups[group_count - 1]; ++} ++ ++int get_min_conn_cnt() { ++ int min_conn_cnt = INT_MAX; ++ for (uint i = 0; i < group_count; i++) { ++ min_conn_cnt = all_groups[i].connection_count < min_conn_cnt ? 
++          all_groups[i].connection_count : min_conn_cnt;
++  }
++  return min_conn_cnt;
++}
++
++bool check_change_group_low(connection_t *connection) {
++  return (!threadpool_connection_balance &&
++          connection->thread_group - all_groups != connection->thd->thread_id() % group_count) ||
++         (threadpool_connection_balance &&
++          ((connection->thread_group->connection_count > get_avg_conn_cnt() ||
++            connection->thread_group->connection_count - get_min_conn_cnt() >= 2) ||
++           (connection->thread_group - all_groups >= group_count)));
++}
++
++int change_group(connection_t *connection) {
++  int ret = -1;
++  change_group_rwlock.xlock();
++  if (check_change_group_low(connection)) {
++    thread_group_t *to_group = get_change_group_to(connection);
++    ret = change_group(connection, connection->thread_group, to_group);
++  }
++  change_group_rwlock.unxlock();
++  return ret;
++}
++
++/**
++  Check if the connection needs to migrate to a different group, either
++  because group_count changed after a thread_pool_size update, or because
++  connections are not evenly distributed across the thread groups.
++*/
++bool check_change_group(connection_t *connection) {
++  bool ret = false;
++  change_group_rwlock.slock();
++  ret = check_change_group_low(connection);
++  change_group_rwlock.unslock();
++  return ret;
++}
++
++inline bool connection_is_worker_continue(const connection_t &c) noexcept {
++  if (c.thd->is_admin_connection()) {
++    return true;
++  }
++
++  if (check_change_group(const_cast<connection_t *>(&c))) {
++    return false;
++  }
++
++  if (!too_many_connection(*(c.thread_group))) {
++    return true;
++  }
++
++  const ulong mode = tp_get_thdvar_high_prio_mode(c.thd);
++  bool ret = (mode == TP_HIGH_PRIO_MODE_TRANSACTIONS && c.tickets > 0 &&
++              (thd_is_transaction_active(c.thd) ||
++               c.thd->variables.option_bits & OPTION_TABLE_LOCK ||
++               c.thd->locked_tables_mode != LTM_NONE ||
++               c.thd->mdl_context.has_locks() ||
++               c.thd->global_read_lock.is_acquired() ||
++               c.thd->mdl_context.has_locks(MDL_key::USER_LEVEL_LOCK) ||
++               c.thd->mdl_context.has_locks(MDL_key::LOCKING_SERVICE)));
++  return ret;
++}
++
++}  // namespace
++
++/* Dequeue an element from a workqueue */
++static connection_t *queue_get(thread_group_t *thread_group) noexcept {
++  DBUG_ENTER("queue_get");
++  thread_group->queue_event_count++;
++  connection_t *c;
++
++  if ((c = thread_group->high_prio_queue.front())) {
++    thread_group->high_prio_queue.remove(c);
++  }
++  /*
++    Don't pick events from the low priority queue if there are too many
++    active + waiting threads.
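++    For example (illustrative numbers): with threadpool_toobusy = 8, a group
++    whose active + waiting thread count exceeds 9 (1 + threadpool_toobusy)
++    leaves low priority events queued, while the high priority queue above
++    is still drained.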
++  */
++  else if (!too_many_busy_threads(*thread_group) &&
++           (c = thread_group->queue.front())) {
++    thread_group->queue.remove(c);
++  }
++  DBUG_RETURN(c);
++}
++
++static connection_t *queue_get(thread_group_t *group, operation_origin origin) {
++  connection_t *ret = queue_get(group);
++  if (ret != nullptr) {
++    TP_INCREMENT_GROUP_COUNTER(group, dequeues[(int)origin]);
++  }
++  return ret;
++}
++
++static inline void queue_push(thread_group_t *thread_group, connection_t *connection)
++{
++  connection->enqueue_time= pool_timer.current_microtime;
++  thread_group->queue.push_back(connection);
++}
++
++static inline void high_prio_queue_push(thread_group_t *thread_group, connection_t *connection)
++{
++  connection->enqueue_time= pool_timer.current_microtime;
++  thread_group->high_prio_queue.push_back(connection);
++}
++
++class Thd_timeout_checker : public Do_THD_Impl {
++ private:
++  pool_timer_t *const m_timer;
++
++ public:
++  Thd_timeout_checker(pool_timer_t *timer) noexcept : m_timer(timer) {}
++
++  ~Thd_timeout_checker() override {}
++
++  void operator()(THD *thd) noexcept override {
++    if (thd_get_net_read_write(thd) != 1) return;
++
++    connection_t *connection = (connection_t *)thd->scheduler.data;
++    if (!connection) return;
++
++    if (connection->abs_wait_timeout <
++        m_timer->current_microtime.load(std::memory_order_relaxed)) {
++      /* Wait timeout exceeded, kill connection. */
++      mysql_mutex_lock(&thd->LOCK_thd_data);
++      thd->killed = THD::KILL_CONNECTION;
++      tp_post_kill_notification(thd);
++      mysql_mutex_unlock(&thd->LOCK_thd_data);
++    } else {
++      set_next_timeout_check(connection->abs_wait_timeout);
++    }
++  }
++};
++
++/*
++  Handle wait timeout:
++  Find connections that have been idle for too long and kill them.
++  Also, recalculate the time when the next timeout check should run.
++*/
++static void timeout_check(pool_timer_t *timer) {
++  DBUG_ENTER("timeout_check");
++
++  /* Reset next timeout check, it will be recalculated in the loop below */
++  timer->next_timeout_check.store(ULLONG_MAX, std::memory_order_relaxed);
++
++  Thd_timeout_checker thd_timeout_checker(timer);
++  Global_THD_manager::get_instance()->do_for_all_thd_copy(&thd_timeout_checker);
++
++  DBUG_VOID_RETURN;
++}
++
++/*
++  Timer thread.
++
++  Periodically checks whether one of the thread groups is stalled. A stall
++  happens when events are not being dequeued from the queues or from the
++  network. The primary reason for a stall is a lengthy request that executes
++  without blocking. It can also happen that a thread is waiting but the
++  storage engine forgot to call wait_begin/wait_end. In case of a stall, the
++  timer thread creates a new thread in the group.
++
++  Besides checking for stalls, the timer thread is also responsible for
++  terminating clients that have been idle for longer than wait_timeout
++  seconds.
++
++  TODO: Let the timer sleep for a long time if there is no work to be done.
++  Currently it wakes up rather often on an idle server.
++*/
++static void *timer_thread(void *param) noexcept {
++  my_thread_init();
++  DBUG_ENTER("timer_thread");
++
++  pool_timer_t *timer = (pool_timer_t *)param;
++  timer->next_timeout_check.store(ULLONG_MAX, std::memory_order_relaxed);
++  timer->current_microtime.store(my_microsecond_getsystime(),
++                                 std::memory_order_relaxed);
++
++  for (;;) {
++    struct timespec ts;
++
++    set_timespec_nsec(&ts, timer->tick_interval * 1000000ULL);
++    mysql_mutex_lock(&timer->mutex);
++    int err = mysql_cond_timedwait(&timer->cond, &timer->mutex, &ts);
++    if (timer->shutdown) {
++      mysql_mutex_unlock(&timer->mutex);
++      break;
++    }
++    if (err == ETIMEDOUT) {
++      timer->current_microtime.store(my_microsecond_getsystime(),
++                                     std::memory_order_relaxed);
++
++      /* Check stalls in thread groups */
++      for (size_t i = 0; i < array_elements(all_groups); i++) {
++        if (all_groups[i].connection_count) check_stall(&all_groups[i]);
++      }
++
++      /* Check if any client exceeded wait_timeout */
++      if (timer->next_timeout_check.load(std::memory_order_relaxed) <=
++          timer->current_microtime.load(std::memory_order_relaxed))
++        timeout_check(timer);
++    }
++    mysql_mutex_unlock(&timer->mutex);
++  }
++
++  mysql_mutex_destroy(&timer->mutex);
++  my_thread_end();
++  return nullptr;
++}
++
++/*
++  Check if both the high and low priority queues are empty.
++
++  NOTE: we also consider the low priority queue empty in case it has events,
++  but they cannot be processed due to the too_many_busy_threads() limit.
++*/
++static bool queues_are_empty(const thread_group_t &tg) noexcept {
++  return (tg.high_prio_queue.is_empty() &&
++          (tg.queue.is_empty() || too_many_busy_threads(tg)));
++}
++
++static void check_stall(thread_group_t *thread_group) {
++  if (mysql_mutex_trylock(&thread_group->mutex) != 0) {
++    /* The group is busy; don't disturb it. */
++    return;
++  }
++
++  /*
++    Check if a listener is present. If not, check whether any IO
++    events were dequeued since last time. If not, this means the
++    listener is either in a tight loop or thd_wait_begin()
++    was forgotten. Create a new worker (it will make itself the listener).
++  */
++  if (!thread_group->listener && !thread_group->io_event_count) {
++    wake_or_create_thread(thread_group, true);
++    mysql_mutex_unlock(&thread_group->mutex);
++    return;
++  }
++
++  /* Reset io event count */
++  thread_group->io_event_count = 0;
++
++  /*
++    Check whether requests from the workqueues are being dequeued.
++
++    The stall detection and resolution works as follows:
++
++    1. There is a counter thread_group->queue_event_count for the number of
++       events removed from the queues. The timer resets the counter to 0 on
++       each run.
++    2. The timer determines a stall if this counter remained 0 since the last
++       check and at least one of the high and low priority queues is not
++       empty.
++    3. Once the timer determines a stall, it sets the thread_group->stalled
++       flag and wakes an idle worker (or creates a new one, subject to
++       throttling).
++    4. The stalled flag is reset when an event is dequeued.
++
++    Q : Will this handling lead to an unbounded growth of threads, if queues
++    stall permanently?
++    A : No. If queues stall permanently, it is an indication of many very long
++    simultaneous queries. The maximum number of simultaneous queries is
++    max_connections; further, we have the threadpool_max_threads limit, above
++    which no worker threads are created. So in case there is a flood of very
++    long queries, the threadpool would slowly approach thread-per-connection
++    behavior.
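++
++    Example timeline (illustrative; one tick = thread_pool_stall_limit):
++      tick N   : a worker dequeues an event, queue_event_count becomes 1;
++                 the timer sees a non-zero count and resets it - no stall.
++      tick N+1 : queue_event_count is still 0 while a queue is non-empty,
++                 so the timer sets stalled = true and wakes or creates a
++                 worker via wake_or_create_thread().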
++
++    NOTE:
++    If long queries never wait, creation of new threads is done by the timer,
++    so it is slower than in real thread-per-connection. However, if long
++    queries do wait and indicate that via the thd_wait_begin/end callbacks,
++    thread creation will be faster.
++  */
++  if (!thread_group->queue_event_count && !queues_are_empty(*thread_group)) {
++    thread_group->stalled = true;
++    TP_INCREMENT_GROUP_COUNTER(thread_group, stalls);
++    wake_or_create_thread(thread_group, true);
++  }
++
++  /* Reset queue event count */
++  thread_group->queue_event_count = 0;
++
++  mysql_mutex_unlock(&thread_group->mutex);
++}
++
++my_thread_handle timer_thread_id;
++
++static void start_timer(pool_timer_t *timer) noexcept {
++  DBUG_ENTER("start_timer");
++  mysql_mutex_init(key_timer_mutex, &timer->mutex, nullptr);
++  mysql_cond_init(key_timer_cond, &timer->cond);
++  timer->shutdown = false;
++  mysql_thread_create(key_timer_thread, &timer_thread_id, nullptr, timer_thread, timer);
++  DBUG_VOID_RETURN;
++}
++
++static void stop_timer(pool_timer_t *timer) noexcept {
++  DBUG_ENTER("stop_timer");
++  mysql_mutex_lock(&timer->mutex);
++  timer->shutdown = true;
++  mysql_cond_signal(&timer->cond);
++  mysql_mutex_unlock(&timer->mutex);
++  my_thread_join(&timer_thread_id, nullptr);
++  DBUG_VOID_RETURN;
++}
++
++/**
++  Poll for socket events and distribute them to worker threads.
++  In many cases the current thread will handle a single event itself.
++
++  @return a ready connection, or nullptr on shutdown
++*/
++static connection_t *listener(thread_group_t *thread_group) {
++  DBUG_ENTER("listener");
++  connection_t *retval = nullptr;
++
++  for (;;) {
++    if (thread_group->shutdown) break;
++
++    native_event ev[MAX_EVENTS];
++    int cnt = io_poll_wait(thread_group->pollfd, ev, MAX_EVENTS, -1);
++
++    DBUG_EXECUTE_IF("threadpool_io_poll_wait_at_least_2_events",
++      {
++        while (cnt < 2)
++        {
++          int cnt_again = io_poll_wait(thread_group->pollfd, ev + cnt, MAX_EVENTS - cnt, -1);
++          cnt += cnt_again;
++        }
++      }
++    );
++
++    TP_INCREMENT_GROUP_COUNTER(thread_group, polls[LISTENER]);
++    if (cnt <= 0) {
++      assert(thread_group->shutdown);
++      break;
++    }
++
++    mysql_mutex_lock(&thread_group->mutex);
++
++    if (thread_group->shutdown) {
++      mysql_mutex_unlock(&thread_group->mutex);
++      break;
++    }
++
++    thread_group->io_event_count += cnt;
++
++    /*
++      We got some network events and need to make two decisions: should the
++      listener handle an event itself, and should any worker threads be woken
++      up to handle the remaining events?
++
++      Q1 : Should the listener handle an event itself, or put all events into
++      the queue and let workers handle them?
++
++      Solution :
++      Generally, a listener that handles events itself is preferable. We do
++      not want the listener thread to change its state from waiting to
++      running too often; since the listener has just woken from poll, it had
++      better use its time slice and do some work. Besides, an event that is
++      not handled goes to the queue, and then another worker must often be
++      woken up to handle it. This is not good, as we want to avoid wakeups.
++
++      The downside of a listener that also handles queries is that we can
++      potentially leave the thread group without picking up new network
++      events for a long time. It is not a major problem, because such a
++      stall will be detected sooner or later by the timer thread. Still,
++      relying on the timer is not always good, because it may "tick" too
++      slowly (a large tick interval).
++
++      We use the following strategy to solve this problem: if the queue was
++      not empty, we suspect a flood of network events and the listener stays.
++      Otherwise, it handles a query.
++
++
++      Q2: If the queue is not empty, how many workers should be woken?
++
++      Solution:
++      We generally try to keep one thread per group active (threads handling
++      queries are considered active, unless they are stuck inside some
++      "wait"). Thus, we will wake only one worker, and only if there are no
++      active threads currently and the listener is not going to handle a
++      query. When we don't wake, we hope that the currently active threads
++      will finish fast and handle the queue. If this does not happen, the
++      timer thread will detect the stall and wake a worker.
++
++      NOTE: Currently nothing is done to detect or prevent long queuing
++      times. A solution for the future would be to give up the "one active
++      thread per group" principle if events stay in the queue for too long,
++      and just wake more workers.
++    */
++
++    const bool listener_picks_event = threadpool_dedicated_listener ? false :
++      (thread_group->high_prio_queue.is_empty() && thread_group->queue.is_empty());
++
++    /*
++      If listener_picks_event is set, the listener thread will handle the
++      first event and put the rest into the queue. If listener_picks_event
++      is not set, all events go to the queue.
++    */
++    for (int i = (listener_picks_event) ? 1 : 0; i < cnt; i++) {
++      connection_t *c = (connection_t *)native_event_get_userdata(&ev[i]);
++      if (connection_is_high_prio(*c)) {
++        c->tickets--;
++        thread_group->high_prio_queue.push_back(c);
++      } else {
++        c->tickets = tp_get_thdvar_high_prio_tickets(c->thd);
++        queue_push(thread_group, c);
++      }
++    }
++
++    if (listener_picks_event) {
++      /* Handle the first event. */
++      retval = (connection_t *)native_event_get_userdata(&ev[0]);
++      TP_INCREMENT_GROUP_COUNTER(thread_group, dequeues[LISTENER]);
++      mysql_mutex_unlock(&thread_group->mutex);
++      break;
++    }
++
++    /* Upper bound on how many more worker threads may be created. */
++    int workers_in_need = (int)threadpool_toobusy -
++      thread_group->active_thread_count - thread_group->waiting_thread_count;
++
++    /* No headroom left, but the thread group has no active threads at all */
++    if (workers_in_need <= 0 && thread_group->active_thread_count == 0) {
++      workers_in_need = 1;
++    }
++
++    /* Take the smaller of the number of threads that can be created and
++       the number of threads that are really needed. */
++    workers_in_need = workers_in_need > cnt ? cnt : workers_in_need;
++
++    /* Wake up or create the required threads */
++    for (int i = 0; i < workers_in_need; i++) {
++      /* We added some work items to the queue, now wake a worker. */
++      if (wake_thread(thread_group, false)) {
++        /*
++          Wake failed, hence the group has no idle threads. To keep the
++          running thread count near its best operating point, the conditions
++          for waking or creating worker threads are relaxed: the queue is not
++          empty and the listener is not going to handle events, so we create
++          a worker here to drain the queue. Alternatively, we could just rely
++          on the timer to detect the stall and create a thread, but waiting
++          for the timer would be an inefficient and pointless delay.
++ */ ++ create_worker(thread_group, false); ++ } ++ } ++ mysql_mutex_unlock(&thread_group->mutex); ++ } ++ DBUG_RETURN(retval); ++} ++ ++/** ++ Adjust thread counters in group or global ++ whenever thread is created or is about to exit ++ ++ @param thread_group ++ @param count - 1, when new thread is created ++ -1, when thread is about to exit ++*/ ++static void add_thread_count(thread_group_t *thread_group, ++ int32 count) noexcept { ++ thread_group->thread_count += count; ++ /* worker starts out and end in "active" state */ ++ thread_group->active_thread_count += count; ++ tp_stats.num_worker_threads.fetch_add(count, std::memory_order_relaxed); ++} ++ ++/** ++ Creates a new worker thread. ++ thread_mutex must be held when calling this function ++ ++ NOTE: in rare cases, the number of threads can exceed ++ threadpool_max_threads, because we need at least 2 threads ++ per group to prevent deadlocks (one listener + one worker) ++*/ ++static int create_worker(thread_group_t *thread_group, ++ bool due_to_stall) noexcept { ++ my_thread_handle thread_id; ++ bool max_threads_reached = false; ++ int err; ++ ++ DBUG_ENTER("create_worker"); ++ if (tp_stats.num_worker_threads.load(std::memory_order_relaxed) >= ++ (int)threadpool_max_threads && ++ thread_group->thread_count >= 2) { ++ err = 1; ++ max_threads_reached = true; ++ goto end; ++ } ++ ++ err = mysql_thread_create(key_worker_thread, &thread_id, ++ thread_group->pthread_attr, worker_main, ++ thread_group); ++ if (!err) { ++ thread_group->last_thread_creation_time = my_microsecond_getsystime(); ++ Global_THD_manager::get_instance()->inc_thread_created(); ++ add_thread_count(thread_group, 1); ++ TP_INCREMENT_GROUP_COUNTER(thread_group, thread_creations); ++ ++ if (due_to_stall) { ++ TP_INCREMENT_GROUP_COUNTER(thread_group, thread_creations_due_to_stall); ++ } ++ } else { ++ set_my_errno(errno); ++ } ++ ++end: ++ if (err) { ++ print_pool_blocked_message(max_threads_reached); ++ } else { ++ pool_block_start = 0; /* Reset pool blocked timer, if it was set */ ++ } ++ ++ DBUG_RETURN(err); ++} ++ ++/** ++ Calculate microseconds throttling delay for thread creation. ++ ++ The value depends on how many threads are already in the group: ++ small number of threads means no delay, the more threads the larger ++ the delay. ++ ++ The actual values were not calculated using any scientific methods. ++ They just look right, and behave well in practice. ++ ++ TODO: Should throttling depend on thread_pool_stall_limit? ++*/ ++static ulonglong microsecond_throttling_interval( ++ const thread_group_t &thread_group) noexcept { ++ const int count = thread_group.thread_count; ++ ++ if (count < 4) return 0; ++ ++ if (count < 8) return 50 * 1000; ++ ++ if (count < 16) return 100 * 1000; ++ ++ return 200 * 1000; ++} ++ ++/** ++ Wakes a worker thread, or creates a new one. ++ ++ Worker creation is throttled, so we avoid too many threads ++ to be created during the short time. ++*/ ++static int wake_or_create_thread(thread_group_t *thread_group, ++ bool due_to_stall) { ++ DBUG_ENTER("wake_or_create_thread"); ++ ++ if (thread_group->shutdown) DBUG_RETURN(0); ++ ++ if (wake_thread(thread_group, due_to_stall) == 0) DBUG_RETURN(0); ++ ++ if (thread_group->thread_count > thread_group->connection_count) ++ DBUG_RETURN(-1); ++ ++ /* In order to achieve the best running performance of the ++ number of threads, the conditions for the wake-up or ++ creation of worker threads are relaxed. 
*/ ++ if (thread_group->active_thread_count < ++ (1 + (int)threadpool_oversubscribe)) { ++ /* ++ We're better off creating a new thread here with no delay, either there ++ are not enough active workers, or they all are all blocking and there was no ++ idle thread to wakeup. Smells like a potential deadlock or very slowly ++ executing requests, e.g sleeps or user locks. ++ */ ++ DBUG_RETURN(create_worker(thread_group, due_to_stall)); ++ } ++ ++ const ulonglong now = my_microsecond_getsystime(); ++ const ulonglong time_since_last_thread_created = ++ (now - thread_group->last_thread_creation_time); ++ ++ /* Throttle thread creation. */ ++ if (time_since_last_thread_created > ++ microsecond_throttling_interval(*thread_group)) { ++ DBUG_RETURN(create_worker(thread_group, due_to_stall)); ++ } ++ ++ TP_INCREMENT_GROUP_COUNTER(thread_group, throttles); ++ DBUG_RETURN(-1); ++} ++ ++static int thread_group_init(thread_group_t *thread_group, ++ pthread_attr_t *thread_attr) noexcept { ++ DBUG_ENTER("thread_group_init"); ++ thread_group->pthread_attr = thread_attr; ++ mysql_mutex_init(key_group_mutex, &thread_group->mutex, nullptr); ++ thread_group->pollfd = -1; ++ thread_group->shutdown_pipe[0] = -1; ++ thread_group->shutdown_pipe[1] = -1; ++ thread_group->thread_count = 0; ++ thread_group->admin_port_thread_count = 0; ++ thread_group->dump_thread_count = 0; ++ thread_group->active_thread_count = 0; ++ thread_group->connection_count = 0; ++ thread_group->waiting_thread_count = 0; ++ thread_group->io_event_count = 0; ++ thread_group->queue_event_count = 0; ++ thread_group->shutdown = false; ++ thread_group->stalled = false; ++ DBUG_RETURN(0); ++} ++ ++static void thread_group_destroy(thread_group_t *thread_group) noexcept { ++ mysql_mutex_destroy(&thread_group->mutex); ++ if (thread_group->pollfd != -1) { ++ close(thread_group->pollfd); ++ thread_group->pollfd = -1; ++ } ++ for (int i = 0; i < 2; i++) { ++ if (thread_group->shutdown_pipe[i] != -1) { ++ close(thread_group->shutdown_pipe[i]); ++ thread_group->shutdown_pipe[i] = -1; ++ } ++ } ++} ++ ++/** ++ Wake sleeping thread from waiting list ++*/ ++static int wake_thread(thread_group_t *thread_group, bool due_to_stall) noexcept { ++ DBUG_ENTER("wake_thread"); ++ worker_thread_t *thread = thread_group->waiting_threads.front(); ++ if (thread) { ++ thread->woken = true; ++ thread_group->waiting_threads.remove(thread); ++ mysql_cond_signal(&thread->cond); ++ TP_INCREMENT_GROUP_COUNTER(thread_group, wakes); ++ if (due_to_stall) { ++ TP_INCREMENT_GROUP_COUNTER(thread_group, wakes_due_to_stall); ++ } ++ DBUG_RETURN(0); ++ } ++ DBUG_RETURN(1); /* no thread in waiter list => missed wakeup */ ++} ++ ++/** ++ Shutdown for thread group ++*/ ++static void thread_group_close(thread_group_t *thread_group) noexcept { ++ DBUG_ENTER("thread_group_close"); ++ ++ mysql_mutex_lock(&thread_group->mutex); ++ if (thread_group->thread_count == 0) { ++ mysql_mutex_unlock(&thread_group->mutex); ++ thread_group_destroy(thread_group); ++ DBUG_VOID_RETURN; ++ } ++ ++ thread_group->shutdown = true; ++ thread_group->listener = nullptr; ++ ++ if (pipe(thread_group->shutdown_pipe)) { ++ mysql_mutex_unlock(&thread_group->mutex); ++ DBUG_VOID_RETURN; ++ } ++ ++ /* Wake listener */ ++ if (io_poll_associate_fd(thread_group->pollfd, ++ thread_group->shutdown_pipe[0], nullptr)) { ++ mysql_mutex_unlock(&thread_group->mutex); ++ DBUG_VOID_RETURN; ++ } ++ char c = 0; ++ if (write(thread_group->shutdown_pipe[1], &c, 1) < 0) { ++ mysql_mutex_unlock(&thread_group->mutex); ++ DBUG_VOID_RETURN; ++ 
} ++ ++ /* Wake all workers. */ ++ while (wake_thread(thread_group, false) == 0) { ++ } ++ ++ mysql_mutex_unlock(&thread_group->mutex); ++ DBUG_VOID_RETURN; ++} ++ ++/* ++ Add work to the queue. Maybe wake a worker if they all sleep. ++ ++ Currently, this function is only used when new connections need to ++ perform login (this is done in worker threads). ++*/ ++static void queue_put(thread_group_t *thread_group, connection_t *connection) { ++ DBUG_ENTER("queue_put"); ++ ++ mysql_mutex_lock(&thread_group->mutex); ++ connection->tickets = tp_get_thdvar_high_prio_tickets(connection->thd); ++ connection->enqueue_time = pool_timer.current_microtime; ++ ++ queue_push(thread_group, connection); ++ ++ /* In order to achieve the best running performance of the ++ number of threads, the conditions for the wake-up or ++ creation of worker threads are relaxed. */ ++ if (thread_group->active_thread_count < ++ 1 + (int)threadpool_oversubscribe) { ++ wake_or_create_thread(thread_group, false); ++ } ++ ++ mysql_mutex_unlock(&thread_group->mutex); ++ ++ DBUG_VOID_RETURN; ++} ++ ++/** ++ Retrieve a connection with pending event. ++ ++ Pending event in our case means that there is either a pending login request ++ (if connection is not yet logged in), or there are unread bytes on the socket. ++ ++ If there are no pending events currently, thread will wait. ++ If timeout specified in abstime parameter passes, the function returns nullptr. ++ ++ @param current_thread - current worker thread ++ @param thread_group - current thread group ++ @param abstime - absolute wait timeout ++ ++ @return ++ connection with pending event. ++ nullptr is returned if timeout has expired,or on shutdown. ++*/ ++static connection_t *get_event(worker_thread_t *current_thread, ++ thread_group_t *thread_group, ++ struct timespec *abstime) { ++ DBUG_ENTER("get_event"); ++ connection_t *connection = nullptr; ++ int err = 0; ++ ++ mysql_mutex_lock(&thread_group->mutex); ++ assert(thread_group->active_thread_count >= 0); ++ ++ for (;;) { ++ const bool oversubscribed = too_many_active_threads(*thread_group); ++ if (thread_group->shutdown) break; ++ ++ /* Check if queue is not empty */ ++ if (!oversubscribed) { ++ connection = queue_get(thread_group, WORKER); ++ if (connection) break; ++ } ++ ++ /* If there is currently no listener in the group, become one. */ ++ if (!thread_group->listener) { ++ thread_group->listener = current_thread; ++ thread_group->active_thread_count--; ++ mysql_mutex_unlock(&thread_group->mutex); ++ ++ connection = listener(thread_group); ++ ++ mysql_mutex_lock(&thread_group->mutex); ++ thread_group->active_thread_count++; ++ /* There is no listener anymore, it just returned. */ ++ thread_group->listener = nullptr; ++ break; ++ } ++ ++ /* ++ Last thing we try before going to sleep is to ++ pick a single event via epoll, without waiting (timeout 0) ++ */ ++ if (!oversubscribed) { ++ native_event nev; ++ if (io_poll_wait(thread_group->pollfd, &nev, 1, 0) == 1) { ++ thread_group->io_event_count++; ++ TP_INCREMENT_GROUP_COUNTER(thread_group, polls[WORKER]); ++ connection = (connection_t *)native_event_get_userdata(&nev); ++ ++ /* ++ Since we are going to perform an out-of-order event processing for the ++ connection, first check whether it is eligible for high priority ++ processing. We can get here even if there are queued events, so it ++ must either have a high priority ticket, or there must be not too many ++ busy threads (as if it was coming from a low priority queue). 
++ */ ++ if (connection_is_high_prio(*connection)) ++ connection->tickets--; ++ else if (too_many_busy_threads(*thread_group)) { ++ /* ++ Not eligible for high priority processing. Restore tickets and put ++ it into the low priority queue. ++ */ ++ connection->tickets = tp_get_thdvar_high_prio_tickets(connection->thd); ++ thread_group->queue.push_back(connection); ++ connection = nullptr; ++ } ++ ++ if (connection) { ++ TP_INCREMENT_GROUP_COUNTER(thread_group, dequeues[WORKER]); ++ thread_group->queue_event_count++; ++ break; ++ } ++ } ++ } ++ ++ /* And now, finally sleep */ ++ current_thread->woken = false; /* wake() sets this to true */ ++ ++ /* ++ Add current thread to the head of the waiting list and wait. ++ It is important to add thread to the head rather than tail ++ as it ensures LIFO wakeup order (hot caches, working inactivity timeout) ++ */ ++ thread_group->waiting_threads.push_front(current_thread); ++ ++ thread_group->active_thread_count--; ++ if (abstime) { ++ err = mysql_cond_timedwait(¤t_thread->cond, &thread_group->mutex, ++ abstime); ++ } else { ++ err = mysql_cond_wait(¤t_thread->cond, &thread_group->mutex); ++ } ++ thread_group->active_thread_count++; ++ ++ if (!current_thread->woken) { ++ /* ++ Thread was not signalled by wake(), it might be a spurious wakeup or ++ a timeout. Anyhow, we need to remove ourselves from the list now. ++ If thread was explicitly woken, than caller removed us from the list. ++ */ ++ thread_group->waiting_threads.remove(current_thread); ++ } ++ ++ if (err) break; ++ } ++ ++ thread_group->stalled = false; ++ mysql_mutex_unlock(&thread_group->mutex); ++ ++ DBUG_RETURN(connection); ++} ++ ++/** ++ Tells the pool that worker starts waiting on IO, lock, condition, ++ sleep() or similar. ++*/ ++ ++static void wait_begin(thread_group_t *thread_group) noexcept { ++ DBUG_ENTER("wait_begin"); ++ mysql_mutex_lock(&thread_group->mutex); ++ thread_group->active_thread_count--; ++ thread_group->waiting_thread_count++; ++ ++ assert(thread_group->active_thread_count >= 0); ++ assert(thread_group->connection_count > 0); ++ ++#ifdef THREADPOOL_CREATE_THREADS_ON_WAIT ++ /* In order to achieve the best running performance of the ++ number of threads, the conditions for the wake-up or ++ creation of worker threads are relaxed. */ ++ if ((thread_group->active_thread_count < (1 + (int)threadpool_oversubscribe)) && ++ (!queues_are_empty(*thread_group) || !thread_group->listener)) { ++ /* ++ Group might stall while this thread waits, thus wake ++ or create a worker to prevent stall. ++ */ ++ wake_or_create_thread(thread_group); ++ } ++#endif ++ ++ mysql_mutex_unlock(&thread_group->mutex); ++ DBUG_VOID_RETURN; ++} ++ ++/** ++ Tells the pool has finished waiting. ++*/ ++static void wait_end(thread_group_t *thread_group) noexcept { ++ DBUG_ENTER("wait_end"); ++ mysql_mutex_lock(&thread_group->mutex); ++ thread_group->active_thread_count++; ++ thread_group->waiting_thread_count--; ++ mysql_mutex_unlock(&thread_group->mutex); ++ DBUG_VOID_RETURN; ++} ++ ++/** ++ Allocate/initialize a new connection structure. 
++*/ ++ ++static connection_t *alloc_connection(THD *thd) noexcept { ++ DBUG_ENTER("alloc_connection"); ++ DBUG_EXECUTE_IF("simulate_tp_alloc_connection_oom", DBUG_RETURN(nullptr);); ++ ++ connection_t *connection = (connection_t *)my_malloc( ++ PSI_NOT_INSTRUMENTED /*key_memory_thread_pool_connection*/, ++ sizeof(connection_t), 0); ++ if (connection) { ++ connection->thd = thd; ++ connection->waiting = false; ++ connection->logged_in = false; ++ connection->bound_to_poll_descriptor = false; ++ connection->abs_wait_timeout = ULLONG_MAX; ++ connection->tickets = 0; ++ } ++ DBUG_RETURN(connection); ++} ++ ++/** ++ Add a new connection to thread pool.. ++*/ ++ ++bool tp_add_connection( ++ Channel_info *channel_info) { ++ DBUG_ENTER("Thread_pool_connection_handler::add_connection"); ++ SLockGuard lk(gPluginLock); ++ if (gPluginUninstalling) { ++ channel_info->send_error_and_close_channel(ER_OUT_OF_RESOURCES, 0, false); ++ DBUG_RETURN(true); ++ } ++ ++ THD *const thd = channel_info->create_thd(); ++ ++ if (unlikely(!thd)) { ++ channel_info->send_error_and_close_channel(ER_OUT_OF_RESOURCES, 0, false); ++ DBUG_RETURN(true); ++ } ++ ++ connection_t *const connection = alloc_connection(thd); ++ ++ if (unlikely(!connection)) { ++ thd->get_protocol_classic()->end_net(); ++ delete thd; ++ // channel will be closed by send_error_and_close_channel() ++ channel_info->send_error_and_close_channel(ER_OUT_OF_RESOURCES, 0, false); ++ DBUG_RETURN(true); ++ } ++ ++ delete channel_info; ++ ++ thd->set_new_thread_id(); ++ thd->start_utime = my_micro_time(); ++ ++ threadpool_thds.insert(connection); ++ Global_THD_manager::get_instance()->add_thd(thd); ++ ++ thd->scheduler.data = connection; ++ ++ /* Assign connection to a group. */ ++ thread_group_t *group = &all_groups[thd->thread_id() % group_count]; ++ ++ connection->thread_group = group; ++ ++ if (thd->is_admin_connection()) { ++ my_thread_handle thread_id; ++ mysql_mutex_lock(&group->mutex); ++ int err = mysql_thread_create(key_worker_thread, &thread_id, ++ group->pthread_attr, admin_port_worker_main, connection); ++ ++ if (err) { ++ set_my_errno(errno); ++ print_pool_blocked_message(false); ++ } else { ++ group->admin_port_thread_count++; ++ } ++ mysql_mutex_unlock(&group->mutex); ++ } else { ++ change_group_rwlock.xlock(); ++ group->connection_count++; ++ change_group_rwlock.unxlock(); ++ ++ /* ++ Add connection to the work queue. Actual login ++ will be done by a worker thread. ++ */ ++ queue_put(group, connection); ++ } ++ ++ DBUG_RETURN(false); ++} ++ ++/** ++ Terminate connection. 
++*/
++static void connection_abort(connection_t *connection) {
++  DBUG_ENTER("connection_abort");
++  threadpool_thds.erase(connection);
++
++  thread_group_t *group = connection->thread_group;
++  bool is_admin_port = connection->thd->is_admin_connection();
++  threadpool_remove_connection(connection->thd);
++
++  if (!is_admin_port) {
++    change_group_rwlock.xlock();
++    group->connection_count--;
++    change_group_rwlock.unxlock();
++  }
++
++  my_free(connection);
++  DBUG_VOID_RETURN;
++}
++
++/**
++  MySQL scheduler callback: kill connection
++*/
++
++void tp_post_kill_notification(THD *thd) noexcept {
++  DBUG_ENTER("tp_post_kill_notification");
++  if (current_thd == thd || thd->system_thread) {
++    DBUG_VOID_RETURN;
++  }
++
++  Vio *vio = thd->get_protocol_classic()->get_vio();
++  if (vio) vio_cancel(vio, SHUT_RD);
++  DBUG_VOID_RETURN;
++}
++
++alignas(CPU_LEVEL1_DCACHE_LINESIZE) std::atomic<ulonglong> tp_waits[THD_WAIT_LAST];
++
++/**
++  MySQL scheduler callback: wait begin
++*/
++void tp_wait_begin(THD *thd, int type MY_ATTRIBUTE((unused))) {
++  DBUG_ENTER("tp_wait_begin");
++
++  if (thd == nullptr) {
++    DBUG_VOID_RETURN;
++  }
++
++  connection_t *connection = (connection_t *)thd->scheduler.data;
++
++  if (connection && connection->thd &&
++      !connection->thd->is_admin_connection()) {
++    assert(!connection->waiting);
++    connection->waiting = true;
++    assert(type > 0 && type < THD_WAIT_LAST);
++    tp_waits[type]++;
++    wait_begin(connection->thread_group);
++  }
++  DBUG_VOID_RETURN;
++}
++
++/**
++  MySQL scheduler callback: wait end
++*/
++
++void tp_wait_end(THD *thd) {
++  DBUG_ENTER("tp_wait_end");
++
++  if (thd == nullptr) {
++    DBUG_VOID_RETURN;
++  }
++  connection_t *connection = (connection_t *)thd->scheduler.data;
++
++  if (connection && connection->thd &&
++      !connection->thd->is_admin_connection()) {
++    assert(connection->waiting);
++    connection->waiting = false;
++    wait_end(connection->thread_group);
++  }
++  DBUG_VOID_RETURN;
++}
++
++static void set_next_timeout_check(ulonglong abstime) {
++  DBUG_ENTER("set_next_timeout_check");
++  while (abstime < pool_timer.next_timeout_check.load()) {
++    uint64 old = pool_timer.next_timeout_check.load();
++    pool_timer.next_timeout_check.compare_exchange_weak(old, abstime);
++  }
++  DBUG_VOID_RETURN;
++}
++
++inline ulong get_wait_timeout(THD *thd) noexcept {
++  return thd->variables.net_wait_timeout;
++}
++
++/**
++  Set wait timeout for connection.
++*/
++
++static void set_wait_timeout(connection_t *c) noexcept {
++  DBUG_ENTER("set_wait_timeout");
++  /*
++    Calculate the wait deadline for this connection.
++    Instead of using my_microsecond_getsystime(), which has a syscall
++    overhead, use pool_timer.current_microtime and take
++    into account that its value could be off by at most
++    one tick interval.
++  */
++
++  c->abs_wait_timeout =
++      pool_timer.current_microtime.load(std::memory_order_relaxed) +
++      1000LL * pool_timer.tick_interval +
++      1000000LL * get_wait_timeout(c->thd);
++
++  set_next_timeout_check(c->abs_wait_timeout);
++  DBUG_VOID_RETURN;
++}
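++
++/*
++  Example (illustrative numbers): with net_wait_timeout = 28800 s and a
++  500 ms tick (thread_pool_stall_limit), abs_wait_timeout lands 28800.5 s
++  past the cached clock value, so an idle client is killed no earlier than
++  its wait_timeout and at most about one tick late.
++*/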
++
++static int start_io(connection_t *connection) {
++  /*
++    Usually, a connection will stay in the same group for its entire
++    life. However, we do allow group_count to change at runtime, which
++    means that in rare cases, when it decreases, a connection whose
++    thread group id exceeds group_count needs to migrate to another
++    group. We also support balancing the number of connections between
++    groups: when the threadpool_connection_balance variable is set to ON,
++    connections migrate to the thread group with fewer connections.
++  */
++  if (check_change_group(connection)) {
++    change_group(connection);
++  }
++  thread_group_t *group = connection->thread_group;
++
++  /*
++    Bind to the poll descriptor if not yet done.
++  */
++  Vio *vio = connection->thd->get_protocol_classic()->get_vio();
++  int fd = mysql_socket_getfd(vio->mysql_socket);
++  if (!connection->bound_to_poll_descriptor) {
++    connection->bound_to_poll_descriptor = true;
++    return io_poll_associate_fd(group->pollfd, fd, connection);
++  }
++
++  return io_poll_start_read(group->pollfd, fd, connection);
++}
++
++static void handle_event(connection_t *connection) {
++  DBUG_ENTER("handle_event");
++  int err = 0;
++
++  while (1) {
++    if (!connection->logged_in) {
++      err = threadpool_add_connection(connection->thd);
++      connection->logged_in = true;
++    } else {
++      err = threadpool_process_request(connection->thd);
++    }
++
++    if (err) {
++      goto end;
++    }
++
++    set_wait_timeout(connection);
++
++    if (!connection_is_worker_continue(*connection)) {
++      break;
++    }
++  }
++
++  if (!connection->thd->is_admin_connection()) {
++    err = start_io(connection);
++  }
++
++end:
++  if (err || connection->thd->is_admin_connection()) {
++    connection_abort(connection);
++  }
++
++  DBUG_VOID_RETURN;
++}
++
++static void *admin_port_worker_main(void *param) {
++  my_thread_init();
++  DBUG_ENTER("admin_port_worker_main");
++
++#ifdef HAVE_PSI_THREAD_INTERFACE
++  PSI_THREAD_CALL(set_thread_account)
++  (nullptr, 0, nullptr, 0);
++#endif
++
++  connection_t *connection = static_cast<connection_t *>(param);
++  assert(connection != nullptr);
++  assert(connection->thread_group != nullptr);
++  thread_group_t *group = connection->thread_group;
++
++  handle_event(connection);
++
++  mysql_mutex_lock(&group->mutex);
++  group->admin_port_thread_count--;
++  mysql_mutex_unlock(&group->mutex);
++
++  my_thread_end();
++  return nullptr;
++}
++
++/**
++  Worker thread's main function
++*/
++static void *worker_main(void *param) {
++  my_thread_init();
++
++  DBUG_ENTER("worker_main");
++
++  thread_group_t *thread_group = static_cast<thread_group_t *>(param);
++  assert(thread_group != nullptr);
++
++  group_affinity.bind_foreground_thread(thread_group - all_groups);
++
++  /* Init per-thread structure */
++  worker_thread_t this_thread;
++  mysql_cond_init(key_worker_cond, &this_thread.cond);
++  this_thread.thread_group = thread_group;
++  this_thread.event_count = 0;
++
++#ifdef HAVE_PSI_THREAD_INTERFACE
++  PSI_THREAD_CALL(set_thread_account)
++  (nullptr, 0, nullptr, 0);
++#endif
++
++  /* Run event loop */
++  for (;;) {
++    struct timespec ts;
++    set_timespec(&ts, threadpool_idle_timeout);
++    connection_t *connection = get_event(&this_thread, thread_group, &ts);
++
++    if (!connection) {
++      break;
++    }
++
++    this_thread.event_count++;
++    handle_event(connection);
++  }
++
++  /* Thread shutdown: cleanup per-worker-thread structure.
*/ ++ mysql_cond_destroy(&this_thread.cond); ++ ++ bool last_thread = false; /* last thread in group exits */ ++ mysql_mutex_lock(&thread_group->mutex); ++ add_thread_count(thread_group, -1); ++ last_thread= ((thread_group->thread_count == 0) && thread_group->shutdown); ++ mysql_mutex_unlock(&thread_group->mutex); ++ ++ /* Last thread in group exits and pool is terminating, destroy group.*/ ++ if (last_thread) { ++ thread_group_destroy(thread_group); ++ } ++ ++ group_affinity.remove_foreground_thread(); ++ my_thread_end(); ++ return nullptr; ++} ++ ++bool tp_init() { ++ DBUG_ENTER("tp_init"); ++ threadpool_started = true; ++ group_affinity.init(); ++ ++ for (uint i = 0; i < array_elements(all_groups); i++) { ++ thread_group_init(&all_groups[i], get_connection_attrib()); ++ } ++ tp_set_threadpool_size(threadpool_size); ++ if (group_count == 0) { ++ /* Something went wrong */ ++ sql_print_error("Can't set threadpool size to %d", threadpool_size); ++ DBUG_RETURN(true); ++ } ++#ifdef HAVE_PSI_INTERFACE ++ mysql_mutex_register("threadpool", mutex_list, array_elements(mutex_list)); ++ mysql_cond_register("threadpool", cond_list, array_elements(cond_list)); ++ mysql_thread_register("threadpool", thread_list, array_elements(thread_list)); ++#endif ++ ++ pool_timer.tick_interval = threadpool_stall_limit; ++ start_timer(&pool_timer); ++ DBUG_RETURN(false); ++} ++ ++std::thread *tp_end_thread = nullptr; ++ ++void tp_deinit() { ++ if (tp_end_thread != nullptr) { ++ assert(threadpool_thds.empty()); ++ tp_end_thread->join(); ++ delete tp_end_thread; ++ tp_end_thread = nullptr; ++ } ++} ++ ++void tp_end_func() { ++ if (!threadpool_started) { ++ return; ++ } ++ ++ while (!threadpool_thds.empty()) { ++ my_sleep(10000); ++ } ++ ++ stop_timer(&pool_timer); ++ ++ for (uint i = 0; i < array_elements(all_groups); i++) { ++ thread_group_close(&all_groups[i]); ++ } ++ ++ threadpool_started = false; ++} ++ ++void tp_end() { ++ DBUG_ENTER("tp_end"); ++ if (threadpool_thds.empty()) { ++ assert(tp_end_thread == nullptr); ++ tp_end_func(); ++ } else { ++ tp_end_thread = new std::thread(tp_end_func); ++ } ++ DBUG_VOID_RETURN; ++} ++ ++/** Ensure that poll descriptors are created when threadpool_size changes */ ++void tp_set_threadpool_size(uint size) noexcept { ++ if (!threadpool_started) return; ++ ++ bool success = true; ++ for (uint i = 0; i < size; i++) { ++ thread_group_t *group = &all_groups[i]; ++ mysql_mutex_lock(&group->mutex); ++ if (group->pollfd == -1) { ++ group->pollfd = io_poll_create(); ++ success = (group->pollfd >= 0); ++ if (!success) { ++ sql_print_error("io_poll_create() failed, errno=%d\n", errno); ++ } ++ } ++ mysql_mutex_unlock(&all_groups[i].mutex); ++ if (!success) { ++ change_group_rwlock.xlock(); ++ group_count = i; ++ change_group_rwlock.unlock(); ++ return; ++ } ++ } ++ change_group_rwlock.xlock(); ++ group_count = size; ++ change_group_rwlock.unlock(); ++} ++ ++void tp_set_threadpool_stall_limit(uint limit) noexcept { ++ if (!threadpool_started) { ++ return; ++ } ++ ++ mysql_mutex_lock(&(pool_timer.mutex)); ++ pool_timer.tick_interval = limit; ++ mysql_mutex_unlock(&(pool_timer.mutex)); ++ mysql_cond_signal(&(pool_timer.cond)); ++} ++ ++/** ++ Calculate number of idle/waiting threads in the pool. ++ ++ Sum idle threads over all groups. ++ Don't do any locking, it is not required for stats. 
++*/
++int tp_get_idle_thread_count() noexcept {
++  int sum = 0;
++  for (uint i = 0;
++       i < array_elements(all_groups) && (all_groups[i].pollfd >= 0); i++) {
++    sum += (all_groups[i].thread_count - all_groups[i].active_thread_count);
++  }
++  return sum;
++}
++
++/* Report threadpool problems */
++
++/**
++  Delay in microseconds, after which "pool blocked" message is printed.
++  (30 seconds == 30,000,000 microseconds)
++*/
++#define BLOCK_MSG_DELAY (30 * 1000000)
++
++#define MAX_THREADS_REACHED_MSG \
++  "Threadpool could not create an additional thread to handle queries, because the \
++number of allowed threads was reached. Increasing the 'thread_pool_max_threads' \
++parameter can help in this situation.\n \
++If the 'admin_port' parameter is set, you can still connect to the database with \
++a superuser account (it must be a TCP connection using admin_port as the TCP port) \
++and troubleshoot the situation. \
++A likely cause of pool blocks is clients that lock resources for a long time. \
++'show processlist' or 'show engine innodb status' can give additional hints."
++
++#define CREATE_THREAD_ERROR_MSG "Can't create threads in threadpool (errno=%d)."
++
++/**
++  Write a message when a blocking situation in the threadpool occurs.
++  The message is written only when the pool has been blocked for
++  BLOCK_MSG_DELAY (30) seconds. A single message is written for each
++  blocking situation (to prevent log flood).
++*/
++static void print_pool_blocked_message(bool max_threads_reached) noexcept {
++  ulonglong now = my_microsecond_getsystime();
++  static bool msg_written = false;
++
++  if (pool_block_start == 0) {
++    pool_block_start = now;
++    msg_written = false;
++  }
++
++  if (!msg_written && ((now > pool_block_start + BLOCK_MSG_DELAY) ||
++                       (now == pool_block_start))) {
++    if (max_threads_reached)
++      sql_print_error(MAX_THREADS_REACHED_MSG);
++    else
++      sql_print_error(CREATE_THREAD_ERROR_MSG, my_errno);
++
++    if (now > pool_block_start) {
++      sql_print_information("Threadpool has been blocked for %u seconds\n",
++                            (uint)((now - pool_block_start) / 1000000));
++    }
++    /* avoid repeated messages for the same blocking situation */
++    msg_written = true;
++  }
++}
+diff --git a/plugin/thread_pool/threadpool_unix.h b/plugin/thread_pool/threadpool_unix.h
+new file mode 100644
+index 000000000..3c561f2da
+--- /dev/null
++++ b/plugin/thread_pool/threadpool_unix.h
+@@ -0,0 +1,135 @@
++/* Copyright (C) 2012 Monty Program Ab
++   Copyright (C) 2022 Huawei Technologies Co., Ltd
++
++   This program is free software; you can redistribute it and/or modify
++   it under the terms of the GNU General Public License as published by
++   the Free Software Foundation; version 2 of the License.
++
++   This program is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++   GNU General Public License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with this program; if not, write to the Free Software
++   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
++   USA */
++
++#ifndef THREADPOOL_UNIX_H_
++#define THREADPOOL_UNIX_H_
++
++#include "mysql/service_thd_wait.h"
++#include "sql/sql_plist.h"
++#include "sql/mysqld.h"
++#include "threadpool.h"
++#include "violite.h"
++#include "numa_affinity_manager.h"
++
++#ifdef __linux__
++#include <sys/epoll.h>
++typedef struct epoll_event native_event;
++#endif
++#if defined(__FreeBSD__) || defined(__APPLE__)
++#include <sys/event.h>
++typedef struct kevent native_event;
++#endif
++#if defined(__sun)
++#include <port.h>
++typedef port_event_t native_event;
++#endif
++
++#define my_microsecond_getsystime() (my_getsystime()/10)
++
++struct thread_group_t;
++
++/* Per-thread structure for workers */
++struct worker_thread_t {
++  ulonglong event_count; /* number of requests handled by this thread */
++  thread_group_t *thread_group;
++  worker_thread_t *next_in_list;
++  worker_thread_t **prev_in_list;
++
++  mysql_cond_t cond;
++  bool woken;
++};
++
++typedef I_P_List<
++    worker_thread_t,
++    I_P_List_adapter<worker_thread_t, &worker_thread_t::next_in_list,
++                     &worker_thread_t::prev_in_list>>
++    worker_list_t;
++
++struct connection_t {
++  THD *thd;
++  thread_group_t *thread_group;
++  connection_t *next_in_queue;
++  connection_t **prev_in_queue;
++  ulonglong abs_wait_timeout;
++  ulonglong enqueue_time;
++  bool logged_in;
++  bool bound_to_poll_descriptor;
++  bool waiting;
++  uint tickets;
++};
++
++typedef I_P_List<connection_t,
++                 I_P_List_adapter<connection_t, &connection_t::next_in_queue,
++                                  &connection_t::prev_in_queue>,
++                 I_P_List_counter, I_P_List_fast_push_back<connection_t>>
++    connection_queue_t;
++
++const int NQUEUES = 2; /* We have high and low priority queues */
++
++enum operation_origin
++{
++  WORKER,
++  LISTENER
++};
++
++struct thread_group_counters_t
++{
++  ulonglong thread_creations;
++  ulonglong thread_creations_due_to_stall;
++  ulonglong wakes;
++  ulonglong wakes_due_to_stall;
++  ulonglong throttles;
++  ulonglong stalls;
++  ulonglong dequeues[2];
++  ulonglong polls[2];
++};
++
++struct alignas(128) thread_group_t {
++  mysql_mutex_t mutex;
++  connection_queue_t queue;
++  connection_queue_t high_prio_queue;
++  worker_list_t waiting_threads;
++  worker_thread_t *listener;
++  pthread_attr_t *pthread_attr;
++  int pollfd;
++  int thread_count;
++  int admin_port_thread_count;
++  int dump_thread_count;
++  int active_thread_count;
++  int connection_count;
++  int waiting_thread_count;
++  /* Stats for the deadlock detection timer routine.*/
++  int io_event_count;
++  int queue_event_count;
++  ulonglong last_thread_creation_time;
++  int shutdown_pipe[2];
++  bool shutdown;
++  bool stalled;
++  thread_group_counters_t counters;
++  char padding[320 - sizeof(thread_group_counters_t)];
++};
++
++static_assert(sizeof(thread_group_t) == 512,
++              "sizeof(thread_group_t) must be 512 to avoid false sharing");
++
++#define TP_INCREMENT_GROUP_COUNTER(group, var) do {group->counters.var++;}while(0)
++
++extern thread_group_t all_groups[MAX_THREAD_GROUPS];
++extern numa_affinity_manager group_affinity;
++
++#endif  // THREADPOOL_UNIX_H_
++
+diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
+index 5fd225c1e..2bccb4e18 100644
+--- a/sql/CMakeLists.txt
++++ b/sql/CMakeLists.txt
+@@ -531,6 +531,7 @@ SET(SQL_SHARED_SOURCES
+   sql_partition.cc
+   sql_partition_admin.cc
+   sql_planner.cc
++  sql_plan_cache.cc
+   sql_plugin.cc
+   sql_plugin_var.cc
+   sql_prepare.cc
+diff --git a/sql/event_scheduler.cc b/sql/event_scheduler.cc
+index c8a9ec1eb..e4a491f98 100644
+--- a/sql/event_scheduler.cc
++++ b/sql/event_scheduler.cc
+@@ -170,7 +170,6 @@ bool
post_init_event_thread(THD *thd) {
+ 
+   Global_THD_manager *thd_manager = Global_THD_manager::get_instance();
+   thd_manager->add_thd(thd);
+-  thd_manager->inc_thread_running();
+   return false;
+ }
+ 
+@@ -190,7 +189,6 @@ void deinit_event_thread(THD *thd) {
+   DBUG_PRINT("exit", ("Event thread finishing"));
+   thd->release_resources();
+   thd_manager->remove_thd(thd);
+-  thd_manager->dec_thread_running();
+   delete thd;
+ }
+ 
+diff --git a/sql/mysqld.cc b/sql/mysqld.cc
+index 83643f76a..3b99cf3c0 100644
+--- a/sql/mysqld.cc
++++ b/sql/mysqld.cc
+@@ -1137,6 +1137,7 @@ LEX_STRING opt_init_connect, opt_init_slave;
+ /* Global variables */
+ 
+ LEX_STRING opt_mandatory_roles;
++bool opt_plan_cache = false;
+ bool opt_mandatory_roles_cache = false;
+ bool opt_always_activate_granted_roles = false;
+ bool opt_bin_log;
+@@ -1363,6 +1364,9 @@ uint sync_binlog_period = 0, sync_relaylog_period = 0,
+     opt_mts_checkpoint_period, opt_mts_checkpoint_group;
+ ulong expire_logs_days = 0;
+ ulong binlog_expire_logs_seconds = 0;
++
++std::atomic<ulong> cached_prepared_stmt_count{0};
++
+ /**
+   Soft upper limit for number of sp_head objects that can be stored
+   in the sp_cache for one connection.
+@@ -8893,6 +8897,13 @@ static int show_prepared_stmt_count(THD *, SHOW_VAR *var, char *buff) {
+   return 0;
+ }
+ 
++static int show_cached_prepared_stmt_count(THD *, SHOW_VAR *var, char *buff) {
++  var->type = SHOW_LONG;
++  var->value = buff;
++  *((long *)buff) = (long)(cached_prepared_stmt_count.load());
++  return 0;
++}
++
+ static int show_table_definitions(THD *, SHOW_VAR *var, char *buff) {
+   var->type = SHOW_LONG;
+   var->value = buff;
+@@ -9180,6 +9191,8 @@ SHOW_VAR status_vars[] = {
+      SHOW_SCOPE_ALL},
+     {"Prepared_stmt_count", (char *)&show_prepared_stmt_count, SHOW_FUNC,
+      SHOW_SCOPE_GLOBAL},
++    {"Cached_prepared_stmt_count", (char *)&show_cached_prepared_stmt_count,
++     SHOW_FUNC, SHOW_SCOPE_GLOBAL},
+     {"Queries", (char *)&show_queries, SHOW_FUNC, SHOW_SCOPE_ALL},
+     {"Questions", (char *)offsetof(System_status_var, questions),
+      SHOW_LONGLONG_STATUS, SHOW_SCOPE_ALL},
+@@ -9504,6 +9517,7 @@ static int mysql_init_variables() {
+   binlog_cache_use = binlog_cache_disk_use = 0;
+   mysqld_user = mysqld_chroot = opt_init_file = opt_bin_logname = nullptr;
+   prepared_stmt_count = 0;
++  cached_prepared_stmt_count = 0;
+   mysqld_unix_port = opt_mysql_tmpdir = my_bind_addr_str = NullS;
+   new (&mysql_tmpdir_list) MY_TMPDIR;
+   memset(&global_status_var, 0, sizeof(global_status_var));
+diff --git a/sql/mysqld.h b/sql/mysqld.h
+index a7a80a223..3c8a429c0 100644
+--- a/sql/mysqld.h
++++ b/sql/mysqld.h
+@@ -308,6 +308,7 @@ extern ulong rpl_stop_slave_timeout;
+ extern bool log_bin_use_v1_row_events;
+ extern ulong what_to_log, flush_time;
+ extern ulong max_prepared_stmt_count, prepared_stmt_count;
++extern std::atomic<ulong> cached_prepared_stmt_count;
+ extern ulong open_files_limit;
+ extern bool clone_startup;
+ extern bool clone_recovery_error;
+@@ -775,6 +776,7 @@ bool update_named_pipe_full_access_group(const char *new_group_name);
+ #endif
+ 
+ extern LEX_STRING opt_mandatory_roles;
++extern bool opt_plan_cache;
+ extern bool opt_mandatory_roles_cache;
+ extern bool opt_always_activate_granted_roles;
+ 
+diff --git a/sql/mysqld_thd_manager.cc b/sql/mysqld_thd_manager.cc
+index 902fe2b2c..56f7aab26 100644
+--- a/sql/mysqld_thd_manager.cc
++++ b/sql/mysqld_thd_manager.cc
+@@ -315,6 +315,32 @@ THD *Global_THD_manager::find_thd(Find_thd_with_id *func) {
+   return nullptr;
+ }
+ 
++/**
++  This class implements callback for do_for_all_thd().
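++  (A fresh instance is created for each count_num_thread_running() call;
++  see below.)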
++ It counts the total number of running threads ++ from global thread list. ++*/ ++class Count_thread_running : public Do_THD_Impl { ++ public: ++ Count_thread_running() : m_count(0) {} ++ ~Count_thread_running() {} ++ virtual void operator()(THD *thd) { ++ if (thd->get_command() != COM_SLEEP) { ++ m_count++; ++ } ++ } ++ int get_count() { return m_count; } ++ ++ private: ++ int m_count; ++}; ++ ++void Global_THD_manager::count_num_thread_running() { ++ Count_thread_running count_thread_running; ++ do_for_all_thd(&count_thread_running); ++ atomic_num_thread_running = count_thread_running.get_count(); ++} ++ + void inc_thread_created() { + Global_THD_manager::get_instance()->inc_thread_created(); + } +diff --git a/sql/mysqld_thd_manager.h b/sql/mysqld_thd_manager.h +index 046997db6..70dcea1ef 100644 +--- a/sql/mysqld_thd_manager.h ++++ b/sql/mysqld_thd_manager.h +@@ -148,20 +148,15 @@ class Global_THD_manager { + void remove_thd(THD *thd); + + /** +- Retrieves thread running statistic variable. +- @return int Returns the total number of threads currently running ++ Count thread running statistic variable. + */ +- int get_num_thread_running() const { return atomic_num_thread_running; } ++ void count_num_thread_running(); + + /** +- Increments thread running statistic variable. +- */ +- void inc_thread_running() { atomic_num_thread_running++; } +- +- /** +- Decrements thread running statistic variable. ++ Retrieves thread running statistic variable. ++ @return int Returns the total number of threads currently running + */ +- void dec_thread_running() { atomic_num_thread_running--; } ++ int get_num_thread_running() const { return atomic_num_thread_running; } + + /** + Retrieves thread created statistic variable. +diff --git a/sql/opt_range.cc b/sql/opt_range.cc +index ec848bf6c..0577ecabd 100644 +--- a/sql/opt_range.cc ++++ b/sql/opt_range.cc +@@ -188,6 +188,7 @@ + #include "sql/thr_malloc.h" + #include "sql/uniques.h" // Unique + #include "template_utils.h" ++#include "sql/thd_raii.h" + + using std::max; + using std::min; +@@ -653,6 +654,10 @@ bool get_quick_keys(PARAM *param, QUICK_RANGE_SELECT *quick, KEY_PART *key, + SEL_ARG *key_tree, uchar *min_key, uint min_key_flag, + uchar *max_key, uint max_key_flag, uint *desc_flag, + uint num_key_parts); ++bool get_quick_keys(QUICK_RANGE_SELECT *quick, KEY_PART *key, ++ SEL_ARG *key_tree, uchar *min_key, uint min_key_flag, ++ uchar *max_key, uint max_key_flag, uint *desc_flag, ++ uint num_key_parts); + static bool eq_tree(const SEL_ROOT *a, const SEL_ROOT *b); + static bool eq_tree(const SEL_ARG *a, const SEL_ARG *b); + static bool eq_ranges_exceeds_limit(const SEL_ROOT *keypart, uint *count, +@@ -9524,6 +9529,281 @@ end: + return false; + } + ++bool get_quick_keys(QUICK_RANGE_SELECT *quick, KEY_PART *key, ++ SEL_ARG *key_tree, uchar *min_key, uint min_key_flag, ++ uchar *max_key, uint max_key_flag, uint *desc_flag, ++ uint num_key_parts) { ++ QUICK_RANGE *range; ++ uint flag = 0; ++ int min_part = key_tree->part - 1, // # of keypart values in min_key buffer ++ max_part = key_tree->part - 1; // # of keypart values in max_key buffer ++ ++ const bool asc = key_tree->is_ascending; ++ SEL_ARG *cur_key_tree = asc ? 
key_tree->left : key_tree->right; ++ if (cur_key_tree != null_element) ++ if (get_quick_keys(quick, key, cur_key_tree, min_key, min_key_flag, ++ max_key, max_key_flag, desc_flag, num_key_parts)) ++ return true; ++ uchar *tmp_min_key = min_key, *tmp_max_key = max_key; ++ uchar *param_min_key = min_key, *param_max_key = max_key; ++ key_tree->store_min_max_values(key[key_tree->part].store_length, &tmp_min_key, ++ min_key_flag, &tmp_max_key, max_key_flag, ++ &min_part, &max_part); ++ if (!asc) flag |= DESC_FLAG; ++ ++ // Stop processing key values if this is the last key part that needs to be ++ // looked into. See get_quick_select() for details. ++ if ((num_key_parts > 1) && key_tree->next_key_part && ++ key_tree->next_key_part->type == SEL_ROOT::Type::KEY_RANGE && ++ key_tree->next_key_part->root->part == ++ key_tree->part + 1) { // const key as prefix ++ if ((tmp_min_key - min_key) == (tmp_max_key - max_key) && ++ memcmp(min_key, max_key, (uint)(tmp_max_key - max_key)) == 0 && ++ key_tree->min_flag == 0 && key_tree->max_flag == 0) { ++ if (get_quick_keys(quick, key, key_tree->next_key_part->root, ++ tmp_min_key, min_key_flag | key_tree->get_min_flag(), ++ tmp_max_key, max_key_flag | key_tree->get_max_flag(), ++ (desc_flag ? desc_flag : &flag), num_key_parts - 1)) ++ return true; ++ goto end; // Ugly, but efficient ++ } ++ { ++ uint tmp_min_flag = key_tree->get_min_flag(); ++ uint tmp_max_flag = key_tree->get_max_flag(); ++ key_tree->store_next_min_max_keys(key, &tmp_min_key, &tmp_min_flag, ++ &tmp_max_key, &tmp_max_flag, &min_part, ++ &max_part); ++ flag |= tmp_min_flag | tmp_max_flag; ++ } ++ } else { ++ if (asc) ++ flag = (key_tree->min_flag & GEOM_FLAG) ++ ? key_tree->min_flag ++ : key_tree->min_flag | key_tree->max_flag; ++ else { ++ // Invert flags for DESC keypart ++ flag |= invert_min_flag(key_tree->min_flag) | ++ invert_max_flag(key_tree->max_flag); ++ } ++ } ++ ++ /* ++ Ensure that some part of min_key and max_key are used. If not, ++ regard this as no lower/upper range ++ */ ++ if ((flag & GEOM_FLAG) == 0) { ++ if (tmp_min_key != param_min_key) ++ flag &= ~NO_MIN_RANGE; ++ else ++ flag |= NO_MIN_RANGE; ++ if (tmp_max_key != param_max_key) ++ flag &= ~NO_MAX_RANGE; ++ else ++ flag |= NO_MAX_RANGE; ++ } ++ if ((flag & ~DESC_FLAG) == 0) { ++ uint length = (uint)(tmp_min_key - param_min_key); ++ if (length == (uint)(tmp_max_key - param_max_key) && ++ !memcmp(param_min_key, param_max_key, length)) { ++ const KEY *table_key = quick->head->key_info + quick->index; ++ flag |= EQ_RANGE; ++ /* ++ Note that keys which are extended with PK parts have no ++ HA_NOSAME flag. So we can use user_defined_key_parts. ++ */ ++ if ((table_key->flags & HA_NOSAME) && ++ key_tree->part == table_key->user_defined_key_parts - 1) { ++ if ((table_key->flags & HA_NULL_PART_KEY) && ++ null_part_in_key(key, param_min_key, ++ (uint)(tmp_min_key - param_min_key))) ++ flag |= NULL_RANGE; ++ else ++ flag |= UNIQUE_RANGE; ++ } ++ } ++ } ++ /* ++ Set DESC flag. We need this flag set according to the first keypart. ++ Depending on it, key values will be scanned either forward or backward, ++ preserving the order or records in the index along multiple ranges. ++ */ ++ if (desc_flag) flag = (flag & ~DESC_FLAG) | *desc_flag; ++ ++ /* Get range for retrieving rows in QUICK_SELECT::get_next */ ++ if (!(range = new (*THR_MALLOC) ++ QUICK_RANGE(param_min_key, (uint)(tmp_min_key - param_min_key), ++ min_part >= 0 ? make_keypart_map(min_part) : 0, ++ param_max_key, (uint)(tmp_max_key - param_max_key), ++ max_part >= 0 ? 
make_keypart_map(max_part) : 0, flag,
++            key_tree->rkey_func_flag)))
++    return true;  // out of memory
++
++  quick->max_used_key_length =
++      std::max(quick->max_used_key_length, uint(range->min_length));
++  quick->max_used_key_length =
++      std::max(quick->max_used_key_length, uint(range->max_length));
++  quick->used_key_parts =
++      std::max(quick->used_key_parts, uint(key_tree->part + 1));
++  if (quick->ranges.push_back(range)) return true;
++
++end:
++  cur_key_tree = asc ? key_tree->right : key_tree->left;
++  if (cur_key_tree != null_element)
++    return get_quick_keys(quick, key, cur_key_tree, min_key,
++                          min_key_flag, max_key, max_key_flag, desc_flag,
++                          num_key_parts);
++  return false;
++}
++
++bool QEP_TAB::replace_cache_key(THD *thd) {
++  PARAM param;
++  SEL_TREE *tree = nullptr;
++  MEM_ROOT alloc;
++  KEY_PART *key_parts;
++  KEY *key_info;
++  QUICK_RANGE_SELECT *quick =
++      dynamic_cast<QUICK_RANGE_SELECT *>(m_qs->quick());
++
++  if (!quick) {
++    return true;
++  }
++
++  TABLE *const head = table();
++  table_map prev_tables = 0;
++  table_map const_tables = join()->found_const_table_map;
++  table_map read_tables = join()->is_executed() ?
++      (prefix_tables() & ~added_tables()) : const_tables;
++  Query_block *query_block = join()->query_block;
++  Key_map *needed_reg_ptr = &(join()->join_tab->needed_reg);
++  const enum_order interesting_order = ORDER_NOT_RELEVANT;
++  Item *cond = join()->where_cond;
++
++  set_skip_records_in_range(true);
++  table()->init_cost_model(join()->cost_model());
++  table()->in_use = thd;
++  quick->head = table();
++  quick->set_handler(table()->file);
++
++  /* set up parameter that is passed to all functions */
++  param.thd = thd;
++  param.baseflag = head->file->ha_table_flags();
++  param.prev_tables = prev_tables | const_tables | INNER_TABLE_BIT;
++  param.read_tables = read_tables | INNER_TABLE_BIT;
++  param.current_table = head->pos_in_table_list->map();
++  param.table = head;
++  param.query_block = query_block;
++  param.keys = 0;
++  param.is_ror_scan = false;
++  param.mem_root = &alloc;
++  param.old_root = thd->mem_root;
++  param.needed_reg = needed_reg_ptr;
++  param.imerge_cost_buff.reset();
++  param.using_real_indexes = true;
++  param.remove_jump_scans = true;
++  param.force_default_mrr = (interesting_order == ORDER_DESC);
++  param.order_direction = interesting_order;
++  param.use_index_statistics = false;
++  /*
++    Set index_merge_allowed from OPTIMIZER_SWITCH_INDEX_MERGE.
++    Notice also that OPTIMIZER_SWITCH_INDEX_MERGE disables all
++    index merge sub strategies.
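++    Each sub-strategy flag below is AND-ed with index_merge_allowed, so
++    switching off index_merge disables all of the sub-strategies at once.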
++ */ ++ param.index_merge_allowed = ++ thd->optimizer_switch_flag(OPTIMIZER_SWITCH_INDEX_MERGE); ++ param.index_merge_union_allowed = ++ param.index_merge_allowed && ++ thd->optimizer_switch_flag(OPTIMIZER_SWITCH_INDEX_MERGE_UNION); ++ param.index_merge_sort_union_allowed = ++ param.index_merge_allowed && ++ thd->optimizer_switch_flag(OPTIMIZER_SWITCH_INDEX_MERGE_SORT_UNION); ++ param.index_merge_intersect_allowed = ++ param.index_merge_allowed && ++ thd->optimizer_switch_flag(OPTIMIZER_SWITCH_INDEX_MERGE_INTERSECT); ++ ++ param.skip_records_in_range = skip_records_in_range(); ++ ++ init_sql_alloc(key_memory_test_quick_select_exec, &alloc, ++ thd->variables.range_alloc_block_size, 0); ++ alloc.set_max_capacity(thd->variables.range_optimizer_max_mem_size); ++ alloc.set_error_for_capacity_exceeded(true); ++ thd->push_internal_handler(¶m.error_handler); ++ if (!(param.key_parts = ++ (KEY_PART *)alloc.Alloc(sizeof(KEY_PART) * head->s->key_parts)) || ++ fill_used_fields_bitmap(¶m)) { ++ thd->pop_internal_handler(); ++ free_root(&alloc, MYF(0)); // Return memory & allocator ++ return true; // Can't use range ++ } ++ key_parts = param.key_parts; ++ thd->mem_root = &alloc; ++ ++ { ++ /* ++ Make an array with description of all key parts of all table keys. ++ This is used in get_mm_parts function. ++ */ ++ key_info = head->key_info; ++ for (uint idx = 0; idx < head->s->keys; idx++, key_info++) { ++ KEY_PART_INFO *key_part_info; ++ ++ if (hint_key_state(thd, head->pos_in_table_list, idx, ++ NO_RANGE_HINT_ENUM, 0)) { ++ continue; ++ } ++ ++ if (key_info->flags & HA_FULLTEXT) { ++ continue; ++ } ++ ++ param.key[param.keys] = key_parts; ++ key_part_info = key_info->key_part; ++ for (uint part = 0; part < actual_key_parts(key_info); ++ part++, key_parts++, key_part_info++) { ++ key_parts->key = param.keys; ++ key_parts->part = part; ++ key_parts->length = key_part_info->length; ++ key_parts->store_length = key_part_info->store_length; ++ key_parts->field = key_part_info->field; ++ key_parts->null_bit = key_part_info->null_bit; ++ key_parts->image_type = (part < key_info->user_defined_key_parts && ++ key_info->flags & HA_SPATIAL) ++ ? 
Field::itMBR
++                                    : Field::itRAW;
++        key_parts->flag = key_part_info->key_part_flag;
++      }
++      param.real_keynr[param.keys++] = idx;
++    }
++  }
++  param.key_parts_end = key_parts;
++
++  if (cond) {
++    tree = get_mm_tree(&param, cond);
++  }
++
++  if (tree == nullptr || tree->type == SEL_TREE::IMPOSSIBLE) {
++    thd->mem_root = param.old_root;
++    thd->pop_internal_handler();
++    free_root(&alloc, MYF(0));
++    return false;
++  }
++
++  thd->mem_root = param.old_root;
++  uint key_idx = quick->index;
++  KEY_PART *key = quick->key_parts;
++  SEL_ARG *key_tree = tree->keys[key_idx]->root;
++  uchar *min_key = tree->keys[key_idx]->root->min_value;
++  uchar *max_key = tree->keys[key_idx]->root->max_value;
++
++  if (get_quick_keys(quick, key, key_tree, min_key, 0, max_key, 0, nullptr,
++                     MAX_REF_PARTS) || quick->init()) {
++    delete quick;
++    quick = nullptr;
++    thd->pop_internal_handler();
++    free_root(&alloc, MYF(0));
++    return true;
++  }
++  thd->pop_internal_handler();
++  free_root(&alloc, MYF(0));
++  return false;
++}
++
+ /*
+   Return 1 if there is only one range and this uses the whole unique key
+ */
+diff --git a/sql/opt_range.h b/sql/opt_range.h
+index 58515d0af..2c347b97e 100644
+--- a/sql/opt_range.h
++++ b/sql/opt_range.h
+@@ -483,7 +483,9 @@ class QUICK_RANGE_SELECT : public QUICK_SELECT_I {
+   friend class QUICK_ROR_INTERSECT_SELECT;
+   friend class QUICK_GROUP_MIN_MAX_SELECT;
+ 
++public:
+   Quick_ranges ranges; /* ordered array of range ptrs */
++protected:
+   bool free_file; /* TRUE <=> this->file is "owned" by this quick select */
+ 
+   /* Range pointers to be used when not using MRR interface */
+@@ -499,8 +501,10 @@ class QUICK_RANGE_SELECT : public QUICK_SELECT_I {
+   uint mrr_buf_size;            /* copy from thd->variables.read_rnd_buff_size */
+   HANDLER_BUFFER *mrr_buf_desc; /* the handler buffer */
+ 
++public:
+   /* Info about index we're scanning */
+   KEY_PART *key_parts;
++protected:
+   KEY_PART_INFO *key_part_info;
+ 
+   bool dont_free; /* Used by QUICK_SELECT_DESC */
+diff --git a/sql/sql_class.cc b/sql/sql_class.cc
+index e01d33815..9294a2d2c 100644
+--- a/sql/sql_class.cc
++++ b/sql/sql_class.cc
+@@ -907,6 +907,8 @@ void THD::cleanup_connection(void) {
+   assert(server_status == SERVER_STATUS_AUTOCOMMIT);
+   /* check prepared stmts are cleaned up */
+   assert(prepared_stmt_count == 0);
++  /* check cached prepared stmts are cleaned up */
++  assert(cached_prepared_stmt_count == 0);
+   /* check diagnostic area is cleaned up */
+   assert(get_stmt_da()->status() == Diagnostics_area::DA_EMPTY);
+   /* check if temp tables are deleted */
+@@ -1749,6 +1751,12 @@ Prepared_statement *Prepared_statement_map::find(ulong id) {
+ }
+ 
+ void Prepared_statement_map::erase(Prepared_statement *statement) {
++  if (statement->lex->query_block->qep_cache_state ==
++      Query_block::QEP_CACHE_READY) {
++    assert(cached_prepared_stmt_count > 0);
++    --cached_prepared_stmt_count;
++  }
++
+   if (statement == m_last_found_statement) m_last_found_statement = nullptr;
+   if (statement->name().str) names_hash.erase(to_string(statement->name()));
+ 
+@@ -1767,12 +1775,17 @@ void Prepared_statement_map::claim_memory_ownership(bool claim) {
+ 
+ void Prepared_statement_map::reset() {
+   if (!st_hash.empty()) {
+-#ifdef HAVE_PSI_PS_INTERFACE
+     for (auto &key_and_value : st_hash) {
+       Prepared_statement *stmt = key_and_value.second.get();
++      if (stmt->lex->query_block->qep_cache_state ==
++          Query_block::QEP_CACHE_READY) {
++        assert(cached_prepared_stmt_count > 0);
++        --cached_prepared_stmt_count;
++      }
++#ifdef HAVE_PSI_PS_INTERFACE
+       MYSQL_DESTROY_PS(stmt->get_PS_prepared_stmt());
+-    }
+ #endif
++    }
+ 
mysql_mutex_lock(&LOCK_prepared_stmt_count);
+     assert(prepared_stmt_count >= st_hash.size());
+     prepared_stmt_count -= st_hash.size();
+diff --git a/sql/sql_executor.cc b/sql/sql_executor.cc
+index 923d9a214..538ee2ad4 100644
+--- a/sql/sql_executor.cc
++++ b/sql/sql_executor.cc
+@@ -136,7 +136,6 @@ using std::unique_ptr;
+ using std::vector;
+ 
+ static int read_system(TABLE *table);
+-static int read_const(TABLE *table, TABLE_REF *ref);
+ static bool alloc_group_fields(JOIN *join, ORDER *group);
+ static inline pair<uchar *, key_part_map> FindKeyBufferAndMap(
+     const TABLE_REF *ref);
+@@ -2807,7 +2806,8 @@ static AccessPath *ConnectJoins(
+ }
+ 
+ void JOIN::create_access_paths() {
+-  assert(m_root_access_path == nullptr);
++  assert(m_root_access_path == nullptr ||
++         query_block->qep_cache_state == Query_block::QEP_CACHE_READY);
+ 
+   AccessPath *path = create_root_access_path_for_join();
+   path = attach_access_paths_for_having_and_limit(path);
+@@ -3487,7 +3487,7 @@ int ConstIterator::Read() {
+   return err;
+ }
+ 
+-static int read_const(TABLE *table, TABLE_REF *ref) {
++int read_const(TABLE *table, TABLE_REF *ref) {
+   int error;
+   DBUG_TRACE;
+ 
+diff --git a/sql/sql_executor.h b/sql/sql_executor.h
+index 465e23883..be8cf2814 100644
+--- a/sql/sql_executor.h
++++ b/sql/sql_executor.h
+@@ -284,6 +284,10 @@ class QEP_TAB : public QEP_shared_owner {
+   // Cleans up.
+   void cleanup();
+ 
++  void replace_cache_key();
++
++  bool replace_cache_key(THD *thd);
++
+   // Getters and setters
+ 
+   Item *condition_optim() const { return m_condition_optim; }
+@@ -573,6 +577,7 @@ AccessPath *GetAccessPathForDerivedTable(
+ 
+ void ConvertItemsToCopy(const mem_root_deque<Item *> &items, Field **fields,
+                         Temp_table_param *param);
++int read_const(TABLE *table, TABLE_REF *ref);
+ std::string RefToString(const TABLE_REF &ref, const KEY *key,
+                         bool include_nulls);
+ 
+diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc
+index 146a46047..0dbc2c612 100644
+--- a/sql/sql_lex.cc
++++ b/sql/sql_lex.cc
+@@ -4752,7 +4752,6 @@ void Query_block::restore_cmd_properties() {
+     tbl->restore_properties();
+     tbl->table->m_record_buffer = Record_buffer{0, 0, nullptr};
+   }
+-  assert(join == nullptr);
+ 
+   // Restore GROUP BY list
+   if (group_list_ptrs && group_list_ptrs->size() > 0) {
+diff --git a/sql/sql_lex.h b/sql/sql_lex.h
+index ce5ecd548..e2e8261b5 100644
+--- a/sql/sql_lex.h
++++ b/sql/sql_lex.h
+@@ -695,11 +695,13 @@ class Query_expression {
+   Mem_root_array<MaterializePathParameters::QueryBlock> setup_materialization(
+       THD *thd, TABLE *dst_table, bool union_distinct_only);
+ 
++ public:
+   /**
+     Convert the executor structures to a set of access paths, storing the result
+     in m_root_access_path.
+   */
+   void create_access_paths(THD *thd);
++  bool create_root_iterator(THD *thd);
+ 
+  public:
+   /**
+@@ -1171,6 +1173,17 @@ class Query_block {
+   /// @returns a map of all tables references in the query block
+   table_map all_tables_map() const { return (1ULL << leaf_table_count) - 1; }
+ 
++  /// State of the plan cache.
++  enum enum_qep_cache_state {
++    QEP_NO_CACHE,           // Initial state.
++    QEP_CACHE_START,        // Plan may be cached; set on first execution.
++    QEP_CACHE_INITIALIZED,  // Plan caching started; set on second execution.
++    QEP_CACHE_READY,        // Plan is cached.
++    QEP_CACHE_INVALID,      // Plan cannot be cached.
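++    // Observed transitions: QEP_NO_CACHE -> QEP_CACHE_START ->
++    // QEP_CACHE_INITIALIZED -> QEP_CACHE_READY, with a fall-back to
++    // QEP_CACHE_INVALID whenever the statement is found uncachable and a
++    // reset to QEP_NO_CACHE when a cached plan is invalidated.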
++  };
++  /// See enum_qep_cache_state
++  enum enum_qep_cache_state qep_cache_state { QEP_NO_CACHE };
++
+   void remove_derived(THD *thd, TABLE_LIST *tl);
+   bool remove_aggregates(THD *thd, Query_block *select);
+ 
+diff --git a/sql/sql_opt_exec_shared.h b/sql/sql_opt_exec_shared.h
+index 2018c7c77..c8de2b050 100644
+--- a/sql/sql_opt_exec_shared.h
++++ b/sql/sql_opt_exec_shared.h
+@@ -559,6 +559,7 @@ class QEP_shared_owner {
+   bool skip_records_in_range() const { return m_qs->skip_records_in_range(); }
+ 
+   void qs_cleanup();
++  void qs_cleanup(bool full);
+ 
+  protected:
+   QEP_shared *m_qs;  // qs stands for Qep_Shared
+diff --git a/sql/sql_optimizer.cc b/sql/sql_optimizer.cc
+index 29a3049e0..d5f776186 100644
+--- a/sql/sql_optimizer.cc
++++ b/sql/sql_optimizer.cc
+@@ -1128,10 +1128,17 @@ bool JOIN::alloc_qep(uint n) {
+ 
+   ASSERT_BEST_REF_IN_JOIN_ORDER(this);
+ 
+-  qep_tab = new (thd->mem_root)
+-      QEP_TAB[n + 1];  // The last one holds only the final op_type.
++  if (query_block->qep_cache_state == Query_block::QEP_CACHE_INITIALIZED) {
++    query_block->qep_cache_state = Query_block::QEP_CACHE_READY;
++    ++cached_prepared_stmt_count;
++    Prepared_stmt_arena_holder ps_arena_holder(thd);
++    qep_tab = new (thd->mem_root) QEP_TAB[n + 1];
++  } else {
++    qep_tab = new (thd->mem_root) QEP_TAB[n + 1];
++  }
+   if (!qep_tab) return true; /* purecov: inspected */
+   for (uint i = 0; i < n; ++i) qep_tab[i].init(best_ref[i]);
++  plan_cache::plan_cache(this);
+   return false;
+ }
+ 
+@@ -2773,7 +2780,13 @@ static JOIN_TAB *alloc_jtab_array(THD *thd, uint table_count) {
+   JOIN_TAB *t = new (thd->mem_root) JOIN_TAB[table_count];
+   if (!t) return nullptr; /* purecov: inspected */
+ 
+-  QEP_shared *qs = new (thd->mem_root) QEP_shared[table_count];
++  QEP_shared *qs = nullptr;
++  if (thd->lex->query_block->qep_cache_state == Query_block::QEP_CACHE_INITIALIZED) {
++    Prepared_stmt_arena_holder ps_arena_holder(thd);
++    qs = new (thd->mem_root) QEP_shared[table_count];
++  } else {
++    qs = new (thd->mem_root) QEP_shared[table_count];
++  }
+   if (!qs) return nullptr; /* purecov: inspected */
+ 
+   for (uint i = 0; i < table_count; ++i) t[i].set_qs(qs++);
+@@ -5139,8 +5152,16 @@ bool JOIN::init_planner_arrays() {
+ 
+   if (!(positions = new (thd->mem_root) POSITION[table_count])) return true;
+ 
+-  if (!(best_positions = new (thd->mem_root) POSITION[table_count + sj_nests]))
+-    return true;
++  if (query_block->qep_cache_state == Query_block::QEP_CACHE_INITIALIZED) {
++    Prepared_stmt_arena_holder ps_arena_holder(thd);
++    if (!(best_positions = new (thd->mem_root) POSITION[table_count + sj_nests])) {
++      return true;
++    }
++  } else {
++    if (!(best_positions = new (thd->mem_root) POSITION[table_count + sj_nests])) {
++      return true;
++    }
++  }
+ 
+   /*
+     Initialize data structures for tables to be joined.
+@@ -5515,8 +5536,10 @@ bool JOIN::extract_func_dependent_tables() {
+             join_read_const_table(tab, positions + const_tables - 1);
+         if (status > 0)
+           return true;
+-        else if (status == 0)
++        else if (status == 0) {
+           found_const_table_map |= tl->map();
++          plan_cache::check_query_plan_cachable(query_block);
++        }
+         break;
+       } else
+         found_ref |= refs;  // Table is const if all refs are const
+@@ -5664,6 +5687,7 @@ bool JOIN::estimate_rowcount() {
+       }
+     }
+     if (records != HA_POS_ERROR) {
++      plan_cache::check_query_plan_cachable(query_block);
+       tab->found_records = records;
+       tab->read_time =
+           tab->quick() ? 
tab->quick()->cost_est.total_cost() : 0.0; +diff --git a/sql/sql_optimizer.h b/sql/sql_optimizer.h +index 53a88995b..c6fcec517 100644 +--- a/sql/sql_optimizer.h ++++ b/sql/sql_optimizer.h +@@ -51,6 +51,7 @@ + #include "sql/sql_select.h" // Key_use + #include "sql/table.h" + #include "sql/temp_table_param.h" ++#include "sql/sql_plan_cache.h" + + enum class Subquery_strategy : int; + class COND_EQUAL; +@@ -96,8 +97,6 @@ class ORDER_with_src { + public: + ORDER *order; ///< ORDER expression that we are wrapping with this class + Explain_sort_clause src; ///< origin of order list +- +- private: + int flags; ///< bitmap of Explain_sort_property + + public: +@@ -930,6 +929,7 @@ class JOIN { + + bool alloc_indirection_slices(); + ++public: + /** + Convert the executor structures to a set of access paths, storing + the result in m_root_access_path. +@@ -947,6 +947,7 @@ class JOIN { + + void create_access_paths_for_index_subquery(); + ++private: + /** @{ Helpers for create_access_paths. */ + AccessPath *create_root_access_path_for_join(); + AccessPath *attach_access_paths_for_having_and_limit(AccessPath *path); +@@ -957,6 +958,9 @@ class JOIN { + (after you create an iterator from it). + */ + AccessPath *m_root_access_path = nullptr; ++ ++public: ++ plan_cache::Plan_cache_context plan_cache_context; + }; + + /** +diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc +index 52f7f1643..e4952a63a 100644 +--- a/sql/sql_parse.cc ++++ b/sql/sql_parse.cc +@@ -1599,7 +1599,6 @@ bool dispatch_command(THD *thd, const COM_DATA *com_data, + } + thd->set_query_id(next_query_id()); + thd->reset_rewritten_query(); +- thd_manager->inc_thread_running(); + + if (!(server_command_flags[command] & CF_SKIP_QUESTIONS)) + thd->status_var.questions++; +@@ -2265,8 +2264,6 @@ done: + /* Prevent rewritten query from getting "stuck" in SHOW PROCESSLIST. */ + thd->reset_rewritten_query(); + +- thd_manager->dec_thread_running(); +- + /* Freeing the memroot will leave the THD::work_part_info invalid. */ + thd->work_part_info = nullptr; + +diff --git a/sql/sql_plan_cache.cc b/sql/sql_plan_cache.cc +new file mode 100644 +index 000000000..1d5458048 +--- /dev/null ++++ b/sql/sql_plan_cache.cc +@@ -0,0 +1,334 @@ ++/* Copyright (c) 2025, Huawei and/or its affiliates. All rights reserved. ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License, version 2.0, ++ as published by the Free Software Foundation. ++ ++ This program is also distributed with certain software (including ++ but not limited to OpenSSL) that is licensed under separate terms, ++ as designated in a particular file or component or in included license ++ documentation. The authors of MySQL hereby grant you an additional ++ permission to link the program and your derivative works with the ++ separately licensed software that they have included with MySQL. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License, version 2.0, for more details. 
++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ ++#include "sql/filesort.h" ++#include "sql/sql_plan_cache.h" ++#include "sql/sql_class.h" ++#include "sql/sql_optimizer.h" ++ ++namespace plan_cache { ++ ++extern "C" { ++ ulonglong get_table_ref_version_c(TABLE_SHARE *s) { ++ return s->get_table_ref_version(); ++ } ++ ++ TABLE* qep_table_c(JOIN *join) { ++ return join->qep_tab[0].table(); ++ } ++} ++ ++bool Plan_cache_context::is_environment_changed(THD *thd) { ++ bool result; ++ ++ __asm__ __volatile__ ( ++ "ldr x2, [%[this_ptr], %[ctx_opt_sw]]\n" ++ "ldr x4, [%[thd_ptr], %[var_opt_sw]]\n" ++ "cmp x2, x4\n" ++ "b.ne .Lret_true%=\n" ++ "ldr x2, [%[this_ptr], %[ctx_charset]]\n" ++ "ldr x4, [%[thd_ptr], %[var_charset]]\n" ++ "cmp x2, x4\n" ++ "b.ne .Lret_true%=\n" ++ "mov %w[ret], #0\n" ++ "b .Lexit%=\n" ++ ".Lret_true%=:\n" ++ "mov %w[ret], #1\n" ++ ".Lexit%=:\n" ++ : [ret] "=r" (result) ++ : [this_ptr] "r" (this), ++ [thd_ptr] "r" (thd), ++ [ctx_opt_sw] "i" (offsetof(Plan_cache_context, optimizer_switch)), ++ [var_opt_sw] "i" (offsetof(THD, variables) + ++ offsetof(System_variables, optimizer_switch)), ++ [ctx_charset] "i" (offsetof(Plan_cache_context, character_set_client)), ++ [var_charset] "i" (offsetof(THD, variables) + ++ offsetof(System_variables, character_set_client)) ++ : "x2", "x4", "memory", "cc" ++ ); ++ ++ return result; ++} ++ ++void invalidate_cached_plan(Query_block *query_block) { ++ JOIN *plan = query_block->join; ++ if (!plan) { ++ return; ++ } ++ ++ ::destroy(plan); ++ query_block->join = nullptr; ++ if (cached_prepared_stmt_count > 0) { ++ cached_prepared_stmt_count--; ++ } ++ ++ if (query_block->qep_cache_state != Query_block::QEP_CACHE_INVALID) { ++ query_block->qep_cache_state = Query_block::QEP_NO_CACHE; ++ } ++} ++ ++void check_query_plan_cachable(Query_block *query_block) { ++ JOIN *join = query_block->join; ++ THD *thd = join->thd; ++ uint leaf_table_count = query_block->leaf_table_count; ++ ++ // user asked to disable cache plans ++ if (!opt_plan_cache) { ++ return; ++ } ++ ++ // Already checked ++ if (query_block->qep_cache_state != Query_block::QEP_NO_CACHE) { ++ return; ++ } ++ ++ if (query_block->parent_lex->sql_command != SQLCOM_SELECT || ++ query_block->type() != enum_explain_type::EXPLAIN_SIMPLE || ++ query_block->outer_query_block() || ++ query_block->first_inner_query_expression() || ++ leaf_table_count != 1 || ++ query_block->partitioned_table_count != 0 || ++ query_block->join->select_distinct || ++ query_block->join->need_tmp_before_win || ++ !thd->lex->m_sql_cmd || ++ query_block->olap == ROLLUP_TYPE || ++ thd->sp_runtime_ctx || ++ thd->lex->sroutines_list.elements > 0) { ++ query_block->qep_cache_state = Query_block::QEP_CACHE_INVALID; ++ return; ++ } ++ ++ uint total_key_parts = 0; ++ for (uint i = 0; i < leaf_table_count; ++i) { ++ JOIN_TAB *join_tab = &query_block->join->join_tab[i]; ++ uint key_parts = join_tab->ref().key_parts; ++ total_key_parts += key_parts; ++ ++ if (thd->stmt_arena->get_state() == Query_arena::STMT_PREPARED || ++ thd->stmt_arena->get_state() == Query_arena::STMT_EXECUTED) { ++ if (join_tab->table()->is_nullable()) { ++ query_block->qep_cache_state = Query_block::QEP_CACHE_INVALID; ++ return; ++ } ++ } else { ++ return; ++ } ++ } ++ ++ if (query_block->where_cond()->type() == Item::COND_ITEM && ++ (((Item_cond 
*)query_block->where_cond())->argument_list()->size()) != ++ total_key_parts) { ++ query_block->qep_cache_state = Query_block::QEP_CACHE_INVALID; ++ return; ++ } ++ ++ if (WalkItem(query_block->where_cond(), ++ enum_walk::POSTFIX, ++ [](Item *sub_item) { ++ if (sub_item->type() == Item::FUNC_ITEM && ++ ((Item_func *)sub_item)->functype() == Item_func::GUSERVAR_FUNC) { ++ return true; ++ } ++ return false; ++ })) { ++ query_block->qep_cache_state = Query_block::QEP_CACHE_INVALID; ++ return; ++ } ++ ++ if (is_temporary_table(query_block->leaf_tables) || ++ query_block->leaf_tables->schema_table || ++ query_block->leaf_tables->is_system_view || ++ is_infoschema_db(query_block->leaf_tables->db) || ++ is_perfschema_db(query_block->leaf_tables->db)) { ++ query_block->qep_cache_state = Query_block::QEP_CACHE_INVALID; ++ return; ++ } ++ ++ query_block->qep_cache_state = Query_block::QEP_CACHE_START; ++} ++ ++void plan_cache(JOIN *join) { ++ __asm__ __volatile__ ( ++ "ldr x20, [%[join_ptr], #%[thd_off]]\n" ++ "add x21, %[join_ptr], #%[ctx_off]\n" ++ "stp x29, x30, [sp, #-32]!\n" ++ "mov x0, %[join_ptr]\n" ++ "bl qep_table_c\n" ++ "mov x23, x0\n" ++ "ldp x29, x30, [sp], #32\n" ++ "ldr x24, [%[join_ptr], #%[query_block_off]]\n" ++ "ldrb w25, [x24, #%[qep_state_off]]\n" ++ "cmp w25, #%[cache_ready]\n" ++ "b.ne .Lexit%=\n" ++ "ldr x26, [x20, #%[opt_switch_off]]\n" ++ "str x26, [%[join_ptr], #%[ctx_opt_sw_off]]\n" ++ "ldr x26, [x23, #%[table_file_off]]\n" ++ "ldr x27, [x26, #%[stats_records_off]]\n" ++ "str x27, [%[join_ptr], #%[ctx_records_off]]\n" ++ "stp x29, x30, [sp, #-32]!\n" ++ "ldr x0, [x23, #%[table_s_off]]\n" ++ "bl get_table_ref_version_c\n" ++ "str x0, [%[join_ptr], #%[ctx_table_ver_off]]\n" ++ "ldp x29, x30, [sp], #32\n" ++ "ldr x27, [x20, #%[charset_off]]\n" ++ "str x27, [%[join_ptr], #%[ctx_charset_off]]\n" ++ ".Lexit%=:\n" ++ : ++ : [join_ptr] "r" (join), ++ [thd_off] "i" (offsetof(JOIN, thd)), ++ [ctx_off] "i" (offsetof(JOIN, plan_cache_context)), ++ [query_block_off] "i" (offsetof(JOIN, query_block)), ++ [qep_state_off] "i" (offsetof(Query_block, qep_cache_state)), ++ [cache_ready] "i" (Query_block::QEP_CACHE_READY), ++ [opt_switch_off] "i" (offsetof(THD, variables) + ++ offsetof(System_variables, optimizer_switch)), ++ [ctx_opt_sw_off] "i" (offsetof(JOIN, plan_cache_context) + ++ offsetof(Plan_cache_context, optimizer_switch)), ++ [table_file_off] "i" (offsetof(TABLE, file)), ++ [stats_records_off] "i" (offsetof(handler, stats) + ++ offsetof(ha_statistics, records)), ++ [ctx_records_off] "i" (offsetof(JOIN, plan_cache_context) + ++ offsetof(Plan_cache_context, table_records)), ++ [table_s_off] "i" (offsetof(TABLE, s)), ++ [ctx_table_ver_off] "i" (offsetof(JOIN, plan_cache_context) + ++ offsetof(Plan_cache_context, table_version)), ++ [charset_off] "i" (offsetof(THD, variables) + ++ offsetof(System_variables, character_set_client)), ++ [ctx_charset_off] "i" (offsetof(JOIN, plan_cache_context) + ++ offsetof(Plan_cache_context, character_set_client)) ++ : "x0", "x1", "x20", "x21", "x23", "x24", "x25", "x26", "x27", ++ "memory", "cc" ++ ); ++} ++ ++static bool apply_cached_plan(THD *thd, LEX *lex) { ++ Query_expression *unit = lex->unit; ++ JOIN *join = unit->first_query_block()->join; ++ TABLE_LIST *leaf_tables = lex->query_block->leaf_tables; ++ uint leaf_table_count = lex->query_block->leaf_table_count; ++ ++ join->grouped = lex->query_block->is_explicitly_grouped(); ++ join->implicit_grouping = lex->query_block->is_implicitly_grouped(); ++ join->select_distinct = 
lex->query_block->is_distinct(); ++ join->order.order = lex->query_block->order_list.first; ++ join->order.src = ESC_ORDER_BY; ++ join->order.flags = lex->query_block->order_list.first ? ++ ESP_EXISTS : ESP_none; ++ join->group_list.order = lex->query_block->group_list.first; ++ join->group_list.src = ESC_GROUP_BY; ++ join->group_list.flags = lex->query_block->group_list.first ? ++ ESP_EXISTS : ESP_none; ++ join->fields = &(lex->query_block->fields); ++ ++ for (uint i = 0; i < leaf_table_count; ++i) { ++ QEP_TAB *qep_tab = &join->qep_tab[i]; ++ ++ assert(leaf_tables); ++ qep_tab->set_table(leaf_tables->table); ++ leaf_tables = leaf_tables->next_leaf; ++ ++ if (join->plan_is_const() && qep_tab->ref().key_parts) { ++ qep_tab->replace_cache_key(); ++ int status = read_const(qep_tab->table(), &qep_tab->ref()); ++ if (status == 0) { ++ if (unit->first_query_block()->with_sum_func && join->sum_funcs) { ++ if (prepare_sum_aggregators(join->sum_funcs, false)) { ++ return true; ++ } ++ if (setup_sum_funcs(thd, join->sum_funcs) || thd->is_fatal_error()) { ++ return true; ++ } ++ } ++ } else if (status == -1) { ++ join->create_access_paths_for_zero_rows(); ++ goto unit_set_optimized; ++ } else if (status > 0) { ++ return true; ++ } ++ } else { ++ if (unit->first_query_block()->with_sum_func) { ++ if (join->sum_funcs) { ++ if (prepare_sum_aggregators(join->sum_funcs, false)) { ++ return true; ++ } ++ if (setup_sum_funcs(thd, join->sum_funcs) || thd->is_fatal_error()) { ++ return true; ++ } ++ } ++ } ++ if (qep_tab->replace_cache_key(thd)) { ++ join->query_block->qep_cache_state = Query_block::QEP_CACHE_INVALID; ++ return true; ++ } ++ if (!join->group_list.empty() || ++ (!join->order.empty() && !join->m_windowing_steps)) { ++ bool keep_buffers = ++ join->query_block->master_query_expression()->item != nullptr && ++ join->query_block->master_query_expression()->item->is_uncacheable(); ++ qep_tab->filesort = new (thd->mem_root) ++ Filesort(thd, {qep_tab->table()}, keep_buffers, join->order.order, ++ HA_POS_ERROR, false, false, false, false); ++ } ++ } ++ } ++ join->create_access_paths(); ++unit_set_optimized: ++ unit->set_optimized(); ++ unit->create_access_paths(thd); ++ unit->create_root_iterator(thd); ++ ++ return false; ++} ++ ++bool apply_cached_plan_if_suitable(THD *thd, LEX *lex) { ++ Query_block *query_block = lex->query_block; ++ ++ if (query_block->qep_cache_state != Query_block::QEP_CACHE_READY) { ++ return false; ++ } ++ ++ if (opt_plan_cache) { ++ JOIN *join = query_block->join; ++ assert(join); ++ Plan_cache_context &context = join->plan_cache_context; ++ TABLE_LIST *tl = query_block->leaf_tables; ++ if (tl->table->s->is_secondary_engine()) { ++ return false; ++ } ++ if (tl->fetch_number_of_rows()) { ++ return false; ++ } ++ if (tl->table->s->get_table_ref_version() != context.table_version || ++ context.is_environment_changed(thd) || ++ context.is_table_stats_changed_sharply( ++ tl->table->file->stats.records) || ++ apply_cached_plan(thd, lex)) { ++ invalidate_cached_plan(query_block); ++ return false; ++ } ++ return true; ++ } else { ++ invalidate_cached_plan(query_block); ++ } ++ return false; ++} ++ ++} // namespace plan_cache +diff --git a/sql/sql_plan_cache.h b/sql/sql_plan_cache.h +new file mode 100644 +index 000000000..1193e1ca6 +--- /dev/null ++++ b/sql/sql_plan_cache.h +@@ -0,0 +1,59 @@ ++#ifndef PLAN_CACHE_INCLUDED ++#define PLAN_CACHE_INCLUDED ++ ++/* Copyright (c) 2025, Huawei and/or its affiliates. All rights reserved. 
++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License, version 2.0, ++ as published by the Free Software Foundation. ++ ++ This program is also distributed with certain software (including ++ but not limited to OpenSSL) that is licensed under separate terms, ++ as designated in a particular file or component or in included license ++ documentation. The authors of MySQL hereby grant you an additional ++ permission to link the program and your derivative works with the ++ separately licensed software that they have included with MySQL. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License, version 2.0, for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ ++#include "my_base.h" ++#include "sql/mysqld.h" ++ ++class JOIN; ++struct LEX; ++class Query_block; ++ ++namespace plan_cache { ++ ++struct Plan_cache_context { ++ static constexpr float allow_change_ratio = 0.2f; ++ ++ ulonglong optimizer_switch = 0; ++ const CHARSET_INFO *character_set_client; ++ ulonglong table_version = 0; ++ ha_rows table_records = 0; ++ ++ bool is_environment_changed(THD *thd); ++ bool is_table_stats_changed_sharply(ha_rows new_rows) { ++ if ((std::abs(longlong(new_rows - table_records)) / ++ (float)(table_records + 1)) > allow_change_ratio) { ++ return true; ++ } ++ return false; ++ } ++}; ++ ++void plan_cache(JOIN *join); ++void invalidate_cached_plan(Query_block *query_block); ++void check_query_plan_cachable(Query_block *query_block); ++bool apply_cached_plan_if_suitable(THD *thd, LEX *lex); ++ ++} // namespace plan_cache ++ ++#endif // PLAN_CACHE_INCLUDED +diff --git a/sql/sql_prepare.cc b/sql/sql_prepare.cc +index 1d4f5983d..d3512233b 100644 +--- a/sql/sql_prepare.cc ++++ b/sql/sql_prepare.cc +@@ -163,6 +163,7 @@ When one supplies long data for a placeholder: + #include "sql/sql_digest_stream.h" + #include "sql/sql_handler.h" // mysql_ha_rm_tables + #include "sql/sql_insert.h" // Query_result_create ++#include "sql/sql_plan_cache.h" // invalidate_cached_plan + #include "sql/sql_lex.h" + #include "sql/sql_list.h" + #include "sql/sql_parse.h" // sql_command_flags +@@ -3153,6 +3154,7 @@ bool Prepared_statement::reprepare() { + Prepared_statement copy(thd); + + swap_prepared_statement(©); ++ plan_cache::invalidate_cached_plan(copy.lex->query_block); + auto copy_guard = + create_scope_guard([&]() { swap_prepared_statement(©); }); + +diff --git a/sql/sql_resolver.cc b/sql/sql_resolver.cc +index 0c116ca63..a19ac8cee 100644 +--- a/sql/sql_resolver.cc ++++ b/sql/sql_resolver.cc +@@ -170,6 +170,37 @@ static Item *create_rollup_switcher(THD *thd, Query_block *query_block, + bool Query_block::prepare(THD *thd, mem_root_deque *insert_field_list) { + DBUG_TRACE; + ++ if (qep_cache_state == Query_block::QEP_CACHE_READY) { ++ thd->mark_used_columns = MARK_COLUMNS_READ; ++ thd->want_privilege = SELECT_ACL; ++ ++ is_item_list_lookup = false; ++ ++ /* Check that all tables, fields, conds and order are ok */ ++ ++ if (!(active_options() & OPTION_SETUP_TABLES_DONE)) { ++ if (setup_tables(thd, get_table_list(), false)) { ++ return true; ++ } ++ } ++ ++ is_item_list_lookup = true; ++ ++ if (setup_fields(thd, 
thd->want_privilege, /*allow_sum_func=*/true,
++                     /*split_sum_funcs=*/true, /*column_update=*/false,
++                     insert_field_list, &fields, base_ref_items)) {
++      return true;
++    }
++
++    // Set up join conditions and WHERE clause
++    if (setup_conds(thd)) {
++      return true;
++    }
++
++    assert(!thd->is_error());
++    return false;
++  }
++
+   assert(this == thd->lex->current_query_block());
+   assert(join == nullptr);
+   assert(!thd->is_error());
+diff --git a/sql/sql_select.cc b/sql/sql_select.cc
+index 0d5929ff9..1501feb62 100644
+--- a/sql/sql_select.cc
++++ b/sql/sql_select.cc
+@@ -772,16 +772,18 @@ static bool optimize_secondary_engine(THD *thd) {
+ bool Sql_cmd_dml::execute_inner(THD *thd) {
+   Query_expression *unit = lex->unit;
+ 
+-  if (unit->optimize(thd, /*materialize_destination=*/nullptr,
+-                     /*create_iterators=*/true))
+-    return true;
++  if (!plan_cache::apply_cached_plan_if_suitable(thd, lex)) {
++    if (unit->optimize(thd, /*materialize_destination=*/nullptr,
++                       /*create_iterators=*/true))
++      return true;
+ 
+-  // Calculate the current statement cost. It will be made available in
+-  // the Last_query_cost status variable.
+-  thd->m_current_query_cost = accumulate_statement_cost(lex);
++    // Calculate the current statement cost. It will be made available in
++    // the Last_query_cost status variable.
++    thd->m_current_query_cost = accumulate_statement_cost(lex);
+ 
+-  // Perform secondary engine optimizations, if needed.
+-  if (optimize_secondary_engine(thd)) return true;
++    // Perform secondary engine optimizations, if needed.
++    if (optimize_secondary_engine(thd)) return true;
++  }
+ 
+   // We know by now that execution will complete (successful or with error)
+   lex->set_exec_completed();
+@@ -1805,7 +1807,16 @@ bool Query_block::optimize(THD *thd) {
+   DBUG_TRACE;
+ 
+   assert(join == nullptr);
+-  JOIN *const join_local = new (thd->mem_root) JOIN(thd, this);
++  JOIN *join_local = nullptr;
++
++  if (qep_cache_state == Query_block::QEP_CACHE_START) {
++    qep_cache_state = Query_block::QEP_CACHE_INITIALIZED;
++    Prepared_stmt_arena_holder ps_arena_holder(thd);
++    join_local = new (thd->mem_root) JOIN(thd, this);
++  } else {
++    join_local = new (thd->mem_root) JOIN(thd, this);
++  }
++
+   if (!join_local) return true; /* purecov: inspected */
+ 
+   /*
+@@ -2109,16 +2120,35 @@ void calc_length_and_keyparts(Key_use *keyuse, JOIN_TAB *tab, const uint key,
+ 
+ bool init_ref(THD *thd, unsigned keyparts, unsigned length, unsigned keyno,
+               TABLE_REF *ref) {
++  Query_block *query_block = thd->lex->query_block;
+   ref->key_parts = keyparts;
+   ref->key_length = length;
+   ref->key = keyno;
+-  if (!(ref->key_buff = thd->mem_root->ArrayAlloc<uchar>(ALIGN_SIZE(length))) ||
+-      !(ref->key_buff2 =
+-            thd->mem_root->ArrayAlloc<uchar>(ALIGN_SIZE(length))) ||
+-      !(ref->key_copy = thd->mem_root->ArrayAlloc<store_key *>(keyparts)) ||
+-      !(ref->items = thd->mem_root->ArrayAlloc<Item *>(keyparts)) ||
+-      !(ref->cond_guards = thd->mem_root->ArrayAlloc<bool>(keyparts))) {
+-    return true;
++  if (query_block->qep_cache_state == Query_block::QEP_CACHE_INITIALIZED) {
++    Prepared_stmt_arena_holder ps_arena_holder(thd);
++    if (!(ref->key_buff =
++              thd->mem_root->ArrayAlloc<uchar>(ALIGN_SIZE(ref->key_length))) ||
++        !(ref->key_buff2 =
++              thd->mem_root->ArrayAlloc<uchar>(ALIGN_SIZE(ref->key_length))) ||
++        !(ref->key_copy =
++              thd->mem_root->ArrayAlloc<store_key *>(ref->key_parts)) ||
++        !(ref->items = thd->mem_root->ArrayAlloc<Item *>(ref->key_parts)) ||
++        !(ref->cond_guards =
++              thd->mem_root->ArrayAlloc<bool>(ref->key_parts))) {
++      return true;
++    }
++  } else {
++    if (!(ref->key_buff =
++              thd->mem_root->ArrayAlloc<uchar>(ALIGN_SIZE(ref->key_length))) ||
++        !(ref->key_buff2 =
++              thd->mem_root->ArrayAlloc<uchar>(ALIGN_SIZE(ref->key_length))) ||
++        !(ref->key_copy =
++              thd->mem_root->ArrayAlloc<store_key *>(ref->key_parts)) ||
++        !(ref->items = thd->mem_root->ArrayAlloc<Item *>(ref->key_parts)) ||
++        !(ref->cond_guards =
++              thd->mem_root->ArrayAlloc<bool>(ref->key_parts))) {
++      return true;
++    }
+   }
+   ref->key_err = true;
+   ref->null_rejecting = 0;
+@@ -3425,6 +3455,44 @@ void QEP_TAB::cleanup() {
+   }
+ }
+ 
++void QEP_TAB::replace_cache_key() {
++  TABLE_REF *t_ref = &ref();
++  THD *const thd = join()->thd;
++  Key_use *keyuse = nullptr;
++  KEY *const keyinfo = table()->key_info + t_ref->key;
++  uchar *key_buff = t_ref->key_buff;
++
++  memset(key_buff, 0, ALIGN_SIZE(t_ref->key_length) * 2);
++
++  for (uint part_no = 0; part_no < t_ref->key_parts; part_no++) {
++    keyuse = &position()->key[part_no];
++    bool maybe_null = keyinfo->key_part[part_no].null_bit;
++
++    store_key *s_key = get_store_key(thd, keyuse->val, keyuse->used_tables,
++                                     join()->const_table_map,
++                                     &keyinfo->key_part[part_no], key_buff,
++                                     maybe_null);
++    if (unlikely(!s_key || thd->is_fatal_error())) {
++      assert(0);
++    }
++
++    bool dummy_value = false;
++    uchar *arg = pointer_cast<uchar *>(&dummy_value);
++    keyuse->val->walk(&Item::repoint_const_outer_ref, enum_walk::PREFIX, arg);
++
++    (void)s_key->copy();
++
++    if (s_key->null_key) {
++      t_ref->key_copy[part_no] = s_key;  // Reevaluate in JOIN::exec()
++    } else {
++      t_ref->key_copy[part_no] = nullptr;
++    }
++
++    key_buff += keyinfo->key_part[part_no].store_length;
++  }
++}
++
+ void QEP_shared_owner::qs_cleanup() {
+   /* Skip non-existing derived tables/views result tables */
+   if (table() &&
+@@ -3442,6 +3510,32 @@ void QEP_shared_owner::qs_cleanup() {
+   delete quick();
+ }
+ 
++void QEP_shared_owner::qs_cleanup(bool full) {
++  /* Skip non-existing derived tables/views result tables */
++  if (table() &&
++      (table()->s->tmp_table != INTERNAL_TMP_TABLE || table()->is_created())) {
++    table()->set_keyread(false);
++    table()->file->ha_index_or_rnd_end();
++    free_io_cache(table());
++    filesort_free_buffers(table(), true);
++    TABLE_LIST *const table_ref = table()->pos_in_table_list;
++    if (table_ref) {
++      table_ref->derived_keys_ready = false;
++      table_ref->derived_key_list.clear();
++    }
++  }
++  if (full &&
++      (join()->query_block->qep_cache_state == Query_block::QEP_CACHE_READY)) {
++    QUICK_RANGE_SELECT *squick = dynamic_cast<QUICK_RANGE_SELECT *>(quick());
++    if (squick) {
++      squick->ranges.clear();
++    }
++
++    return;
++  }
++  delete quick();
++}
++
+ uint QEP_TAB::sjm_query_block_id() const {
+   assert(sj_is_materialize_strategy(get_sj_strategy()));
+   for (uint i = 0; i < join()->primary_tables; ++i) {
+@@ -3985,9 +4079,17 @@ bool JOIN::alloc_func_list() {
+   }
+ 
+   /* This must use calloc() as rollup_make_fields depends on this */
+-  sum_funcs =
+-      (Item_sum **)thd->mem_calloc(sizeof(Item_sum **) * (func_count + 1) +
+-                                   sizeof(Item_sum ***) * (group_parts + 1));
++  if (query_block->with_sum_func &&
++      query_block->qep_cache_state == Query_block::QEP_CACHE_INITIALIZED) {
++    Prepared_stmt_arena_holder ps_arena_holder(thd);
++    sum_funcs =
++        (Item_sum **)thd->mem_calloc(sizeof(Item_sum **) * (func_count + 1) +
++                                     sizeof(Item_sum ***) * (group_parts + 1));
++  } else {
++    sum_funcs =
++        (Item_sum **)thd->mem_calloc(sizeof(Item_sum **) * (func_count + 1) +
++                                     sizeof(Item_sum ***) * (group_parts + 1));
++  }
+   return sum_funcs == nullptr;
+ }
+ 
+diff --git a/sql/sql_union.cc b/sql/sql_union.cc
+index 76977f95a..481f2f34b 100644
+--- 
a/sql/sql_union.cc
++++ b/sql/sql_union.cc
+@@ -72,6 +72,7 @@
+ #include "sql/opt_explain.h"  // explain_no_table
+ #include "sql/opt_explain_format.h"
+ #include "sql/opt_trace.h"
++#include "sql/opt_range.h"
+ #include "sql/parse_tree_node_base.h"
+ #include "sql/parse_tree_nodes.h"  // PT_with_clause
+ #include "sql/pfs_batch_mode.h"
+@@ -991,6 +992,14 @@ void Query_expression::create_access_paths(THD *thd) {
+   }
+ }
+ 
++bool Query_expression::create_root_iterator(THD *thd)
++{
++  JOIN *join = first_query_block()->join;
++  m_root_iterator = CreateIteratorFromAccessPath(
++      thd, m_root_access_path, join, /*eligible_for_batch_mode=*/true);
++  return false;
++}
++
+ /**
+   Explain query starting from this unit.
+ 
+@@ -1526,13 +1535,21 @@ static void destroy_materialized(TABLE_LIST *list) {
+ 
+ void Query_block::cleanup(THD *thd, bool full) {
+   if (join) {
+-    if (full) {
+-      assert(join->query_block == this);
+-      join->destroy();
+-      ::destroy(join);
+-      join = nullptr;
+-    } else
+-      join->cleanup();
++    if (qep_cache_state == Query_block::QEP_CACHE_READY) {
++      if (full && leaf_tables->table) {
++        assert(join->query_block == this);
++        join->qep_tab->qs_cleanup(true);
++      }
++    } else {
++      if (full) {
++        assert(join->query_block == this);
++        join->destroy();
++        ::destroy(join);
++        join = nullptr;
++      } else {
++        join->cleanup();
++      }
++    }
+   }
+ 
+   if (full) {
+diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
+index 3b8473bd1..6af8be88d 100644
+--- a/sql/sys_vars.cc
++++ b/sql/sys_vars.cc
+@@ -7015,6 +7015,13 @@ static Sys_var_charptr Sys_protocol_compression_algorithms(
+     NO_MUTEX_GUARD, NOT_IN_BINLOG,
+     ON_CHECK(check_set_protocol_compression_algorithms), ON_UPDATE(nullptr));
+ 
++static Sys_var_bool Sys_plan_cache("plan_cache",
++                                   "Sys_plan_cache "
++                                   "when first execute prepare stmt "
++                                   "the remove product execute plan time ",
++                                   GLOBAL_VAR(opt_plan_cache),
++                                   CMD_LINE(OPT_ARG), DEFAULT(true));
++
+ static bool check_set_require_row_format(sys_var *, THD *thd, set_var *var) {
+   /*
+     Should own SUPER or SYSTEM_VARIABLES_ADMIN or SESSION_VARIABLES_ADMIN
+diff --git a/storage/innobase/ut/crc32.cc b/storage/innobase/ut/crc32.cc
+index 77a7aa6d6..70ded44e1 100644
+--- a/storage/innobase/ut/crc32.cc
++++ b/storage/innobase/ut/crc32.cc
+@@ -99,6 +99,9 @@ external tools. */
+ #define gnuc64
+ #endif
+ 
++#if defined(__aarch64__)
++#include <arm_acle.h>
++#else
+ #if defined(gnuc64) || defined(_WIN32)
+ /*
+   GCC 4.8 can't include intrinsic headers without -msse4.2.
+@@ -121,6 +124,7 @@ ALWAYS_INLINE uint64_t _mm_crc32_u64(uint64_t __C, uint64_t __V) {
+ }
+ #endif
+ #endif  // defined(gnuc64) || defined(_WIN32)
++#endif  // defined(__aarch64__)
+ 
+ #include "univ.i"
+ #include "ut0crc32.h"
+@@ -155,7 +159,7 @@ bool ut_crc32_cpu_enabled = false;
+ #if defined(_WIN32)
+ #include <intrin.h>
+ #endif
+-#if defined(gnuc64) || defined(_WIN32)
++#if defined(gnuc64) || defined(_WIN32) || defined(__aarch64__)
+ /** Checks whether the CPU has the CRC32 instructions (part of the SSE4.2
+ instruction set.
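++On aarch64 builds the ARMv8 CRC32 extension is assumed to be present,
++so this probe simply returns true (see below).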
+diff --git a/storage/innobase/ut/crc32.cc b/storage/innobase/ut/crc32.cc
+index 77a7aa6d6..70ded44e1 100644
+--- a/storage/innobase/ut/crc32.cc
++++ b/storage/innobase/ut/crc32.cc
+@@ -99,6 +99,9 @@ external tools. */
+ #define gnuc64
+ #endif
+ 
++#if defined(__aarch64__)
++#include <arm_acle.h>
++#else
+ #if defined(gnuc64) || defined(_WIN32)
+ /*
+   GCC 4.8 can't include intrinsic headers without -msse4.2.
+@@ -121,6 +124,7 @@ ALWAYS_INLINE uint64_t _mm_crc32_u64(uint64_t __C, uint64_t __V) {
+ }
+ #endif
+ #endif  // defined(gnuc64) || defined(_WIN32)
++#endif  // defined(__aarch64__)
+ 
+ #include "univ.i"
+ #include "ut0crc32.h"
+@@ -155,7 +159,7 @@ bool ut_crc32_cpu_enabled = false;
+ #if defined(_WIN32)
+ #include <intrin.h>
+ #endif
+-#if defined(gnuc64) || defined(_WIN32)
++#if defined(gnuc64) || defined(_WIN32) || defined(__aarch64__)
+ /** Checks whether the CPU has the CRC32 instructions (part of the SSE4.2
+ instruction set).
+ @return true if CRC32 is available */
+@@ -180,6 +184,9 @@ static bool ut_crc32_check_cpu() {
+   return false;
+ #else
+ 
++#if defined(__aarch64__)
++  return true;
++#else
+   uint32_t features_ecx;
+ 
+ #if defined(gnuc64)
+@@ -201,6 +208,7 @@ static bool ut_crc32_check_cpu() {
+ #endif
+ 
+   return features_ecx & (1 << 20);  // SSE4.2
++#endif /* __aarch64__ */
+ #endif /* UNIV_DEBUG_VALGRIND */
+ }
+ 
+@@ -210,9 +218,17 @@ when the function ends it will contain the new checksum
+ @param[in,out]  data  data to be checksummed, the pointer will be advanced
+ with 1 byte
+ @param[in,out]  len  remaining bytes, it will be decremented with 1 */
++#if defined(__aarch64__)
++MY_ATTRIBUTE((target("arch=armv8-a+crc")))
++#else
+ MY_ATTRIBUTE((target("sse4.2")))
++#endif
+ inline void ut_crc32_8_hw(uint64_t *crc, const byte **data, ulint *len) {
++#if defined(__aarch64__)
++  *crc = __crc32cb(static_cast<uint32_t>(*crc), (*data)[0]);
++#else
+   *crc = _mm_crc32_u8(static_cast<uint32_t>(*crc), (*data)[0]);
++#endif
+   (*data)++;
+   (*len)--;
+ }
+@@ -221,10 +237,18 @@ inline void ut_crc32_8_hw(uint64_t *crc, const byte **data, ulint *len) {
+ @param[in]  crc  crc32 checksum so far
+ @param[in]  data  data to be checksummed
+ @return resulting checksum of crc + crc(data) */
++#if defined(__aarch64__)
++MY_ATTRIBUTE((target("arch=armv8-a+crc")))
++#else
+ MY_ATTRIBUTE((target("sse4.2")))
++#endif
+ inline uint64_t ut_crc32_64_low_hw(uint64_t crc, uint64_t data) {
+   uint64_t crc_64bit = crc;
++#if defined(__aarch64__)
++  crc_64bit = __crc32cd(crc_64bit, data);
++#else
+   crc_64bit = _mm_crc32_u64(crc_64bit, data);
++#endif
+   return (crc_64bit);
+ }
+ 
+@@ -234,7 +258,11 @@ when the function ends it will contain the new checksum
+ @param[in,out]  data  data to be checksummed, the pointer will be advanced
+ with 8 bytes
+ @param[in,out]  len  remaining bytes, it will be decremented with 8 */
++#if defined(__aarch64__)
++MY_ATTRIBUTE((target("arch=armv8-a+crc")))
++#else
+ MY_ATTRIBUTE((target("sse4.2")))
++#endif
+ inline void ut_crc32_64_hw(uint64_t *crc, const byte **data, ulint *len) {
+   uint64_t data_int = *reinterpret_cast<const uint64_t *>(*data);
+ 
+@@ -284,7 +312,11 @@ inline void ut_crc32_64_legacy_big_endian_hw(uint64_t *crc, const byte **data,
+ @param[in]  buf  data over which to calculate CRC32
+ @param[in]  len  data length
+ @return CRC-32C (polynomial 0x11EDC6F41) */
++#if defined(__aarch64__)
++MY_ATTRIBUTE((target("arch=armv8-a+crc")))
++#else
+ MY_ATTRIBUTE((target("sse4.2")))
++#endif
+ static uint32_t ut_crc32_hw(const byte *buf, ulint len) {
+   uint64_t crc = 0xFFFFFFFFU;
+ 
+@@ -660,7 +692,7 @@ static uint32_t ut_crc32_byte_by_byte_sw(const byte *buf, ulint len) {
+ /** Initializes the data structures used by ut_crc32*(). Does not do any
+ allocations, would not hurt if called twice, but would be pointless. */
+ void ut_crc32_init() {
+-#if defined(gnuc64) || defined(_WIN32)
++#if defined(gnuc64) || defined(_WIN32) || defined(__aarch64__)
+   ut_crc32_cpu_enabled = ut_crc32_check_cpu();
+ 
+   if (ut_crc32_cpu_enabled) {
+@@ -668,7 +700,7 @@ void ut_crc32_init() {
+     ut_crc32_legacy_big_endian = ut_crc32_legacy_big_endian_hw;
+     ut_crc32_byte_by_byte = ut_crc32_byte_by_byte_hw;
+   }
+-#endif /* defined(gnuc64) || defined(_WIN32) */
++#endif /* defined(gnuc64) || defined(_WIN32) || defined(__aarch64__) */
+ 
+   if (!ut_crc32_cpu_enabled) {
+     ut_crc32_slice8_table_init();
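The pattern the crc32.cc hunks extend — pick the hardware CRC-32C instruction per architecture, fall back to a table-driven routine otherwise — can be exercised in isolation. The sketch below uses the same intrinsics the patch uses (__crc32cd/__crc32cb on aarch64, _mm_crc32_u64/_mm_crc32_u8 on x86-64 with SSE4.2); the function name crc32c_hw and the byte-tail handling are mine, not the patch's, and the x86 path assumes a 64-bit target.

#include <cstddef>
#include <cstdint>

#if defined(__aarch64__)
#include <arm_acle.h>
__attribute__((target("arch=armv8-a+crc")))
#else
#include <nmmintrin.h>
__attribute__((target("sse4.2")))
#endif
static uint32_t crc32c_hw(const uint8_t *buf, size_t len) {
  uint64_t crc = 0xFFFFFFFFU;  // same initial value as ut_crc32_hw
  while (len >= 8) {
    uint64_t chunk;
    __builtin_memcpy(&chunk, buf, 8);  // avoids unaligned-load UB
#if defined(__aarch64__)
    crc = __crc32cd(static_cast<uint32_t>(crc), chunk);
#else
    crc = _mm_crc32_u64(crc, chunk);
#endif
    buf += 8;
    len -= 8;
  }
  while (len > 0) {  // trailing bytes, one at a time
#if defined(__aarch64__)
    crc = __crc32cb(static_cast<uint32_t>(crc), *buf);
#else
    crc = _mm_crc32_u8(static_cast<uint32_t>(crc), *buf);
#endif
    buf++;
    len--;
  }
  return static_cast<uint32_t>(~crc);  // final bit inversion
}

As in ut_crc32_init(), a real integration would probe CPU support once at startup and install either this routine or the slice-by-8 software fallback behind a function pointer.
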
+diff --git a/storage/perfschema/pfs_variable.cc b/storage/perfschema/pfs_variable.cc
+index d33b1a488..15a9f41e8 100644
+--- a/storage/perfschema/pfs_variable.cc
++++ b/storage/perfschema/pfs_variable.cc
+@@ -1157,6 +1157,17 @@ int PFS_status_variable_cache::do_materialize_global(void) {
+                   false, /* threads */
+                   true,  /* THDs */
+                   &visitor);
++
++  /*
++    Because of the reason described in
++    PFS_status_variable_cache::do_materialize_all(THD *unsafe_thd),
++    PFS_status_variable_cache::do_materialize_session(THD *unsafe_thd) and
++    PFS_status_variable_cache::do_materialize_session(PFS_thread *pfs_thread),
++    count_num_thread_running() cannot be put together with
++    get_num_thread_running(), so count_num_thread_running() is called here.
++  */
++  Global_THD_manager::get_instance()->count_num_thread_running();
++
+   /*
+     Build the status variable cache using the SHOW_VAR array as a reference.
+     Use the status totals collected from all threads.
+@@ -1200,6 +1211,22 @@ int PFS_status_variable_cache::do_materialize_all(THD *unsafe_thd) {
+     init_show_var_array(OPT_SESSION, false);
+   }
+ 
++  /*
++    count_num_thread_running() counts the total number of running threads
++    in the global thread list, taking LOCK_thd_list to protect the sharded
++    list. In lock_order_dependencies.txt, the documented order is that
++    LOCK_thd_list must be locked before LOCK_thd_data. In this function,
++    LOCK_thd_data is already locked by get_THD(); manifest() then calls
++    get_num_thread_running(). If get_num_thread_running() both counted and
++    returned the number, the lock order would be inverted, which may lead
++    to a deadlock. To prevent this, get_num_thread_running() is split into
++    two parts: one is still called get_num_thread_running() and only
++    returns the cached count; the other, count_num_thread_running(), does
++    the counting and must be called before get_THD() and
++    get_num_thread_running(). That is why it is called here.
++  */
++  Global_THD_manager::get_instance()->count_num_thread_running();
++
+   /* Get and lock a validated THD from the thread manager. */
+   if ((m_safe_thd = get_THD(unsafe_thd)) != nullptr) {
+     /*
+@@ -1249,6 +1276,22 @@ int PFS_status_variable_cache::do_materialize_session(THD *unsafe_thd) {
+     init_show_var_array(OPT_SESSION, true);
+   }
+ 
++  /*
++    count_num_thread_running() counts the total number of running threads
++    in the global thread list, taking LOCK_thd_list to protect the sharded
++    list. In lock_order_dependencies.txt, the documented order is that
++    LOCK_thd_list must be locked before LOCK_thd_data. In this function,
++    LOCK_thd_data is already locked by get_THD(); manifest() then calls
++    get_num_thread_running(). If get_num_thread_running() both counted and
++    returned the number, the lock order would be inverted, which may lead
++    to a deadlock. To prevent this, get_num_thread_running() is split into
++    two parts: one is still called get_num_thread_running() and only
++    returns the cached count; the other, count_num_thread_running(), does
++    the counting and must be called before get_THD() and
++    get_num_thread_running(). That is why it is called here.
++  */
++  Global_THD_manager::get_instance()->count_num_thread_running();
++
+   /* Get and lock a validated THD from the thread manager. */
+   if ((m_safe_thd = get_THD(unsafe_thd)) != nullptr) {
+     /*
+@@ -1292,6 +1335,22 @@ int PFS_status_variable_cache::do_materialize_session(PFS_thread *pfs_thread) {
+   /* The SHOW_VAR array must be initialized externally. */
+   assert(m_initialized);
+ 
++  /*
++    count_num_thread_running() counts the total number of running threads
++    in the global thread list, taking LOCK_thd_list to protect the sharded
++    list. In lock_order_dependencies.txt, the documented order is that
++    LOCK_thd_list must be locked before LOCK_thd_data. In this function,
++    LOCK_thd_data is already locked by get_THD(); manifest() then calls
++    get_num_thread_running(). If get_num_thread_running() both counted and
++    returned the number, the lock order would be inverted, which may lead
++    to a deadlock. To prevent this, get_num_thread_running() is split into
++    two parts: one is still called get_num_thread_running() and only
++    returns the cached count; the other, count_num_thread_running(), does
++    the counting and must be called before get_THD() and
++    get_num_thread_running(). That is why it is called here.
++  */
++  Global_THD_manager::get_instance()->count_num_thread_running();
++
+   /* Get and lock a validated THD from the thread manager. */
+   if ((m_safe_thd = get_THD(pfs_thread)) != nullptr) {
+     /*
+@@ -1343,6 +1402,16 @@ int PFS_status_variable_cache::do_materialize_client(PFS_client *pfs_client) {
+   */
+   m_sum_client_status(pfs_client, &status_totals);
+ 
++  /*
++    Because of the reason described in
++    PFS_status_variable_cache::do_materialize_all(THD *unsafe_thd),
++    PFS_status_variable_cache::do_materialize_session(THD *unsafe_thd) and
++    PFS_status_variable_cache::do_materialize_session(PFS_thread *pfs_thread),
++    count_num_thread_running() cannot be put together with
++    get_num_thread_running(), so count_num_thread_running() is called here.
++  */
++  Global_THD_manager::get_instance()->count_num_thread_running();
++
+   /*
+     Build the status variable cache using the SHOW_VAR array as a reference and
+     the status totals collected from threads associated with this client.
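The comments above describe a classic lock-ordering fix: do the counting under LOCK_thd_list before any LOCK_thd_data is taken, cache the result, and serve later reads lock-free. A minimal sketch of that two-phase split follows; the class and member names are illustrative stand-ins, not the server's.

#include <atomic>
#include <mutex>
#include <vector>

// Illustrative stand-in for Global_THD_manager. The invariant respected:
// list_mutex (cf. LOCK_thd_list) is never acquired while a per-thread
// mutex (cf. LOCK_thd_data) is already held.
class ThreadManagerSketch {
 public:
  // Phase 1: called while no per-thread mutex is held. Takes the list
  // mutex, walks the list, and caches the result in an atomic.
  void count_num_thread_running() {
    std::lock_guard<std::mutex> guard(list_mutex_);
    int running = 0;
    for (bool is_running : thread_states_) {
      if (is_running) ++running;
    }
    cached_running_.store(running, std::memory_order_relaxed);
  }

  // Phase 2: safe to call even while a per-thread mutex is held, because
  // it only reads the cached atomic and takes no locks at all.
  int get_num_thread_running() const {
    return cached_running_.load(std::memory_order_relaxed);
  }

 private:
  std::mutex list_mutex_;            // plays the role of LOCK_thd_list
  std::vector<bool> thread_states_;  // toy model of the global thread list
  std::atomic<int> cached_running_{0};
};

The cost of the split is a slightly stale count, which is acceptable for a status variable; the benefit is that no call path can invert the documented lock order.
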
+diff --git a/unittest/gunit/thd_manager-t.cc b/unittest/gunit/thd_manager-t.cc
+index 1e2efa420..4c0aa1bdb 100644
+--- a/unittest/gunit/thd_manager-t.cc
++++ b/unittest/gunit/thd_manager-t.cc
+@@ -82,14 +82,6 @@ TEST_F(ThreadManagerTest, AddRemoveTHDWithGuard) {
+   EXPECT_EQ(0U, thd_manager->get_thd_count());
+ }
+ 
+-TEST_F(ThreadManagerTest, IncDecThreadRunning) {
+-  EXPECT_EQ(0, thd_manager->get_num_thread_running());
+-  thd_manager->inc_thread_running();
+-  EXPECT_EQ(1, thd_manager->get_num_thread_running());
+-  thd_manager->dec_thread_running();
+-  EXPECT_EQ(0, thd_manager->get_num_thread_running());
+-}
+-
+ TEST_F(ThreadManagerTest, IncThreadCreated) {
+   EXPECT_EQ(0U, thd_manager->get_num_thread_created());
+   thd_manager->inc_thread_created();
+-- 
+2.31.1.windows.1