diff --git a/plugin/thread_pool/threadpool_unix.cc b/plugin/thread_pool/threadpool_unix.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d0e2cad7572b442887c26478311e6851b87d277c
--- /dev/null
+++ b/plugin/thread_pool/threadpool_unix.cc
@@ -0,0 +1,500 @@
+/* Copyright (C) 2012 Monty Program Ab
+   Copyright (C) 2022 Huawei Technologies Co., Ltd
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/* Must be defined before any server header is included, otherwise the
+   server-only sections of those headers are not visible. */
+#define MYSQL_SERVER 1
+
+#include "threadpool_unix.h"
+#include "sql/debug_sync.h"
+#include "sql/log.h"
+#include "sql/protocol_classic.h"
+#include "my_sys.h"
+#include "my_systime.h"
+#include "mysql/thread_pool_priv.h"  // thd_is_transaction_active()
+#include "mysql/plugin.h"
+#include "threadpool.h"
+
+#include <atomic>   /* std::atomic */
+#include <climits>  /* INT_MAX */
+#include <cmath>    /* std::ceil */
+#include <mutex>    /* std::mutex, std::lock_guard */
+#include <set>      /* std::set */
+
+/** Maximum number of native events a listener can read in one go */
+#define MAX_EVENTS 1024
+
+/** Define if wait_begin() should create threads if necessary without waiting
+for stall detection to kick in */
+#define THREADPOOL_CREATE_THREADS_ON_WAIT
+
+/** Indicates that the threadpool was initialized */
+static bool threadpool_started = false;
+
+thread_pool_rwlock_t change_group_rwlock;
+
+/*
+  Define PSI keys for the performance schema.
+  We have a mutex per group, worker threads, a condition per worker thread,
+  and a timer thread with its own mutex and condition.
+*/
+
+#ifdef HAVE_PSI_INTERFACE
+static PSI_mutex_key key_group_mutex;
+static PSI_mutex_key key_timer_mutex;
+static PSI_mutex_info mutex_list[] = {
+    {&key_group_mutex, "group_mutex", 0, 0, PSI_DOCUMENT_ME},
+    {&key_timer_mutex, "timer_mutex", PSI_FLAG_SINGLETON, 0, PSI_DOCUMENT_ME}};
+
+static PSI_cond_key key_worker_cond;
+static PSI_cond_key key_timer_cond;
+static PSI_cond_info cond_list[] = {
+    {&key_worker_cond, "worker_cond", 0, 0, PSI_DOCUMENT_ME},
+    {&key_timer_cond, "timer_cond", PSI_FLAG_SINGLETON, 0, PSI_DOCUMENT_ME}};
+
+static PSI_thread_key key_worker_thread;
+static PSI_thread_key key_timer_thread;
+static PSI_thread_info thread_list[] = {
+    {&key_worker_thread, "worker_thread", "thread_pool_worker", 0, 0,
+     PSI_DOCUMENT_ME},
+    {&key_timer_thread, "timer_thread", "thread_pool_timer",
+     PSI_FLAG_SINGLETON, 0, PSI_DOCUMENT_ME}};
+#endif  // HAVE_PSI_INTERFACE
+
+thread_group_t all_groups[MAX_THREAD_GROUPS];
+numa_affinity_manager group_affinity;
+
+static uint group_count;
+
+/**
+  Used for printing the "pool blocked" message, see
+  print_pool_blocked_message();
+*/
+static ulonglong pool_block_start;
+
+/* Global timer for all groups */
+struct pool_timer_t {
+  mysql_mutex_t mutex;
+  mysql_cond_t cond;
+  std::atomic<uint64_t> current_microtime;
+  std::atomic<uint64_t> next_timeout_check;
+  int tick_interval;
+  bool shutdown;
+};
+
+static pool_timer_t pool_timer;
+
+static void queue_put(thread_group_t *thread_group, connection_t *connection);
+static int wake_thread(thread_group_t *thread_group,
+                       bool due_to_stall) noexcept;
+static void handle_event(connection_t *connection);
+static int wake_or_create_thread(thread_group_t *thread_group,
+                                 bool due_to_stall = false);
+static int create_worker(thread_group_t *thread_group,
+                         bool due_to_stall) noexcept;
+static void *admin_port_worker_main(void *param);
+static void *worker_main(void *param);
+static void *connection_detach_worker(void *param);
+static void check_stall(thread_group_t *thread_group);
+static void connection_abort(connection_t *connection);
+static void set_next_timeout_check(ulonglong abstime);
+static void print_pool_blocked_message(bool) noexcept;
+
+THD *thd_to_detach = nullptr;
+
+class ThreadPoolConnSet {
+ public:
+  ThreadPoolConnSet() = default;
+  virtual ~ThreadPoolConnSet() = default;
+
+  bool empty() {
+    std::lock_guard<std::mutex> guard(mtx);
+    return conns.empty();
+  }
+
+  /* Mark every registered connection as killed. The current thread's own
+     THD cannot be killed from here, so it is remembered for a later
+     detach. */
+  void killConns() {
+    std::lock_guard<std::mutex> guard(mtx);
+    for (auto &it : conns) {
+      THD *thd = it->thd;
+      if (current_thd != thd && thd->killed != THD::KILL_CONNECTION) {
+        mysql_mutex_lock(&thd->LOCK_thd_data);
+        thd->killed = THD::KILL_CONNECTION;
+        tp_post_kill_notification(thd);
+        mysql_mutex_unlock(&thd->LOCK_thd_data);
+      } else if (current_thd == thd) {
+        thd_to_detach = thd;
+      }
+    }
+  }
+
+  void insert(connection_t *c) {
+    std::lock_guard<std::mutex> guard(mtx);
+    conns.insert(c);
+  }
+
+  void erase(connection_t *c) {
+    std::lock_guard<std::mutex> guard(mtx);
+    conns.erase(c);
+  }
+
+ public:
+  std::set<connection_t *> conns;
+  std::mutex mtx;
+};
+
+ThreadPoolConnSet threadpool_thds;
+
+int vio_cancel(Vio *vio, int how)
+{
+  int r= 0;
+  DBUG_ENTER("vio_cancel");
+
+  if (vio->inactive == false)
+  {
+    assert(vio->type == VIO_TYPE_TCPIP ||
+           vio->type == VIO_TYPE_SOCKET ||
+           vio->type == VIO_TYPE_SSL);
+
+    assert(mysql_socket_getfd(vio->mysql_socket) >= 0);
+    if (mysql_socket_shutdown(vio->mysql_socket, how))
+      r= -1;
+  }
+
+  DBUG_RETURN(r);
+}
+
+/**
+  Asynchronous network IO.
+
+  We use the native edge-triggered network IO multiplexing facility.
+  This maps to different APIs on different Unixes.
+
+  Currently supported are Linux with epoll, and OS X/FreeBSD with kevent
+  (the header also defines a Solaris event-port native_event, but no
+  implementation is provided below). All these APIs are used with
+  one-shot flags: the event is signalled once the client has written
+  something into the socket, then the socket is removed from the
+  "poll-set" until the command is finished, and we need to
+  re-arm/re-register the socket.
+
+  No implementation for poll/select/AIO is currently provided.
+
+  The API closely resembles all of the above mentioned platform APIs
+  and consists of the following functions.
+
+  - io_poll_create()
+    Creates an io_poll descriptor.
+    On Linux: epoll_create()
+
+  - io_poll_associate_fd(int poll_fd, int fd, void *data)
+    Associate a file descriptor with the io poll descriptor.
+    On Linux: epoll_ctl(..EPOLL_CTL_ADD)
+
+  - io_poll_disassociate_fd(int pollfd, int fd)
+    Remove a file descriptor from the io poll descriptor.
+    On Linux: epoll_ctl(..EPOLL_CTL_DEL)
+
+  - io_poll_start_read(int poll_fd, int fd, void *data)
+    The same as io_poll_associate_fd(), but must not be called before
+    io_poll_associate_fd() has been called for that descriptor.
+    On Linux: epoll_ctl(..EPOLL_CTL_MOD)
+
+  - io_poll_wait(int pollfd, native_event *native_events, int maxevents,
+                 int timeout_ms)
+    Wait until one or more descriptors added with io_poll_associate_fd()
+    or io_poll_start_read() become readable. Data associated with the
+    descriptors can be retrieved from the native_events array, using the
+    native_event_get_userdata() function.
+    On Linux: epoll_wait()
+*/
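+
+/*
+  Illustrative sketch only (not part of this patch's control flow): the
+  one-shot calling pattern a listener is expected to follow. Here "fd" is
+  the connection's socket descriptor and process() is a hypothetical
+  stand-in for the real event handling (cf. handle_event(), declared
+  above); error handling is omitted.
+
+    int pollfd = io_poll_create();
+    io_poll_associate_fd(pollfd, fd, connection);  // register + arm once
+    for (;;) {
+      native_event events[MAX_EVENTS];
+      const int cnt = io_poll_wait(pollfd, events, MAX_EVENTS, -1);
+      for (int i = 0; i < cnt; i++) {
+        connection_t *c = static_cast<connection_t *>(
+            native_event_get_userdata(&events[i]));
+        process(c);                         // the one-shot event fired,
+        io_poll_start_read(pollfd, fd, c);  // so re-arm before next wait
+      }
+    }
+*/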
+
+#if defined(__linux__)
+#ifndef EPOLLRDHUP
+/* Early 2.6 kernels did not have EPOLLRDHUP */
+#define EPOLLRDHUP 0
+#endif
+static int io_poll_create() noexcept { return epoll_create(1); }
+
+static int io_poll_associate_fd(int pollfd, int fd, void *data) noexcept {
+  struct epoll_event ev;
+  ev.data.u64 = 0; /* Keep valgrind happy */
+  ev.data.ptr = data;
+  ev.events = EPOLLIN | EPOLLET | EPOLLERR | EPOLLRDHUP | EPOLLONESHOT;
+  return epoll_ctl(pollfd, EPOLL_CTL_ADD, fd, &ev);
+}
+
+static int io_poll_start_read(int pollfd, int fd, void *data) noexcept {
+  struct epoll_event ev;
+  ev.data.u64 = 0; /* Keep valgrind happy */
+  ev.data.ptr = data;
+  ev.events = EPOLLIN | EPOLLET | EPOLLERR | EPOLLRDHUP | EPOLLONESHOT;
+  return epoll_ctl(pollfd, EPOLL_CTL_MOD, fd, &ev);
+}
+
+static int io_poll_disassociate_fd(int pollfd, int fd) noexcept {
+  /* The event argument is ignored by EPOLL_CTL_DEL, but kernels before
+     2.6.9 still require it to be non-NULL. */
+  struct epoll_event ev;
+  return epoll_ctl(pollfd, EPOLL_CTL_DEL, fd, &ev);
+}
+
+/*
+  Wrapper around epoll_wait.
+  NOTE - in case of EINTR, it restarts with the original timeout. Since we
+  use either infinite or 0 timeouts, this is not critical.
+*/
+static int io_poll_wait(int pollfd, native_event *native_events, int maxevents,
+                        int timeout_ms) noexcept {
+  int ret;
+  do {
+    ret = epoll_wait(pollfd, native_events, maxevents, timeout_ms);
+  } while (ret == -1 && errno == EINTR);
+  return ret;
+}
+
+static void *native_event_get_userdata(native_event *event) noexcept {
+  return event->data.ptr;
+}
+
+#elif defined(__FreeBSD__) || defined(__APPLE__)
+static int io_poll_create() noexcept { return kqueue(); }
+
+static int io_poll_start_read(int pollfd, int fd, void *data) noexcept {
+  struct kevent ke;
+  EV_SET(&ke, fd, EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, data);
+  return kevent(pollfd, &ke, 1, 0, 0, 0);
+}
+
+static int io_poll_associate_fd(int pollfd, int fd, void *data) noexcept {
+  /* EV_ADD | EV_ONESHOT both registers and arms the descriptor, so this
+     is the same operation as io_poll_start_read(). */
+  return io_poll_start_read(pollfd, fd, data);
+}
+
+static int io_poll_disassociate_fd(int pollfd, int fd) noexcept {
+  struct kevent ke;
+  EV_SET(&ke, fd, EVFILT_READ, EV_DELETE, 0, 0, nullptr);
+  return kevent(pollfd, &ke, 1, 0, 0, 0);
+}
+
+static int io_poll_wait(int pollfd, struct kevent *events, int maxevents,
+                        int timeout_ms) noexcept {
+  struct timespec ts;
+  int ret;
+  if (timeout_ms >= 0) {
+    ts.tv_sec = timeout_ms / 1000;
+    ts.tv_nsec = (timeout_ms % 1000) * 1000000;
+  }
+  do {
+    ret = kevent(pollfd, 0, 0, events, maxevents,
+                 (timeout_ms >= 0) ? &ts : nullptr);
+  } while (ret == -1 && errno == EINTR);
+  return ret;
+}
+
+static void *native_event_get_userdata(native_event *event) noexcept {
+  return event->udata;
+}
+#else
+#error not ported yet to this OS
+#endif
+
+namespace {
+
+/*
+  Prevent too many active threads executing at the same time, if the
+  workload is not CPU bound.
+*/
+inline bool too_many_active_threads(
+    const thread_group_t &thread_group) noexcept {
+  return (thread_group.active_thread_count >=
+              1 + (int)threadpool_oversubscribe &&
+          !thread_group.stalled);
+}
+
+/*
+  Limit the number of 'busy' threads to 1 + threadpool_toobusy. A thread
+  is busy if it is in either the active or the waiting state (i.e. between
+  thd_wait_begin() / thd_wait_end() calls).
+*/
+inline bool too_many_busy_threads(
+    const thread_group_t &thread_group) noexcept {
+  return (thread_group.active_thread_count +
+              thread_group.waiting_thread_count >
+          1 + (int)threadpool_toobusy);
+}
+
+inline bool too_many_connection(const thread_group_t &thread_group) noexcept {
+  return (thread_group.connection_count > (int)threadpool_toobusy - 1);
+}
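+
+/*
+  Worked example (illustrative values): with threadpool_oversubscribe = 3
+  and threadpool_toobusy = 8, a non-stalled group is considered too active
+  once 4 threads (1 + oversubscribe) are already executing, too busy once
+  active + waiting threads exceed 9 (1 + toobusy), and too connected once
+  it holds 8 or more connections (> toobusy - 1). A stalled group bypasses
+  the active-thread limit so that the listener can still force progress.
+*/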
+
+/*
+  Check if a given connection is eligible to enter the high priority queue
+  based on its current thread_pool_high_prio_mode value, the number of high
+  priority tickets it has left, its transactional state, and whether any
+  locks are held.
+*/
+inline bool connection_is_high_prio(const connection_t &c) noexcept {
+  const ulong mode = tp_get_thdvar_high_prio_mode(c.thd);
+
+  return (mode == TP_HIGH_PRIO_MODE_STATEMENTS) ||
+         (mode == TP_HIGH_PRIO_MODE_TRANSACTIONS && c.tickets > 0 &&
+          (thd_is_transaction_active(c.thd) ||
+           c.thd->variables.option_bits & OPTION_TABLE_LOCK ||
+           c.thd->locked_tables_mode != LTM_NONE ||
+           c.thd->mdl_context.has_locks() ||
+           c.thd->global_read_lock.is_acquired() ||
+           c.thd->mdl_context.has_locks(MDL_key::USER_LEVEL_LOCK) ||
+           c.thd->mdl_context.has_locks(MDL_key::LOCKING_SERVICE)));
+}
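+
+/*
+  Example (illustrative): with thread_pool_high_prio_mode = transactions,
+  a connection that has run BEGIN; UPDATE ... and still has high priority
+  tickets left satisfies thd_is_transaction_active() and is therefore
+  eligible for high_prio_queue; once its tickets run out, or the
+  transaction ends and no locks remain, it goes back to the ordinary
+  queue.
+*/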
+
+int change_group(connection_t *c, thread_group_t *group,
+                 thread_group_t *to_group) {
+  assert(c->thread_group == group);
+
+  /* Remove connection from the old group. */
+  mysql_mutex_lock(&group->mutex);
+  if (c->bound_to_poll_descriptor) {
+    Vio *const vio = c->thd->get_protocol_classic()->get_vio();
+    const int fd = mysql_socket_getfd(vio->mysql_socket);
+    io_poll_disassociate_fd(group->pollfd, fd);
+    c->bound_to_poll_descriptor = false;
+  }
+  c->thread_group->connection_count--;
+  mysql_mutex_unlock(&group->mutex);
+
+  /* Add connection to the new group. */
+  mysql_mutex_lock(&to_group->mutex);
+  c->thread_group = to_group;
+  to_group->connection_count++;
+  /* Ensure that there is a listener in the new group. */
+  int ret = 0;
+  if (!to_group->thread_count) ret = create_worker(to_group, false);
+  mysql_mutex_unlock(&to_group->mutex);
+
+  return ret;
+}
+
+int get_avg_conn_cnt() {
+  int total_conn_cnt = 0;
+
+  for (uint i = 0; i < group_count; i++) {
+    total_conn_cnt += all_groups[i].connection_count;
+  }
+  /* Round up, so a single overloaded group always counts as above
+     average. */
+  return (int)std::ceil(total_conn_cnt / (double)group_count);
+}
+
+thread_group_t *get_change_group_to(connection_t *connection) {
+  int avg_conn_cnt = get_avg_conn_cnt();
+  thread_group_t *group = connection->thread_group;
+
+  /* The preferred destination is the connection's "home" group. */
+  thread_group_t *to_group =
+      &all_groups[(connection->thd->thread_id()) % group_count];
+  if (to_group->connection_count <= avg_conn_cnt ||
+      !threadpool_connection_balance) {
+    return to_group;
+  }
+
+  /*
+    Otherwise pick any group below the average. If the current group index
+    is out of range (thread_pool_size was shrunk), the last group is an
+    acceptable fallback.
+  */
+  for (uint i = 0; i < group_count; i++) {
+    if (group == &all_groups[i]) {
+      continue;
+    }
+    if (all_groups[i].connection_count < avg_conn_cnt ||
+        (connection->thread_group - all_groups >= group_count &&
+         i == group_count - 1)) {
+      return &all_groups[i];
+    }
+  }
+  return &all_groups[group_count - 1];
+}
+
+int get_min_conn_cnt() {
+  int min_conn_cnt = INT_MAX;
+  for (uint i = 0; i < group_count; i++) {
+    min_conn_cnt = all_groups[i].connection_count < min_conn_cnt
+                       ? all_groups[i].connection_count
+                       : min_conn_cnt;
+  }
+  return min_conn_cnt;
+}
+
+bool check_change_group_low(connection_t *connection) {
+  return connection->thread_group - all_groups >= group_count ||
+         (threadpool_connection_balance &&
+          (connection->thread_group->connection_count > get_avg_conn_cnt() ||
+           connection->thread_group->connection_count - get_min_conn_cnt() >=
+               2));
+}
+
+int change_group(connection_t *connection) {
+  int ret = -1;
+  change_group_rwlock.xlock();
+  if (check_change_group_low(connection)) {
+    thread_group_t *to_group = get_change_group_to(connection);
+    ret = change_group(connection, connection->thread_group, to_group);
+  }
+  change_group_rwlock.unxlock();
+  return ret;
+}
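+
+/*
+  Worked example (illustrative): with group_count = 4 and groups holding
+  10/2/2/2 connections, get_avg_conn_cnt() = ceil(16 / 4) = 4, so
+  check_change_group_low() flags connections in the first group (10 > 4).
+  get_change_group_to() first offers the thread_id() % group_count "home"
+  group, and otherwise any other group below the average.
+*/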
+
+/**
+  Check if the connection needs to migrate to a different group, either
+  because group_count changed after the thread_pool_size setting was
+  modified, or because connections are not evenly distributed across the
+  thread groups.
+*/
+bool check_change_group(connection_t *connection) {
+  change_group_rwlock.slock();
+  const bool ret = check_change_group_low(connection);
+  change_group_rwlock.unslock();
+  return ret;
+}
+
+inline bool connection_is_worker_continue(const connection_t &c) noexcept {
+  if (c.thd->is_admin_connection()) {
+    return true;
+  }
+
+  if (check_change_group(const_cast<connection_t *>(&c))) {
+    return false;
+  }
+
+  if (!too_many_connection(*(c.thread_group))) {
+    return true;
+  }
+
+  /* Mirrors the transaction and lock checks in connection_is_high_prio(). */
+  const ulong mode = tp_get_thdvar_high_prio_mode(c.thd);
+  return (mode == TP_HIGH_PRIO_MODE_TRANSACTIONS && c.tickets > 0 &&
+          (thd_is_transaction_active(c.thd) ||
+           c.thd->variables.option_bits & OPTION_TABLE_LOCK ||
+           c.thd->locked_tables_mode != LTM_NONE ||
+           c.thd->mdl_context.has_locks() ||
+           c.thd->global_read_lock.is_acquired() ||
+           c.thd->mdl_context.has_locks(MDL_key::USER_LEVEL_LOCK) ||
+           c.thd->mdl_context.has_locks(MDL_key::LOCKING_SERVICE)));
+}
+
+}  // namespace
+
+/* Dequeue an element from the work queue */
+static connection_t *queue_get(thread_group_t *thread_group) noexcept {
+  DBUG_ENTER("queue_get");
+  thread_group->queue_event_count++;
+  connection_t *c;
+
+  if ((c = thread_group->high_prio_queue.front())) {
+    thread_group->high_prio_queue.remove(c);
+  }
+  /*
+    Don't pick events from the low priority queue if there are too many
+    active + waiting threads.
+  */
+  else if (!too_many_busy_threads(*thread_group) &&
+           (c = thread_group->queue.front())) {
+    thread_group->queue.remove(c);
+  }
+  DBUG_RETURN(c);
+}
\ No newline at end of file
diff --git a/plugin/thread_pool/threadpool_unix.h b/plugin/thread_pool/threadpool_unix.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c561f2da75484170dd86029185be7e786973c93
--- /dev/null
+++ b/plugin/thread_pool/threadpool_unix.h
@@ -0,0 +1,135 @@
+/* Copyright (C) 2012 Monty Program Ab
+   Copyright (C) 2022 Huawei Technologies Co., Ltd
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+   USA */
+
+#ifndef THREADPOOL_UNIX_H_
+#define THREADPOOL_UNIX_H_
+
+#include "mysql/service_thd_wait.h"
+#include "sql/sql_plist.h"
+#include "sql/mysqld.h"
+#include "threadpool.h"
+#include "violite.h"
+#include "numa_affinity_manager.h"
+
+#ifdef __linux__
+#include <sys/epoll.h>
+typedef struct epoll_event native_event;
+#endif
+#if defined(__FreeBSD__) || defined(__APPLE__)
+#include <sys/event.h>
+typedef struct kevent native_event;
+#endif
+#if defined(__sun)
+#include <port.h>
+typedef port_event_t native_event;
+#endif
+
+#define my_microsecond_getsystime() (my_getsystime() / 10)
+
+struct thread_group_t;
+
+/* Per-thread structure for workers */
+struct worker_thread_t {
+  ulonglong event_count; /* number of requests handled by this thread */
+  thread_group_t *thread_group;
+  worker_thread_t *next_in_list;
+  worker_thread_t **prev_in_list;
+
+  mysql_cond_t cond;
+  bool woken;
+};
+
+typedef I_P_List<
+    worker_thread_t,
+    I_P_List_adapter<worker_thread_t, &worker_thread_t::next_in_list,
+                     &worker_thread_t::prev_in_list>>
+    worker_list_t;
+
+struct connection_t {
+  THD *thd;
+  thread_group_t *thread_group;
+  connection_t *next_in_queue;
+  connection_t **prev_in_queue;
+  ulonglong abs_wait_timeout;
+  ulonglong enqueue_time;
+  bool logged_in;
+  bool bound_to_poll_descriptor;
+  bool waiting;
+  uint tickets;
+};
+
+typedef I_P_List<
+    connection_t,
+    I_P_List_adapter<connection_t, &connection_t::next_in_queue,
+                     &connection_t::prev_in_queue>,
+    I_P_List_counter, I_P_List_fast_push_back<connection_t>>
+    connection_queue_t;
+
+const int NQUEUES = 2; /* We have high and low priority queues */
+
+enum operation_origin {
+  WORKER,
+  LISTENER
+};
+
+struct thread_group_counters_t {
+  ulonglong thread_creations;
+  ulonglong thread_creations_due_to_stall;
+  ulonglong wakes;
+  ulonglong wakes_due_to_stall;
+  ulonglong throttles;
+  ulonglong stalls;
+  ulonglong dequeues[2];
+  ulonglong polls[2];
+};
+
+struct alignas(128) thread_group_t {
+  mysql_mutex_t mutex;
+  connection_queue_t queue;
+  connection_queue_t high_prio_queue;
+  worker_list_t waiting_threads;
+  worker_thread_t *listener;
+  pthread_attr_t *pthread_attr;
+  int pollfd;
+  int thread_count;
+  int admin_port_thread_count;
+  int dump_thread_count;
+  int active_thread_count;
+  int connection_count;
+  int waiting_thread_count;
+  /* Stats for the deadlock detection timer routine. */
+  int io_event_count;
+  int queue_event_count;
+  ulonglong last_thread_creation_time;
+  int shutdown_pipe[2];
+  bool shutdown;
+  bool stalled;
+  thread_group_counters_t counters;
+  /* Pad the struct to exactly 512 bytes (see the static_assert below). */
+  char padding[320 - sizeof(thread_group_counters_t)];
+};
+
+static_assert(sizeof(thread_group_t) == 512,
+              "sizeof(thread_group_t) must be 512 to avoid false sharing");
+
+#define TP_INCREMENT_GROUP_COUNTER(group, var) \
+  do {                                         \
+    (group)->counters.var++;                   \
+  } while (0)
+
+extern thread_group_t all_groups[MAX_THREAD_GROUPS];
+extern numa_affinity_manager group_affinity;
+
+#endif  // THREADPOOL_UNIX_H_