diff --git a/aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch b/aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d27d763bb19d0b6b3323eb8f7647e5d89cb1df22
--- /dev/null
+++ b/aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch
@@ -0,0 +1,120 @@
+From a2bae876b7f694b12073bac8ad6668e4d975ad88 Mon Sep 17 00:00:00 2001
+From: Ying Fang
+Date: Fri, 10 Apr 2020 16:08:19 +0000
+Subject: [PATCH 1/2] aio-wait: delegate polling of main AioContext if BQL not
+ held
+
+Any thread that is not an iothread returns NULL for qemu_get_current_aio_context().
+As a result, it would also return true for
+in_aio_context_home_thread(qemu_get_aio_context()), causing
+AIO_WAIT_WHILE to invoke aio_poll() directly. This is incorrect
+if the BQL is not held, because aio_poll() does not expect to
+run concurrently from multiple threads, and it can actually
+happen when savevm writes to the vmstate file from the
+migration thread.
+
+Therefore, restrict in_aio_context_home_thread to return true
+for the main AioContext only if the BQL is held.
+
+The function is moved to aio-wait.h because it is mostly used
+there and to avoid a circular reference between main-loop.h
+and block/aio.h.
+ +upstream_url: https://patchwork.kernel.org/patch/11482099/ + +Signed-off-by: Paolo Bonzini +Message-Id: <20200407140746.8041-5-pbonzini@redhat.com> +Signed-off-by: Stefan Hajnoczi +--- + include/block/aio-wait.h | 22 ++++++++++++++++++++++ + include/block/aio.h | 29 ++++++++++------------------- + 2 files changed, 32 insertions(+), 19 deletions(-) + +diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h +index afd0ff7e..d349e7e3 100644 +--- a/include/block/aio-wait.h ++++ b/include/block/aio-wait.h +@@ -26,6 +26,7 @@ + #define QEMU_AIO_WAIT_H + + #include "block/aio.h" ++#include "qemu/main-loop.h" + + /** + * AioWait: +@@ -124,4 +125,25 @@ void aio_wait_kick(void); + */ + void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque); + ++/** ++ * in_aio_context_home_thread: ++ * @ctx: the aio context ++ * ++ * Return whether we are running in the thread that normally runs @ctx. Note ++ * that acquiring/releasing ctx does not affect the outcome, each AioContext ++ * still only has one home thread that is responsible for running it. ++ */ ++static inline bool in_aio_context_home_thread(AioContext *ctx) ++{ ++ if (ctx == qemu_get_current_aio_context()) { ++ return true; ++ } ++ ++ if (ctx == qemu_get_aio_context()) { ++ return qemu_mutex_iothread_locked(); ++ } else { ++ return false; ++ } ++} ++ + #endif /* QEMU_AIO_WAIT */ +diff --git a/include/block/aio.h b/include/block/aio.h +index 0ca25dfe..c527893b 100644 +--- a/include/block/aio.h ++++ b/include/block/aio.h +@@ -61,12 +61,16 @@ struct AioContext { + QLIST_HEAD(, AioHandler) aio_handlers; + + /* Used to avoid unnecessary event_notifier_set calls in aio_notify; +- * accessed with atomic primitives. If this field is 0, everything +- * (file descriptors, bottom halves, timers) will be re-evaluated +- * before the next blocking poll(), thus the event_notifier_set call +- * can be skipped. 
If it is non-zero, you may need to wake up a +- * concurrent aio_poll or the glib main event loop, making +- * event_notifier_set necessary. ++ * only written from the AioContext home thread, or under the BQL in ++ * the case of the main AioContext. However, it is read from any ++ * thread so it is still accessed with atomic primitives. ++ * ++ * If this field is 0, everything (file descriptors, bottom halves, ++ * timers) will be re-evaluated before the next blocking poll() or ++ * io_uring wait; therefore, the event_notifier_set call can be ++ * skipped. If it is non-zero, you may need to wake up a concurrent ++ * aio_poll or the glib main event loop, making event_notifier_set ++ * necessary. + * + * Bit 0 is reserved for GSource usage of the AioContext, and is 1 + * between a call to aio_ctx_prepare and the next call to aio_ctx_check. +@@ -581,19 +585,6 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co); + */ + AioContext *qemu_get_current_aio_context(void); + +-/** +- * in_aio_context_home_thread: +- * @ctx: the aio context +- * +- * Return whether we are running in the thread that normally runs @ctx. Note +- * that acquiring/releasing ctx does not affect the outcome, each AioContext +- * still only has one home thread that is responsible for running it. 
+- */ +-static inline bool in_aio_context_home_thread(AioContext *ctx) +-{ +- return ctx == qemu_get_current_aio_context(); +-} +- + /** + * aio_context_setup: + * @ctx: the aio context +-- +2.25.2 + diff --git a/async-use-explicit-memory-barriers.patch b/async-use-explicit-memory-barriers.patch new file mode 100644 index 0000000000000000000000000000000000000000..7fb68c949ad4e95cc0f908c44042096b69cf9295 --- /dev/null +++ b/async-use-explicit-memory-barriers.patch @@ -0,0 +1,171 @@ +From 787af8ed2bc86dc8688727d62a251965d9c42e00 Mon Sep 17 00:00:00 2001 +From: Ying Fang +Date: Fri, 10 Apr 2020 16:19:50 +0000 +Subject: [PATCH 2/2] async: use explicit memory barriers + +When using C11 atomics, non-seqcst reads and writes do not participate +in the total order of seqcst operations. In util/async.c and util/aio-posix.c, +in particular, the pattern that we use + + write ctx->notify_me write bh->scheduled + read bh->scheduled read ctx->notify_me + if !bh->scheduled, sleep if ctx->notify_me, notify + +needs to use seqcst operations for both the write and the read. In +general this is something that we do not want, because there can be +many sources that are polled in addition to bottom halves. The +alternative is to place a seqcst memory barrier between the write +and the read. This also comes with a disadvantage, in that the +memory barrier is implicit on strongly-ordered architectures and +it wastes a few dozen clock cycles. + +Fortunately, ctx->notify_me is never written concurrently by two +threads, so we can assert that and relax the writes to ctx->notify_me. +The resulting solution works and performs well on both aarch64 and x86. + +Note that the atomic_set/atomic_read combination is not an atomic +read-modify-write, and therefore it is even weaker than C11 ATOMIC_RELAXED; +on x86, ATOMIC_RELAXED compiles to a locked operation. 
+ +upstream_url: https://patchwork.kernel.org/patch/11482103/ + +Analyzed-by: Ying Fang +Signed-off-by: Paolo Bonzini +Tested-by: Ying Fang +Message-Id: <20200407140746.8041-6-pbonzini@redhat.com> +Signed-off-by: Stefan Hajnoczi +--- + util/aio-posix.c | 16 ++++++++++++++-- + util/aio-win32.c | 17 ++++++++++++++--- + util/async.c | 16 ++++++++++++---- + 3 files changed, 40 insertions(+), 9 deletions(-) + +diff --git a/util/aio-posix.c b/util/aio-posix.c +index 6fbfa792..ca58b9a4 100644 +--- a/util/aio-posix.c ++++ b/util/aio-posix.c +@@ -613,6 +613,11 @@ bool aio_poll(AioContext *ctx, bool blocking) + int64_t timeout; + int64_t start = 0; + ++ /* ++ * There cannot be two concurrent aio_poll calls for the same AioContext (or ++ * an aio_poll concurrent with a GSource prepare/check/dispatch callback). ++ * We rely on this below to avoid slow locked accesses to ctx->notify_me. ++ */ + assert(in_aio_context_home_thread(ctx)); + + /* aio_notify can avoid the expensive event_notifier_set if +@@ -623,7 +628,13 @@ bool aio_poll(AioContext *ctx, bool blocking) + * so disable the optimization now. + */ + if (blocking) { +- atomic_add(&ctx->notify_me, 2); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). ++ */ ++ smp_mb(); + } + + qemu_lockcnt_inc(&ctx->list_lock); +@@ -668,7 +679,8 @@ bool aio_poll(AioContext *ctx, bool blocking) + } + + if (blocking) { +- atomic_sub(&ctx->notify_me, 2); ++ /* Finish the poll before clearing the flag. 
*/ ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); + aio_notify_accept(ctx); + } + +diff --git a/util/aio-win32.c b/util/aio-win32.c +index a23b9c36..729d533f 100644 +--- a/util/aio-win32.c ++++ b/util/aio-win32.c +@@ -321,6 +321,12 @@ bool aio_poll(AioContext *ctx, bool blocking) + int count; + int timeout; + ++ /* ++ * There cannot be two concurrent aio_poll calls for the same AioContext (or ++ * an aio_poll concurrent with a GSource prepare/check/dispatch callback). ++ * We rely on this below to avoid slow locked accesses to ctx->notify_me. ++ */ ++ assert(in_aio_context_home_thread(ctx)); + progress = false; + + /* aio_notify can avoid the expensive event_notifier_set if +@@ -331,7 +337,13 @@ bool aio_poll(AioContext *ctx, bool blocking) + * so disable the optimization now. + */ + if (blocking) { +- atomic_add(&ctx->notify_me, 2); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). ++ */ ++ smp_mb(); + } + + qemu_lockcnt_inc(&ctx->list_lock); +@@ -364,8 +376,7 @@ bool aio_poll(AioContext *ctx, bool blocking) + ret = WaitForMultipleObjects(count, events, FALSE, timeout); + if (blocking) { + assert(first); +- assert(in_aio_context_home_thread(ctx)); +- atomic_sub(&ctx->notify_me, 2); ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); + aio_notify_accept(ctx); + } + +diff --git a/util/async.c b/util/async.c +index afc17fb3..12b33204 100644 +--- a/util/async.c ++++ b/util/async.c +@@ -221,7 +221,14 @@ aio_ctx_prepare(GSource *source, gint *timeout) + { + AioContext *ctx = (AioContext *) source; + +- atomic_or(&ctx->notify_me, 1); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) | 1); ++ ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). 
++ */ ++ smp_mb(); + + /* We assume there is no timeout already supplied */ + *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)); +@@ -239,7 +246,8 @@ aio_ctx_check(GSource *source) + AioContext *ctx = (AioContext *) source; + QEMUBH *bh; + +- atomic_and(&ctx->notify_me, ~1); ++ /* Finish computing the timeout before clearing the flag. */ ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) & ~1); + aio_notify_accept(ctx); + + for (bh = ctx->first_bh; bh; bh = bh->next) { +@@ -344,10 +352,10 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) + void aio_notify(AioContext *ctx) + { + /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs +- * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll. ++ * with smp_mb in aio_ctx_prepare or aio_poll. + */ + smp_mb(); +- if (ctx->notify_me) { ++ if (atomic_read(&ctx->notify_me)) { + event_notifier_set(&ctx->notifier); + atomic_mb_set(&ctx->notified, true); + } +-- +2.25.2 + diff --git a/qemu.spec b/qemu.spec index cc1eadadb1ffdb55e87b98f55e33fdf0c2760543..c2a390b06ad96488c8acb43fc007dd97e068c8fa 100644 --- a/qemu.spec +++ b/qemu.spec @@ -61,7 +61,8 @@ Patch0048: COLO-compare-Fix-incorrect-if-logic.patch Patch0049: qcow2-bitmap-Fix-uint64_t-left-shift-overflow.patch Patch0050: pcie-Add-pcie-root-port-fast-plug-unplug-feature.patch Patch0051: pcie-Compat-with-devices-which-do-not-support-Link-W.patch -Patch0052: util-async-Add-memory-barrier-to-aio_ctx_prepare.patch +Patch0052: aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch +Patch0053: async-use-explicit-memory-barriers.patch BuildRequires: flex BuildRequires: bison @@ -397,8 +398,9 @@ getent passwd qemu >/dev/null || \ %endif %changelog -* Thu Apr 2 2020 Huawei Technologies Co., Ltd. -util/async: Add memory barrier to aio_ctx_prepare +* Sat Apr 11 4 2020 Huawei Technologies Co., Ltd. 
+- aio-wait: delegate polling of main AioContext if BQL not held +- async: use explicit memory barriers * Wed Mar 18 2020 Huawei Technologies Co., Ltd. - pcie: Add pcie-root-port fast plug/unplug feature diff --git a/util-async-Add-memory-barrier-to-aio_ctx_prepare.patch b/util-async-Add-memory-barrier-to-aio_ctx_prepare.patch deleted file mode 100644 index 49648291c96d78e5bc33db952c50397076bfb2fc..0000000000000000000000000000000000000000 --- a/util-async-Add-memory-barrier-to-aio_ctx_prepare.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 99026ec6a2735c6ff2f094ac247f558f14e3f3b9 Mon Sep 17 00:00:00 2001 -From: Ying Fang -Date: Thu, 2 Apr 2020 15:53:47 +0800 -Subject: [PATCH] util/async: Add memory barrier to aio_ctx_prepare - -Qemu main thread is found to hang up in the mainloop when doing -image format convert on aarch64 platform and it is highly -reproduceable by executing test using: - -qemu-img convert -f qcow2 -O qcow2 origin.qcow2 converted.qcow2 - -This mysterious hang can be explained by a race condition between -the main thread and an io worker thread. There can be a chance that -the last worker thread has called aio_bh_schedule_oneshot and it is -checking against notify_me to deliver a notfiy event. At the same -time, the main thread is calling aio_ctx_prepare however it first -calls qemu_timeout_ns_to_ms, thus the worker thread did not see -notify_me as true and did not send a notify event. 
The time line -can be shown in the following way: - - Main Thread - ------------------------------------------------ - aio_ctx_prepare - atomic_or(&ctx->notify_me, 1); - /* out of order execution goes here */ - *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)); - - Worker Thread - ----------------------------------------------- - aio_bh_schedule_oneshot -> aio_bh_enqueue - aio_notify - smp_mb(); - if (ctx->notify_me) { /* worker thread checks notify_me here */ - event_notifier_set(&ctx->notifier); - atomic_mb_set(&ctx->notified, true); - } - -Normal VM runtime is not affected by this hang since there is always some -timer timeout or subsequent io worker come and notify the main thead. -To fix this problem, a memory barrier is added to aio_ctx_prepare and -it is proved to have the hang fixed in our test. - -This hang is not observed on the x86 platform however it can be easily -reproduced on the aarch64 platform, thus it is architecture related. -Not sure if this is revelant to Commit eabc977973103527bbb8fed69c91cfaa6691f8ab - -Signed-off-by: Ying Fang -Signed-off-by: zhanghailiang -Reported-by: Euler Robot ---- - util/async.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/util/async.c b/util/async.c -index afc17fb3..50dfa9ce 100644 ---- a/util/async.c -+++ b/util/async.c -@@ -222,7 +222,8 @@ aio_ctx_prepare(GSource *source, gint *timeout) - AioContext *ctx = (AioContext *) source; - - atomic_or(&ctx->notify_me, 1); -- -+ /* Make sure notify_me is set before aio_compute_timeout */ -+ smp_mb(); - /* We assume there is no timeout already supplied */ - *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)); - --- -2.23.0 -