diff --git a/aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch b/aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d27d763bb19d0b6b3323eb8f7647e5d89cb1df22
--- /dev/null
+++ b/aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch
@@ -0,0 +1,120 @@
+From a2bae876b7f694b12073bac8ad6668e4d975ad88 Mon Sep 17 00:00:00 2001
+From: Ying Fang
+Date: Fri, 10 Apr 2020 16:08:19 +0000
+Subject: [PATCH 1/2] aio-wait: delegate polling of main AioContext if BQL not
+ held
+
+Any thread that is not an iothread returns NULL for qemu_get_current_aio_context().
+As a result, it would also return true for
+in_aio_context_home_thread(qemu_get_aio_context()), causing
+AIO_WAIT_WHILE to invoke aio_poll() directly. This is incorrect
+if the BQL is not held, because aio_poll() does not expect to
+run concurrently from multiple threads, and it can actually
+happen when savevm writes to the vmstate file from the
+migration thread.
+
+Therefore, restrict in_aio_context_home_thread to return true
+for the main AioContext only if the BQL is held.
+
+The function is moved to aio-wait.h because it is mostly used
+there and to avoid a circular reference between main-loop.h
+and block/aio.h.
+ +upstream_url: https://patchwork.kernel.org/patch/11482099/ + +Signed-off-by: Paolo Bonzini +Message-Id: <20200407140746.8041-5-pbonzini@redhat.com> +Signed-off-by: Stefan Hajnoczi +--- + include/block/aio-wait.h | 22 ++++++++++++++++++++++ + include/block/aio.h | 29 ++++++++++------------------- + 2 files changed, 32 insertions(+), 19 deletions(-) + +diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h +index afd0ff7e..d349e7e3 100644 +--- a/include/block/aio-wait.h ++++ b/include/block/aio-wait.h +@@ -26,6 +26,7 @@ + #define QEMU_AIO_WAIT_H + + #include "block/aio.h" ++#include "qemu/main-loop.h" + + /** + * AioWait: +@@ -124,4 +125,25 @@ void aio_wait_kick(void); + */ + void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque); + ++/** ++ * in_aio_context_home_thread: ++ * @ctx: the aio context ++ * ++ * Return whether we are running in the thread that normally runs @ctx. Note ++ * that acquiring/releasing ctx does not affect the outcome, each AioContext ++ * still only has one home thread that is responsible for running it. ++ */ ++static inline bool in_aio_context_home_thread(AioContext *ctx) ++{ ++ if (ctx == qemu_get_current_aio_context()) { ++ return true; ++ } ++ ++ if (ctx == qemu_get_aio_context()) { ++ return qemu_mutex_iothread_locked(); ++ } else { ++ return false; ++ } ++} ++ + #endif /* QEMU_AIO_WAIT */ +diff --git a/include/block/aio.h b/include/block/aio.h +index 0ca25dfe..c527893b 100644 +--- a/include/block/aio.h ++++ b/include/block/aio.h +@@ -61,12 +61,16 @@ struct AioContext { + QLIST_HEAD(, AioHandler) aio_handlers; + + /* Used to avoid unnecessary event_notifier_set calls in aio_notify; +- * accessed with atomic primitives. If this field is 0, everything +- * (file descriptors, bottom halves, timers) will be re-evaluated +- * before the next blocking poll(), thus the event_notifier_set call +- * can be skipped. 
If it is non-zero, you may need to wake up a +- * concurrent aio_poll or the glib main event loop, making +- * event_notifier_set necessary. ++ * only written from the AioContext home thread, or under the BQL in ++ * the case of the main AioContext. However, it is read from any ++ * thread so it is still accessed with atomic primitives. ++ * ++ * If this field is 0, everything (file descriptors, bottom halves, ++ * timers) will be re-evaluated before the next blocking poll() or ++ * io_uring wait; therefore, the event_notifier_set call can be ++ * skipped. If it is non-zero, you may need to wake up a concurrent ++ * aio_poll or the glib main event loop, making event_notifier_set ++ * necessary. + * + * Bit 0 is reserved for GSource usage of the AioContext, and is 1 + * between a call to aio_ctx_prepare and the next call to aio_ctx_check. +@@ -581,19 +585,6 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co); + */ + AioContext *qemu_get_current_aio_context(void); + +-/** +- * in_aio_context_home_thread: +- * @ctx: the aio context +- * +- * Return whether we are running in the thread that normally runs @ctx. Note +- * that acquiring/releasing ctx does not affect the outcome, each AioContext +- * still only has one home thread that is responsible for running it. 
+- */ +-static inline bool in_aio_context_home_thread(AioContext *ctx) +-{ +- return ctx == qemu_get_current_aio_context(); +-} +- + /** + * aio_context_setup: + * @ctx: the aio context +-- +2.25.2 + diff --git a/async-use-explicit-memory-barriers.patch b/async-use-explicit-memory-barriers.patch new file mode 100644 index 0000000000000000000000000000000000000000..7fb68c949ad4e95cc0f908c44042096b69cf9295 --- /dev/null +++ b/async-use-explicit-memory-barriers.patch @@ -0,0 +1,171 @@ +From 787af8ed2bc86dc8688727d62a251965d9c42e00 Mon Sep 17 00:00:00 2001 +From: Ying Fang +Date: Fri, 10 Apr 2020 16:19:50 +0000 +Subject: [PATCH 2/2] async: use explicit memory barriers + +When using C11 atomics, non-seqcst reads and writes do not participate +in the total order of seqcst operations. In util/async.c and util/aio-posix.c, +in particular, the pattern that we use + + write ctx->notify_me write bh->scheduled + read bh->scheduled read ctx->notify_me + if !bh->scheduled, sleep if ctx->notify_me, notify + +needs to use seqcst operations for both the write and the read. In +general this is something that we do not want, because there can be +many sources that are polled in addition to bottom halves. The +alternative is to place a seqcst memory barrier between the write +and the read. This also comes with a disadvantage, in that the +memory barrier is implicit on strongly-ordered architectures and +it wastes a few dozen clock cycles. + +Fortunately, ctx->notify_me is never written concurrently by two +threads, so we can assert that and relax the writes to ctx->notify_me. +The resulting solution works and performs well on both aarch64 and x86. + +Note that the atomic_set/atomic_read combination is not an atomic +read-modify-write, and therefore it is even weaker than C11 ATOMIC_RELAXED; +on x86, ATOMIC_RELAXED compiles to a locked operation. 
+ +upstream_url: https://patchwork.kernel.org/patch/11482103/ + +Analyzed-by: Ying Fang +Signed-off-by: Paolo Bonzini +Tested-by: Ying Fang +Message-Id: <20200407140746.8041-6-pbonzini@redhat.com> +Signed-off-by: Stefan Hajnoczi +--- + util/aio-posix.c | 16 ++++++++++++++-- + util/aio-win32.c | 17 ++++++++++++++--- + util/async.c | 16 ++++++++++++---- + 3 files changed, 40 insertions(+), 9 deletions(-) + +diff --git a/util/aio-posix.c b/util/aio-posix.c +index 6fbfa792..ca58b9a4 100644 +--- a/util/aio-posix.c ++++ b/util/aio-posix.c +@@ -613,6 +613,11 @@ bool aio_poll(AioContext *ctx, bool blocking) + int64_t timeout; + int64_t start = 0; + ++ /* ++ * There cannot be two concurrent aio_poll calls for the same AioContext (or ++ * an aio_poll concurrent with a GSource prepare/check/dispatch callback). ++ * We rely on this below to avoid slow locked accesses to ctx->notify_me. ++ */ + assert(in_aio_context_home_thread(ctx)); + + /* aio_notify can avoid the expensive event_notifier_set if +@@ -623,7 +628,13 @@ bool aio_poll(AioContext *ctx, bool blocking) + * so disable the optimization now. + */ + if (blocking) { +- atomic_add(&ctx->notify_me, 2); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). ++ */ ++ smp_mb(); + } + + qemu_lockcnt_inc(&ctx->list_lock); +@@ -668,7 +679,8 @@ bool aio_poll(AioContext *ctx, bool blocking) + } + + if (blocking) { +- atomic_sub(&ctx->notify_me, 2); ++ /* Finish the poll before clearing the flag. 
*/ ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); + aio_notify_accept(ctx); + } + +diff --git a/util/aio-win32.c b/util/aio-win32.c +index a23b9c36..729d533f 100644 +--- a/util/aio-win32.c ++++ b/util/aio-win32.c +@@ -321,6 +321,12 @@ bool aio_poll(AioContext *ctx, bool blocking) + int count; + int timeout; + ++ /* ++ * There cannot be two concurrent aio_poll calls for the same AioContext (or ++ * an aio_poll concurrent with a GSource prepare/check/dispatch callback). ++ * We rely on this below to avoid slow locked accesses to ctx->notify_me. ++ */ ++ assert(in_aio_context_home_thread(ctx)); + progress = false; + + /* aio_notify can avoid the expensive event_notifier_set if +@@ -331,7 +337,13 @@ bool aio_poll(AioContext *ctx, bool blocking) + * so disable the optimization now. + */ + if (blocking) { +- atomic_add(&ctx->notify_me, 2); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) + 2); ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). ++ */ ++ smp_mb(); + } + + qemu_lockcnt_inc(&ctx->list_lock); +@@ -364,8 +376,7 @@ bool aio_poll(AioContext *ctx, bool blocking) + ret = WaitForMultipleObjects(count, events, FALSE, timeout); + if (blocking) { + assert(first); +- assert(in_aio_context_home_thread(ctx)); +- atomic_sub(&ctx->notify_me, 2); ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) - 2); + aio_notify_accept(ctx); + } + +diff --git a/util/async.c b/util/async.c +index afc17fb3..12b33204 100644 +--- a/util/async.c ++++ b/util/async.c +@@ -221,7 +221,14 @@ aio_ctx_prepare(GSource *source, gint *timeout) + { + AioContext *ctx = (AioContext *) source; + +- atomic_or(&ctx->notify_me, 1); ++ atomic_set(&ctx->notify_me, atomic_read(&ctx->notify_me) | 1); ++ ++ /* ++ * Write ctx->notify_me before computing the timeout ++ * (reading bottom half flags, etc.). Pairs with ++ * smp_mb in aio_notify(). 
++ */ ++ smp_mb(); + + /* We assume there is no timeout already supplied */ + *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)); +@@ -239,7 +246,8 @@ aio_ctx_check(GSource *source) + AioContext *ctx = (AioContext *) source; + QEMUBH *bh; + +- atomic_and(&ctx->notify_me, ~1); ++ /* Finish computing the timeout before clearing the flag. */ ++ atomic_store_release(&ctx->notify_me, atomic_read(&ctx->notify_me) & ~1); + aio_notify_accept(ctx); + + for (bh = ctx->first_bh; bh; bh = bh->next) { +@@ -344,10 +352,10 @@ LinuxAioState *aio_get_linux_aio(AioContext *ctx) + void aio_notify(AioContext *ctx) + { + /* Write e.g. bh->scheduled before reading ctx->notify_me. Pairs +- * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll. ++ * with smp_mb in aio_ctx_prepare or aio_poll. + */ + smp_mb(); +- if (ctx->notify_me) { ++ if (atomic_read(&ctx->notify_me)) { + event_notifier_set(&ctx->notifier); + atomic_mb_set(&ctx->notified, true); + } +-- +2.25.2 + diff --git a/qemu.spec b/qemu.spec index cc1eadadb1ffdb55e87b98f55e33fdf0c2760543..c2a390b06ad96488c8acb43fc007dd97e068c8fa 100644 --- a/qemu.spec +++ b/qemu.spec @@ -61,7 +61,8 @@ Patch0048: COLO-compare-Fix-incorrect-if-logic.patch Patch0049: qcow2-bitmap-Fix-uint64_t-left-shift-overflow.patch Patch0050: pcie-Add-pcie-root-port-fast-plug-unplug-feature.patch Patch0051: pcie-Compat-with-devices-which-do-not-support-Link-W.patch -Patch0052: util-async-Add-memory-barrier-to-aio_ctx_prepare.patch +Patch0052: aio-wait-delegate-polling-of-main-AioContext-if-BQL-not-held.patch +Patch0053: async-use-explicit-memory-barriers.patch BuildRequires: flex BuildRequires: bison @@ -397,8 +398,9 @@ getent passwd qemu >/dev/null || \ %endif %changelog -* Thu Apr 2 2020 Huawei Technologies Co., Ltd. -util/async: Add memory barrier to aio_ctx_prepare +* Sat Apr 11 4 2020 Huawei Technologies Co., Ltd. 
+- aio-wait: delegate polling of main AioContext if BQL not held +- async: use explicit memory barriers * Wed Mar 18 2020 Huawei Technologies Co., Ltd. - pcie: Add pcie-root-port fast plug/unplug feature diff --git a/util-async-Add-memory-barrier-to-aio_ctx_prepare.patch b/util-async-Add-memory-barrier-to-aio_ctx_prepare.patch deleted file mode 100644 index 49648291c96d78e5bc33db952c50397076bfb2fc..0000000000000000000000000000000000000000 --- a/util-async-Add-memory-barrier-to-aio_ctx_prepare.patch +++ /dev/null @@ -1,70 +0,0 @@ -From 99026ec6a2735c6ff2f094ac247f558f14e3f3b9 Mon Sep 17 00:00:00 2001 -From: Ying Fang -Date: Thu, 2 Apr 2020 15:53:47 +0800 -Subject: [PATCH] util/async: Add memory barrier to aio_ctx_prepare - -Qemu main thread is found to hang up in the mainloop when doing -image format convert on aarch64 platform and it is highly -reproduceable by executing test using: - -qemu-img convert -f qcow2 -O qcow2 origin.qcow2 converted.qcow2 - -This mysterious hang can be explained by a race condition between -the main thread and an io worker thread. There can be a chance that -the last worker thread has called aio_bh_schedule_oneshot and it is -checking against notify_me to deliver a notfiy event. At the same -time, the main thread is calling aio_ctx_prepare however it first -calls qemu_timeout_ns_to_ms, thus the worker thread did not see -notify_me as true and did not send a notify event. 
The time line -can be shown in the following way: - - Main Thread - ------------------------------------------------ - aio_ctx_prepare - atomic_or(&ctx->notify_me, 1); - /* out of order execution goes here */ - *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)); - - Worker Thread - ----------------------------------------------- - aio_bh_schedule_oneshot -> aio_bh_enqueue - aio_notify - smp_mb(); - if (ctx->notify_me) { /* worker thread checks notify_me here */ - event_notifier_set(&ctx->notifier); - atomic_mb_set(&ctx->notified, true); - } - -Normal VM runtime is not affected by this hang since there is always some -timer timeout or subsequent io worker come and notify the main thead. -To fix this problem, a memory barrier is added to aio_ctx_prepare and -it is proved to have the hang fixed in our test. - -This hang is not observed on the x86 platform however it can be easily -reproduced on the aarch64 platform, thus it is architecture related. -Not sure if this is revelant to Commit eabc977973103527bbb8fed69c91cfaa6691f8ab - -Signed-off-by: Ying Fang -Signed-off-by: zhanghailiang -Reported-by: Euler Robot ---- - util/async.c | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/util/async.c b/util/async.c -index afc17fb3..50dfa9ce 100644 ---- a/util/async.c -+++ b/util/async.c -@@ -222,7 +222,8 @@ aio_ctx_prepare(GSource *source, gint *timeout) - AioContext *ctx = (AioContext *) source; - - atomic_or(&ctx->notify_me, 1); -- -+ /* Make sure notify_me is set before aio_compute_timeout */ -+ smp_mb(); - /* We assume there is no timeout already supplied */ - *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)); - --- -2.23.0 -