From 5c92173e43bd4ae8608197659901125dfb386a25 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:10 +0200 Subject: [PATCH 01/90] migration/multifd: Fix error message in multifd_recv_initial_packet() commit c77b40859a5201f01b44dc475258405e289c431f upstream. In multifd_recv_initial_packet(), if MultiFDInit_t->id is greater than the configured number of multifd channels, an irrelevant error message about multifd version is printed. Change the error message to a relevant one about the channel id. Intel-SIG: commit c77b40859a52 migration/multifd: Fix error message in multifd_recv_initial_packet() Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-6-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 409460684f2..a6204fc72f4 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -228,8 +228,8 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) } if (msg.id > migrate_multifd_channels()) { - error_setg(errp, "multifd: received channel version %u " - "expected %u", msg.version, MULTIFD_VERSION); + error_setg(errp, "multifd: received channel id %u is greater than " + "number of channels %u", msg.id, migrate_multifd_channels()); return -1; } -- Gitee From 5e07e72c5ee04a83386b278d142ffb4293c9b337 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:11 +0200 Subject: [PATCH 02/90] migration/multifd: Simplify multifd_channel_connect() if else statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit a4395f5d3c06472ed70d9ef9f79878f95575be9e upstream. The else branch in multifd_channel_connect() is redundant because when the if branch is taken the function returns. Simplify the code by removing the else branch. Intel-SIG: commit a4395f5d3c06 migration/multifd: Simplify multifd_channel_connect() if else statement Signed-off-by: Avihai Horon Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231231093016.14204-7-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index a6204fc72f4..55d5fd55f82 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -847,14 +847,13 @@ static bool multifd_channel_connect(MultiFDSendParams *p, * so we mustn't call multifd_send_thread until then */ return multifd_tls_channel_connect(p, ioc, errp); - - } else { - migration_ioc_register_yank(ioc); - p->registered_yank = true; - p->c = ioc; - qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, - QEMU_THREAD_JOINABLE); } + + migration_ioc_register_yank(ioc); + p->registered_yank = true; + p->c = ioc; + qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, + QEMU_THREAD_JOINABLE); return true; } -- Gitee From bd2d6353f6dfaeb19611d6ebcfe161e7d14db33e Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:12 +0200 Subject: [PATCH 03/90] migration/multifd: Fix leaking of Error in TLS error flow commit 6ae208ce9656114e428b1a75ac62a6761ed3216c upstream. If there is an error in multifd TLS handshake task, multifd_tls_outgoing_handshake() retrieves the error with qio_task_propagate_error() but never frees it. Fix it by freeing the obtained Error. In addition, the error is not reported at all, so report it with migrate_set_error(). Intel-SIG: commit 6ae208ce9656 migration/multifd: Fix leaking of Error in TLS error flow Fixes: 29647140157a ("migration/tls: add support for multifd tls-handshake") Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-8-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/migration/multifd.c b/migration/multifd.c index 55d5fd55f82..9ac24866adf 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -787,6 +787,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); + migrate_set_error(migrate_get_current(), err); /* * Error happen, mark multifd_send_thread status as 'quit' although it * is not created, and then tell who pay attention to me. @@ -794,6 +795,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, p->quit = true; qemu_sem_post(&multifd_send_state->channels_ready); qemu_sem_post(&p->sem_sync); + error_free(err); } static void *multifd_tls_handshake_thread(void *opaque) -- Gitee From 65d482a1654600997464e68035e08325a5d901b5 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:13 +0200 Subject: [PATCH 04/90] migration/multifd: Remove error_setg() in migration_ioc_process_incoming() commit 1d3886f837d8e972366a8b58ba8afb0e5efbeed7 upstream. If multifd_load_setup() fails in migration_ioc_process_incoming(), error_setg() is called with errp. This will lead to an assert because in that case errp already contains an error. Fix it by removing the redundant error_setg(). Intel-SIG: commit 1d3886f837d8 migration/multifd: Remove error_setg() in migration_ioc_process_incoming() Fixes: 6720c2b32725 ("migration: check magic value for deciding the mapping of channels") Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20231231093016.14204-9-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 1 - 1 file changed, 1 deletion(-) diff --git a/migration/migration.c b/migration/migration.c index 8b1b47836f4..31abc7c3a3b 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -857,7 +857,6 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) } if (multifd_load_setup(errp) != 0) { - error_setg(errp, "Failed to setup multifd channels"); return; } -- Gitee From 7bbba86cdb6f492c4cc83651edc6049b0195e474 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:14 +0200 Subject: [PATCH 05/90] migration: Fix migration_channel_read_peek() error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4f8cf323e80c17f7d4b5604f1699591326df6262 upstream. migration_channel_read_peek() calls qio_channel_readv_full() and handles both cases of return value == 0 and return value < 0 the same way, by calling error_setg() with errp. However, if return value < 0, errp is already set, so calling error_setg() with errp will lead to an assert. Fix it by handling these cases separately, calling error_setg() with errp only in return value == 0 case. Intel-SIG: commit 4f8cf323e80c migration: Fix migration_channel_read_peek() error path Fixes: 6720c2b32725 ("migration: check magic value for deciding the mapping of channels") Signed-off-by: Avihai Horon Reviewed-by: Fabiano Rosas Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231231093016.14204-10-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/channel.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/migration/channel.c b/migration/channel.c index ca3319a3098..f9de064f3b1 100644 --- a/migration/channel.c +++ b/migration/channel.c @@ -117,9 +117,12 @@ int migration_channel_read_peek(QIOChannel *ioc, len = qio_channel_readv_full(ioc, &iov, 1, NULL, NULL, QIO_CHANNEL_READ_FLAG_MSG_PEEK, errp); - if (len <= 0 && len != QIO_CHANNEL_ERR_BLOCK) { - error_setg(errp, - "Failed to peek at channel"); + if (len < 0 && len != QIO_CHANNEL_ERR_BLOCK) { + return -1; + } + + if (len == 0) { + error_setg(errp, "Failed to peek at channel"); return -1; } -- Gitee From 6c0433dbe088f5958e6745521be82c2756b75318 Mon Sep 17 00:00:00 2001 From: Avihai Horon Date: Sun, 31 Dec 2023 11:30:16 +0200 Subject: [PATCH 06/90] migration/multifd: Remove unnecessary usage of local Error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3fc58efa938338a82e4d5c0c031e7e9c98e9544f upstream. According to Error API, usage of ERRP_GUARD() or a local Error instead of errp is needed if errp is passed to void functions, where it is later dereferenced to see if an error occurred. There are several places in multifd.c that use local Error although it is not needed. Change these places to use errp directly. Intel-SIG: commit 3fc58efa9383 migration/multifd: Remove unnecessary usage of local Error Signed-off-by: Avihai Horon Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20231231093016.14204-12-avihaih@nvidia.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 9ac24866adf..9f353aecfa2 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -951,12 +951,10 @@ int multifd_save_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - Error *local_err = NULL; int ret; - ret = multifd_send_state->ops->send_setup(p, &local_err); + ret = multifd_send_state->ops->send_setup(p, errp); if (ret) { - error_propagate(errp, local_err); return ret; } } @@ -1195,12 +1193,10 @@ int multifd_load_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - Error *local_err = NULL; int ret; - ret = multifd_recv_state->ops->recv_setup(p, &local_err); + ret = multifd_recv_state->ops->recv_setup(p, errp); if (ret) { - error_propagate(errp, local_err); return ret; } } -- Gitee From 6c6164252bf6ee888dff574fcba6121da10f43c6 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:38 -0300 Subject: [PATCH 07/90] migration/multifd: Remove MultiFDPages_t::packet_num commit dca1bc7f24d2fa227f0b787f85f3cc67006e67bf upstream. This was introduced by commit 34c55a94b1 ("migration: Create multipage support") and never used. Intel-SIG: commit dca1bc7f24d2 migration/multifd: Remove MultiFDPages_t::packet_num Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 1 - migration/multifd.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 9f353aecfa2..3e650f5da08 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -250,7 +250,6 @@ static void multifd_pages_clear(MultiFDPages_t *pages) { pages->num = 0; pages->allocated = 0; - pages->packet_num = 0; pages->block = NULL; g_free(pages->offset); pages->offset = NULL; diff --git a/migration/multifd.h b/migration/multifd.h index a835643b48c..b0ff610c377 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -58,8 +58,6 @@ typedef struct { uint32_t num; /* number of allocated pages */ uint32_t allocated; - /* global number of generated multifd packets */ - uint64_t packet_num; /* offset of each page */ ram_addr_t *offset; RAMBlock *block; -- Gitee From 069d9f227719d50e77ea8f4a540ea605d0be03eb Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:39 -0300 Subject: [PATCH 08/90] migration/multifd: Remove QEMUFile from where it is not needed commit 9346fa1870784c70618bfd5a9e1f1da89de0c5ec upstream. Intel-SIG: commit 9346fa187078 migration/multifd: Remove QEMUFile from where it is not needed Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-3-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/ram.c [jz: resolve context conflict] Signed-off-by: Jason Zeng --- migration/multifd.c | 12 ++++++------ migration/multifd.h | 4 ++-- migration/ram.c | 15 +++++++-------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3e650f5da08..2dbc3ba836a 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -390,7 +390,7 @@ struct { * false. */ -static int multifd_send_pages(QEMUFile *f) +static int multifd_send_pages(void) { int i; static int next_channel; @@ -436,7 +436,7 @@ static int multifd_send_pages(QEMUFile *f) return 1; } -int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) +int multifd_queue_page(RAMBlock *block, ram_addr_t offset) { MultiFDPages_t *pages = multifd_send_state->pages; bool changed = false; @@ -456,12 +456,12 @@ int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset) changed = true; } - if (multifd_send_pages(f) < 0) { + if (multifd_send_pages() < 0) { return -1; } if (changed) { - return multifd_queue_page(f, block, offset); + return multifd_queue_page(block, offset); } return 1; @@ -583,7 +583,7 @@ static int multifd_zero_copy_flush(QIOChannel *c) return ret; } -int multifd_send_sync_main(QEMUFile *f) +int multifd_send_sync_main(void) { int i; bool flush_zero_copy; @@ -592,7 +592,7 @@ int multifd_send_sync_main(QEMUFile *f) return 0; } if (multifd_send_state->pages->num) { - if (multifd_send_pages(f) < 0) { + if (multifd_send_pages() < 0) { error_report("%s: multifd_send_pages fail", __func__); return -1; } diff --git a/migration/multifd.h b/migration/multifd.h index b0ff610c377..35d11f103cd 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -21,8 +21,8 @@ void multifd_load_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); -int multifd_send_sync_main(QEMUFile *f); -int multifd_queue_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset); +int multifd_send_sync_main(void); +int multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) diff --git a/migration/ram.c b/migration/ram.c index 71353bc90bf..4706dcda7dd 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1384,10 +1384,9 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) return pages; } -static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block, - ram_addr_t offset) +static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(file, block, offset) < 0) { + if (multifd_queue_page(block, offset) < 0) { return -1; } stat64_add(&mig_stats.normal_pages, 1); @@ -1470,7 +1469,7 @@ static int find_dirty_block(RAMState *rs, PageSearchStatus *pss) if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) { QEMUFile *f = rs->pss[RAM_CHANNEL_PRECOPY].pss_channel; - int ret = multifd_send_sync_main(f); + int ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -2262,7 +2261,7 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) * still see partially copied pages which is data corruption. */ if (migrate_multifd() && !migration_in_postcopy()) { - return ram_save_multifd_page(pss->pss_channel, block, offset); + return ram_save_multifd_page(block, offset); } return ram_save_page(rs, pss); @@ -3404,7 +3403,7 @@ static int ram_save_setup(QEMUFile *f, void *opaque) migration_ops->ram_save_target_page = ram_save_target_page_legacy; qemu_mutex_unlock_iothread(); - ret = multifd_send_sync_main(f); + ret = multifd_send_sync_main(); qemu_mutex_lock_iothread(); if (ret < 0) { return ret; @@ -3528,7 +3527,7 @@ out: if (ret >= 0 && migration_is_setup_or_active(migrate_get_current()->state)) { if (migrate_multifd() && migrate_multifd_flush_after_each_section()) { - ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); + ret = multifd_send_sync_main(); if (ret < 0) { return ret; } @@ -3624,7 +3623,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) } } - ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel); + ret = multifd_send_sync_main(); if (ret < 0) { return ret; } -- Gitee From fe6d85998d3786972bf1e54de78a754940d5c9c9 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:40 -0300 Subject: [PATCH 09/90] migration/multifd: Change multifd_pages_init argument commit 6074f81625800743e4c374aecf7dd30774aaf6e0 upstream. The 'size' argument is actually the number of pages that fit in a multifd packet. Change it to uint32_t and rename. Intel-SIG: commit 6074f8162580 migration/multifd: Change multifd_pages_init argument Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-4-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 2dbc3ba836a..25cbc6dc6be 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -236,12 +236,12 @@ static int multifd_recv_initial_packet(QIOChannel *c, Error **errp) return msg.id; } -static MultiFDPages_t *multifd_pages_init(size_t size) +static MultiFDPages_t *multifd_pages_init(uint32_t n) { MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1); - pages->allocated = size; - pages->offset = g_new0(ram_addr_t, size); + pages->allocated = n; + pages->offset = g_new0(ram_addr_t, n); return pages; } -- Gitee From 47f33fa2ef725e70c4cc7e7376bea09c456020e5 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:41 -0300 Subject: [PATCH 10/90] migration: Report error in incoming migration commit e3b8ad5c13714cca5e3fc1445472171fbcd469bc upstream. We're not currently reporting the errors set with migrate_set_error() when incoming migration fails. Intel-SIG: commit e3b8ad5c1371 migration: Report error in incoming migration Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-5-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/migration/migration.c b/migration/migration.c index 31abc7c3a3b..699ba0c8347 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -706,6 +706,13 @@ process_incoming_migration_co(void *opaque) } if (ret < 0) { + MigrationState *s = migrate_get_current(); + + if (migrate_has_error(s)) { + WITH_QEMU_LOCK_GUARD(&s->error_mutex) { + error_report_err(s->error); + } + } error_report("load of migration failed: %s", strerror(-ret)); goto fail; } -- Gitee From 27be910b9f7de4a1963bc1ad1d111dc2f2c478de Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:42 -0300 Subject: [PATCH 11/90] tests/qtest/migration: Print migration incoming errors commit 679a7382a389875c0f7835a1a409ebf4859f8410 upstream. We're currently just asserting when incoming migration fails. Let's print the error message from QMP as well. Intel-SIG: commit 679a7382a389 tests/qtest/migration: Print migration incoming errors Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-6-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-helpers.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c index 24fb7b3525c..f1106128a96 100644 --- a/tests/qtest/migration-helpers.c +++ b/tests/qtest/migration-helpers.c @@ -118,6 +118,12 @@ void migrate_incoming_qmp(QTestState *to, const char *uri, const char *fmt, ...) rsp = qtest_qmp(to, "{ 'execute': 'migrate-incoming', 'arguments': %p}", args); + + if (!qdict_haskey(rsp, "return")) { + g_autoptr(GString) s = qobject_to_json_pretty(QOBJECT(rsp), true); + g_test_message("%s", s->str); + } + g_assert(qdict_haskey(rsp, "return")); qobject_unref(rsp); -- Gitee From 71737e11ecb3d15b6b2861234f605bda3af66643 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:43 -0300 Subject: [PATCH 12/90] tests/qtest/migration: Add a wrapper to print test names commit e33b6712dba206547a313a6f2608b0fd967ee558 upstream. Our usage of gtest results in us losing the very basic functionality of "knowing which test failed". The issue is that gtest only prints test names ("paths" in gtest parlance) once the test has finished, but we use asserts in the tests and crash gtest itself before it can print anything. We also use a final abort when the result of g_test_run is not 0. Depending on how the test failed/broke we can see the function that trigged the abort, which may be representative of the test, but it could also just be some generic function. We have been relying on the primitive method of looking at the name of the previous successful test and then looking at the code to figure out which test should have come next. Add a wrapper to the test registration that does the job of printing the test name before running. Intel-SIG: commit e33b6712dba2 tests/qtest/migration: Add a wrapper to print test names Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-7-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-helpers.c | 32 ++++++++++++++++++++++++++++++++ tests/qtest/migration-helpers.h | 1 + 2 files changed, 33 insertions(+) diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c index f1106128a96..164e09c299a 100644 --- a/tests/qtest/migration-helpers.c +++ b/tests/qtest/migration-helpers.c @@ -298,3 +298,35 @@ char *resolve_machine_version(const char *alias, const char *var1, return find_common_machine_version(machine_name, var1, var2); } + +typedef struct { + char *name; + void (*func)(void); +} MigrationTest; + +static void migration_test_destroy(gpointer data) +{ + MigrationTest *test = (MigrationTest *)data; + + g_free(test->name); + g_free(test); +} + +static void migration_test_wrapper(const void *data) +{ + MigrationTest *test = (MigrationTest *)data; + + g_test_message("Running /%s%s", qtest_get_arch(), test->name); + test->func(); +} + +void migration_test_add(const char *path, void (*fn)(void)) +{ + MigrationTest *test = g_new0(MigrationTest, 1); + + test->func = fn; + test->name = g_strdup(path); + + qtest_add_data_func_full(path, test, migration_test_wrapper, + migration_test_destroy); +} diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h index e31dc85cc75..0d9a02edc7e 100644 --- a/tests/qtest/migration-helpers.h +++ b/tests/qtest/migration-helpers.h @@ -47,4 +47,5 @@ char *find_common_machine_version(const char *mtype, const char *var1, const char *var2); char *resolve_machine_version(const char *alias, const char *var1, const char *var2); +void migration_test_add(const char *path, void (*fn)(void)); #endif /* MIGRATION_HELPERS_H */ -- Gitee From 6c0dc352e6ff3081cb135b118dee250fbefe79cc Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 4 Jan 2024 11:21:44 -0300 Subject: [PATCH 13/90] tests/qtest/migration: Use the new migration_test_add commit 6f0771de903bb7623dc85bbf9f94f641979daaaa upstream. Replace the tests registration with the new function that prints tests names. Intel-SIG: commit 6f0771de903b tests/qtest/migration: Use the new migration_test_add Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240104142144.9680-8-farosas@suse.de Signed-off-by: Peter Xu Conflicts: tests/qtest/migration-test.c [jz: resolve context conflicts due to live-suspend which is not backported] Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 201 ++++++++++++++++++----------------- 1 file changed, 104 insertions(+), 97 deletions(-) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 0fbaa6a90fd..4884a40be2c 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -3339,62 +3339,65 @@ int main(int argc, char **argv) module_call_init(MODULE_INIT_QOM); if (has_uffd) { - qtest_add_func("/migration/postcopy/plain", test_postcopy); - qtest_add_func("/migration/postcopy/recovery/plain", - test_postcopy_recovery); - qtest_add_func("/migration/postcopy/preempt/plain", test_postcopy_preempt); - qtest_add_func("/migration/postcopy/preempt/recovery/plain", - test_postcopy_preempt_recovery); + migration_test_add("/migration/postcopy/plain", test_postcopy); + migration_test_add("/migration/postcopy/recovery/plain", + test_postcopy_recovery); + migration_test_add("/migration/postcopy/preempt/plain", + test_postcopy_preempt); + migration_test_add("/migration/postcopy/preempt/recovery/plain", + test_postcopy_preempt_recovery); if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/postcopy/compress/plain", - test_postcopy_compress); - qtest_add_func("/migration/postcopy/recovery/compress/plain", - test_postcopy_recovery_compress); + migration_test_add("/migration/postcopy/compress/plain", + test_postcopy_compress); + migration_test_add("/migration/postcopy/recovery/compress/plain", + test_postcopy_recovery_compress); } #ifndef _WIN32 - qtest_add_func("/migration/postcopy/recovery/double-failures", - test_postcopy_recovery_double_fail); + migration_test_add("/migration/postcopy/recovery/double-failures", + test_postcopy_recovery_double_fail); #endif /* _WIN32 */ } - qtest_add_func("/migration/bad_dest", test_baddest); + migration_test_add("/migration/bad_dest", test_baddest); #ifndef _WIN32 if (!g_str_equal(arch, "s390x")) { - qtest_add_func("/migration/analyze-script", test_analyze_script); + migration_test_add("/migration/analyze-script", test_analyze_script); } #endif - qtest_add_func("/migration/precopy/unix/plain", test_precopy_unix_plain); - qtest_add_func("/migration/precopy/unix/xbzrle", test_precopy_unix_xbzrle); + migration_test_add("/migration/precopy/unix/plain", + test_precopy_unix_plain); + migration_test_add("/migration/precopy/unix/xbzrle", + test_precopy_unix_xbzrle); /* * Compression fails from time to time. * Put test here but don't enable it until everything is fixed. */ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/precopy/unix/compress/wait", - test_precopy_unix_compress); - qtest_add_func("/migration/precopy/unix/compress/nowait", - test_precopy_unix_compress_nowait); + migration_test_add("/migration/precopy/unix/compress/wait", + test_precopy_unix_compress); + migration_test_add("/migration/precopy/unix/compress/nowait", + test_precopy_unix_compress_nowait); } - qtest_add_func("/migration/precopy/file", - test_precopy_file); - qtest_add_func("/migration/precopy/file/offset", - test_precopy_file_offset); - qtest_add_func("/migration/precopy/file/offset/bad", - test_precopy_file_offset_bad); + migration_test_add("/migration/precopy/file", + test_precopy_file); + migration_test_add("/migration/precopy/file/offset", + test_precopy_file_offset); + migration_test_add("/migration/precopy/file/offset/bad", + test_precopy_file_offset_bad); /* * Our CI system has problems with shared memory. * Don't run this test until we find a workaround. */ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/mode/reboot", test_mode_reboot); + migration_test_add("/migration/mode/reboot", test_mode_reboot); } #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/precopy/unix/tls/psk", - test_precopy_unix_tls_psk); + migration_test_add("/migration/precopy/unix/tls/psk", + test_precopy_unix_tls_psk); if (has_uffd) { /* @@ -3402,110 +3405,114 @@ int main(int argc, char **argv) * channels are tested under precopy. Here what we want to test is the * general postcopy path that has TLS channel enabled. */ - qtest_add_func("/migration/postcopy/tls/psk", test_postcopy_tls_psk); - qtest_add_func("/migration/postcopy/recovery/tls/psk", - test_postcopy_recovery_tls_psk); - qtest_add_func("/migration/postcopy/preempt/tls/psk", - test_postcopy_preempt_tls_psk); - qtest_add_func("/migration/postcopy/preempt/recovery/tls/psk", - test_postcopy_preempt_all); + migration_test_add("/migration/postcopy/tls/psk", + test_postcopy_tls_psk); + migration_test_add("/migration/postcopy/recovery/tls/psk", + test_postcopy_recovery_tls_psk); + migration_test_add("/migration/postcopy/preempt/tls/psk", + test_postcopy_preempt_tls_psk); + migration_test_add("/migration/postcopy/preempt/recovery/tls/psk", + test_postcopy_preempt_all); } #ifdef CONFIG_TASN1 - qtest_add_func("/migration/precopy/unix/tls/x509/default-host", - test_precopy_unix_tls_x509_default_host); - qtest_add_func("/migration/precopy/unix/tls/x509/override-host", - test_precopy_unix_tls_x509_override_host); + migration_test_add("/migration/precopy/unix/tls/x509/default-host", + test_precopy_unix_tls_x509_default_host); + migration_test_add("/migration/precopy/unix/tls/x509/override-host", + test_precopy_unix_tls_x509_override_host); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ - qtest_add_func("/migration/precopy/tcp/plain", test_precopy_tcp_plain); + migration_test_add("/migration/precopy/tcp/plain", test_precopy_tcp_plain); - qtest_add_func("/migration/precopy/tcp/plain/switchover-ack", - test_precopy_tcp_switchover_ack); + migration_test_add("/migration/precopy/tcp/plain/switchover-ack", + test_precopy_tcp_switchover_ack); #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/precopy/tcp/tls/psk/match", - test_precopy_tcp_tls_psk_match); - qtest_add_func("/migration/precopy/tcp/tls/psk/mismatch", - test_precopy_tcp_tls_psk_mismatch); + migration_test_add("/migration/precopy/tcp/tls/psk/match", + test_precopy_tcp_tls_psk_match); + migration_test_add("/migration/precopy/tcp/tls/psk/mismatch", + test_precopy_tcp_tls_psk_mismatch); #ifdef CONFIG_TASN1 - qtest_add_func("/migration/precopy/tcp/tls/x509/default-host", - test_precopy_tcp_tls_x509_default_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/override-host", - test_precopy_tcp_tls_x509_override_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/mismatch-host", - test_precopy_tcp_tls_x509_mismatch_host); - qtest_add_func("/migration/precopy/tcp/tls/x509/friendly-client", - test_precopy_tcp_tls_x509_friendly_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/hostile-client", - test_precopy_tcp_tls_x509_hostile_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/allow-anon-client", - test_precopy_tcp_tls_x509_allow_anon_client); - qtest_add_func("/migration/precopy/tcp/tls/x509/reject-anon-client", - test_precopy_tcp_tls_x509_reject_anon_client); + migration_test_add("/migration/precopy/tcp/tls/x509/default-host", + test_precopy_tcp_tls_x509_default_host); + migration_test_add("/migration/precopy/tcp/tls/x509/override-host", + test_precopy_tcp_tls_x509_override_host); + migration_test_add("/migration/precopy/tcp/tls/x509/mismatch-host", + test_precopy_tcp_tls_x509_mismatch_host); + migration_test_add("/migration/precopy/tcp/tls/x509/friendly-client", + test_precopy_tcp_tls_x509_friendly_client); + migration_test_add("/migration/precopy/tcp/tls/x509/hostile-client", + test_precopy_tcp_tls_x509_hostile_client); + migration_test_add("/migration/precopy/tcp/tls/x509/allow-anon-client", + test_precopy_tcp_tls_x509_allow_anon_client); + migration_test_add("/migration/precopy/tcp/tls/x509/reject-anon-client", + test_precopy_tcp_tls_x509_reject_anon_client); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ - /* qtest_add_func("/migration/ignore_shared", test_ignore_shared); */ + /* migration_test_add("/migration/ignore_shared", test_ignore_shared); */ #ifndef _WIN32 - qtest_add_func("/migration/fd_proto", test_migrate_fd_proto); + migration_test_add("/migration/fd_proto", test_migrate_fd_proto); #endif - qtest_add_func("/migration/validate_uuid", test_validate_uuid); - qtest_add_func("/migration/validate_uuid_error", test_validate_uuid_error); - qtest_add_func("/migration/validate_uuid_src_not_set", - test_validate_uuid_src_not_set); - qtest_add_func("/migration/validate_uuid_dst_not_set", - test_validate_uuid_dst_not_set); + migration_test_add("/migration/validate_uuid", test_validate_uuid); + migration_test_add("/migration/validate_uuid_error", + test_validate_uuid_error); + migration_test_add("/migration/validate_uuid_src_not_set", + test_validate_uuid_src_not_set); + migration_test_add("/migration/validate_uuid_dst_not_set", + test_validate_uuid_dst_not_set); /* * See explanation why this test is slow on function definition */ if (g_test_slow()) { - qtest_add_func("/migration/auto_converge", test_migrate_auto_converge); + migration_test_add("/migration/auto_converge", + test_migrate_auto_converge); if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { - qtest_add_func("/migration/dirty_limit", test_migrate_dirty_limit); + migration_test_add("/migration/dirty_limit", + test_migrate_dirty_limit); } } - qtest_add_func("/migration/multifd/tcp/plain/none", - test_multifd_tcp_none); + migration_test_add("/migration/multifd/tcp/plain/none", + test_multifd_tcp_none); /* * This test is flaky and sometimes fails in CI and otherwise: * don't run unless user opts in via environment variable. */ if (getenv("QEMU_TEST_FLAKY_TESTS")) { - qtest_add_func("/migration/multifd/tcp/plain/cancel", - test_multifd_tcp_cancel); + migration_test_add("/migration/multifd/tcp/plain/cancel", + test_multifd_tcp_cancel); } - qtest_add_func("/migration/multifd/tcp/plain/zlib", - test_multifd_tcp_zlib); + migration_test_add("/migration/multifd/tcp/plain/zlib", + test_multifd_tcp_zlib); #ifdef CONFIG_ZSTD - qtest_add_func("/migration/multifd/tcp/plain/zstd", - test_multifd_tcp_zstd); + migration_test_add("/migration/multifd/tcp/plain/zstd", + test_multifd_tcp_zstd); #endif #ifdef CONFIG_GNUTLS - qtest_add_func("/migration/multifd/tcp/tls/psk/match", - test_multifd_tcp_tls_psk_match); - qtest_add_func("/migration/multifd/tcp/tls/psk/mismatch", - test_multifd_tcp_tls_psk_mismatch); + migration_test_add("/migration/multifd/tcp/tls/psk/match", + test_multifd_tcp_tls_psk_match); + migration_test_add("/migration/multifd/tcp/tls/psk/mismatch", + test_multifd_tcp_tls_psk_mismatch); #ifdef CONFIG_TASN1 - qtest_add_func("/migration/multifd/tcp/tls/x509/default-host", - test_multifd_tcp_tls_x509_default_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/override-host", - test_multifd_tcp_tls_x509_override_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/mismatch-host", - test_multifd_tcp_tls_x509_mismatch_host); - qtest_add_func("/migration/multifd/tcp/tls/x509/allow-anon-client", - test_multifd_tcp_tls_x509_allow_anon_client); - qtest_add_func("/migration/multifd/tcp/tls/x509/reject-anon-client", - test_multifd_tcp_tls_x509_reject_anon_client); + migration_test_add("/migration/multifd/tcp/tls/x509/default-host", + test_multifd_tcp_tls_x509_default_host); + migration_test_add("/migration/multifd/tcp/tls/x509/override-host", + test_multifd_tcp_tls_x509_override_host); + migration_test_add("/migration/multifd/tcp/tls/x509/mismatch-host", + test_multifd_tcp_tls_x509_mismatch_host); + migration_test_add("/migration/multifd/tcp/tls/x509/allow-anon-client", + test_multifd_tcp_tls_x509_allow_anon_client); + migration_test_add("/migration/multifd/tcp/tls/x509/reject-anon-client", + test_multifd_tcp_tls_x509_reject_anon_client); #endif /* CONFIG_TASN1 */ #endif /* CONFIG_GNUTLS */ if (g_str_equal(arch, "x86_64") && has_kvm && kvm_dirty_ring_supported()) { - qtest_add_func("/migration/dirty_ring", - test_precopy_unix_dirty_ring); - qtest_add_func("/migration/vcpu_dirty_limit", - test_vcpu_dirty_limit); + migration_test_add("/migration/dirty_ring", + test_precopy_unix_dirty_ring); + migration_test_add("/migration/vcpu_dirty_limit", + test_vcpu_dirty_limit); } ret = g_test_run(); -- Gitee From 1886ff71903dd908f4cfeb5df064e291760b960a Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 11 Oct 2023 15:46:04 -0300 Subject: [PATCH 14/90] tests/qtest: Re-enable multifd cancel test commit 75b1f88cd2dd5eeb1fd817a2f3a291c2670f9c50 upstream. We've found the source of flakiness in this test, so re-enable it. Intel-SIG: commit 75b1f88cd2dd tests/qtest: Re-enable multifd cancel test Reviewed-by: Juan Quintela Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20230606144551.24367-4-farosas@suse.de [peterx: rebase to 2a61a6964c, to use migration_test_add()] Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 4884a40be2c..4bdf397828d 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -3475,14 +3475,8 @@ int main(int argc, char **argv) } migration_test_add("/migration/multifd/tcp/plain/none", test_multifd_tcp_none); - /* - * This test is flaky and sometimes fails in CI and otherwise: - * don't run unless user opts in via environment variable. - */ - if (getenv("QEMU_TEST_FLAKY_TESTS")) { - migration_test_add("/migration/multifd/tcp/plain/cancel", - test_multifd_tcp_cancel); - } + migration_test_add("/migration/multifd/tcp/plain/cancel", + test_multifd_tcp_cancel); migration_test_add("/migration/multifd/tcp/plain/zlib", test_multifd_tcp_zlib); #ifdef CONFIG_ZSTD -- Gitee From 62e1c6f3b4ee8c2388e40878d15b0be8febf3c24 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:19 +0800 Subject: [PATCH 15/90] docs/migration: Create migration/ directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8cb2f8b172e74a7279fabb5d5c20aee32b5b98cd upstream. Migration documentation is growing into a single file too large. Create a sub-directory for it for a split. We also already have separate vfio/virtio documentations, move it all over into the directory. Note that the virtio one is still not yet converted to rST. That is a job for later. Intel-SIG: commit 8cb2f8b172e7 docs/migration: Create migration/ directory Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Alex Williamson Cc: Cédric Le Goater Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-2-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/index-internals.rst | 2 +- docs/devel/{migration.rst => migration/main.rst} | 0 docs/devel/{vfio-migration.rst => migration/vfio.rst} | 0 docs/devel/{virtio-migration.txt => migration/virtio.txt} | 0 4 files changed, 1 insertion(+), 1 deletion(-) rename docs/devel/{migration.rst => migration/main.rst} (100%) rename docs/devel/{vfio-migration.rst => migration/vfio.rst} (100%) rename docs/devel/{virtio-migration.txt => migration/virtio.txt} (100%) diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 6f81df92bca..645f0cd7b9c 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -11,7 +11,7 @@ Details about QEMU's various subsystems including how to add features to them. block-coroutine-wrapper clocks ebpf_rss - migration + migration/main multi-process reset s390-cpu-topology diff --git a/docs/devel/migration.rst b/docs/devel/migration/main.rst similarity index 100% rename from docs/devel/migration.rst rename to docs/devel/migration/main.rst diff --git a/docs/devel/vfio-migration.rst b/docs/devel/migration/vfio.rst similarity index 100% rename from docs/devel/vfio-migration.rst rename to docs/devel/migration/vfio.rst diff --git a/docs/devel/virtio-migration.txt b/docs/devel/migration/virtio.txt similarity index 100% rename from docs/devel/virtio-migration.txt rename to docs/devel/migration/virtio.txt -- Gitee From 20c73ce9a664ec64382c92c7064e3bd5e678d0e2 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:20 +0800 Subject: [PATCH 16/90] docs/migration: Create index page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f6bbac985e6df492f2c6be94fb893ada75ffdefa upstream. Create an index page for migration module. Move VFIO migration there too. A trivial touch-up on the title to use lower case there. Since then we'll have "migration" as the top title, make the main doc file renamed to "migration framework". Intel-SIG: commit f6bbac985e6d docs/migration: Create index page Cc: Alex Williamson Cc: Cédric Le Goater Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-3-peterx@redhat.com Signed-off-by: Peter Xu Conflicts: docs/devel/index-internals.rst [jz: resolve simple context conflict] Signed-off-by: Jason Zeng --- docs/devel/index-internals.rst | 3 +-- docs/devel/migration/index.rst | 11 +++++++++++ docs/devel/migration/main.rst | 6 +++--- docs/devel/migration/vfio.rst | 2 +- 4 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 docs/devel/migration/index.rst diff --git a/docs/devel/index-internals.rst b/docs/devel/index-internals.rst index 645f0cd7b9c..4d1502d4b4d 100644 --- a/docs/devel/index-internals.rst +++ b/docs/devel/index-internals.rst @@ -11,12 +11,11 @@ Details about QEMU's various subsystems including how to add features to them. block-coroutine-wrapper clocks ebpf_rss - migration/main + migration/index multi-process reset s390-cpu-topology s390-dasd-ipl tracing - vfio-migration writing-monitor-commands virtio-backends diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst new file mode 100644 index 00000000000..02cfdcc9692 --- /dev/null +++ b/docs/devel/migration/index.rst @@ -0,0 +1,11 @@ +Migration +========= + +This is the main entry for QEMU migration documentations. It explains how +QEMU live migration works. + +.. toctree:: + :maxdepth: 2 + + main + vfio diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index ec55089b253..82cdb420bfd 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -1,6 +1,6 @@ -========= -Migration -========= +=================== +Migration framework +=================== QEMU has code to load/save the state of the guest that it is running. These are two complementary operations. Saving the state just does diff --git a/docs/devel/migration/vfio.rst b/docs/devel/migration/vfio.rst index 605fe60e969..c49482eab66 100644 --- a/docs/devel/migration/vfio.rst +++ b/docs/devel/migration/vfio.rst @@ -1,5 +1,5 @@ ===================== -VFIO device Migration +VFIO device migration ===================== Migration of virtual machine involves saving the state for each device that -- Gitee From c7c31c00ede23a7b56c97a1d1f13488467e86efd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:21 +0800 Subject: [PATCH 17/90] docs/migration: Convert virtio.txt into rST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4d7a691bcfeb5580e3f7457e1f1c2fbd64572161 upstream. Convert the plain old .txt into .rst, add it into migration/index.rst. Intel-SIG: commit 4d7a691bcfeb docs/migration: Convert virtio.txt into rST Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-4-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/index.rst | 1 + .../migration/{virtio.txt => virtio.rst} | 139 +++++++++--------- 2 files changed, 74 insertions(+), 66 deletions(-) rename docs/devel/migration/{virtio.txt => virtio.rst} (37%) diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 02cfdcc9692..2cb701c77ca 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -9,3 +9,4 @@ QEMU live migration works. main vfio + virtio diff --git a/docs/devel/migration/virtio.txt b/docs/devel/migration/virtio.rst similarity index 37% rename from docs/devel/migration/virtio.txt rename to docs/devel/migration/virtio.rst index 98a6b0ffb57..611a18b8215 100644 --- a/docs/devel/migration/virtio.txt +++ b/docs/devel/migration/virtio.rst @@ -1,5 +1,6 @@ -Virtio devices and migration -============================ +======================= +Virtio device migration +======================= Copyright 2015 IBM Corp. @@ -8,91 +9,97 @@ the COPYING file in the top-level directory. Saving and restoring the state of virtio devices is a bit of a twisty maze, for several reasons: + - state is distributed between several parts: + - virtio core, for common fields like features, number of queues, ... + - virtio transport (pci, ccw, ...), for the different proxy devices and transport specific state (msix vectors, indicators, ...) + - virtio device (net, blk, ...), for the different device types and their state (mac address, request queue, ...) + - most fields are saved via the stream interface; subsequently, subsections have been added to make cross-version migration possible This file attempts to document the current procedure and point out some caveats. - Save state procedure ==================== -virtio core virtio transport virtio device ------------ ---------------- ------------- - - save() function registered - via VMState wrapper on - device class -virtio_save() <---------- - ------> save_config() - - save proxy device - - save transport-specific - device fields -- save common device - fields -- save common virtqueue - fields - ------> save_queue() - - save transport-specific - virtqueue fields - ------> save_device() - - save device-specific - fields -- save subsections - - device endianness, - if changed from - default endianness - - 64 bit features, if - any high feature bit - is set - - virtio-1 virtqueue - fields, if VERSION_1 - is set +:: + virtio core virtio transport virtio device + ----------- ---------------- ------------- + + save() function registered + via VMState wrapper on + device class + virtio_save() <---------- + ------> save_config() + - save proxy device + - save transport-specific + device fields + - save common device + fields + - save common virtqueue + fields + ------> save_queue() + - save transport-specific + virtqueue fields + ------> save_device() + - save device-specific + fields + - save subsections + - device endianness, + if changed from + default endianness + - 64 bit features, if + any high feature bit + is set + - virtio-1 virtqueue + fields, if VERSION_1 + is set Load state procedure ==================== -virtio core virtio transport virtio device ------------ ---------------- ------------- - - load() function registered - via VMState wrapper on - device class -virtio_load() <---------- - ------> load_config() - - load proxy device - - load transport-specific - device fields -- load common device - fields -- load common virtqueue - fields - ------> load_queue() - - load transport-specific - virtqueue fields -- notify guest - ------> load_device() - - load device-specific - fields -- load subsections - - device endianness - - 64 bit features - - virtio-1 virtqueue - fields -- sanitize endianness -- sanitize features -- virtqueue index sanity - check - - feature-dependent setup +:: + virtio core virtio transport virtio device + ----------- ---------------- ------------- + + load() function registered + via VMState wrapper on + device class + virtio_load() <---------- + ------> load_config() + - load proxy device + - load transport-specific + device fields + - load common device + fields + - load common virtqueue + fields + ------> load_queue() + - load transport-specific + virtqueue fields + - notify guest + ------> load_device() + - load device-specific + fields + - load subsections + - device endianness + - 64 bit features + - virtio-1 virtqueue + fields + - sanitize endianness + - sanitize features + - virtqueue index sanity + check + - feature-dependent setup Implications of this setup ========================== -- Gitee From e07208965e0c0bbd28dba2720f0111129f9d3212 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:22 +0800 Subject: [PATCH 18/90] docs/migration: Split "Backwards compatibility" separately MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6cc6a7b98b88f1a7d1d5ed99db0d373a46606aac upstream. Split the section from main.rst into a separate file. Reference it in the index.rst. Intel-SIG: commit 6cc6a7b98b88 docs/migration: Split "Backwards compatibility" separately Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-5-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/compatibility.rst | 517 ++++++++++++++++++++++++ docs/devel/migration/index.rst | 1 + docs/devel/migration/main.rst | 519 ------------------------- 3 files changed, 518 insertions(+), 519 deletions(-) create mode 100644 docs/devel/migration/compatibility.rst diff --git a/docs/devel/migration/compatibility.rst b/docs/devel/migration/compatibility.rst new file mode 100644 index 00000000000..5a5417ef069 --- /dev/null +++ b/docs/devel/migration/compatibility.rst @@ -0,0 +1,517 @@ +Backwards compatibility +======================= + +How backwards compatibility works +--------------------------------- + +When we do migration, we have two QEMU processes: the source and the +target. There are two cases, they are the same version or they are +different versions. The easy case is when they are the same version. +The difficult one is when they are different versions. + +There are two things that are different, but they have very similar +names and sometimes get confused: + +- QEMU version +- machine type version + +Let's start with a practical example, we start with: + +- qemu-system-x86_64 (v5.2), from now on qemu-5.2. +- qemu-system-x86_64 (v5.1), from now on qemu-5.1. + +Related to this are the "latest" machine types defined on each of +them: + +- pc-q35-5.2 (newer one in qemu-5.2) from now on pc-5.2 +- pc-q35-5.1 (newer one in qemu-5.1) from now on pc-5.1 + +First of all, migration is only supposed to work if you use the same +machine type in both source and destination. The QEMU hardware +configuration needs to be the same also on source and destination. +Most aspects of the backend configuration can be changed at will, +except for a few cases where the backend features influence frontend +device feature exposure. But that is not relevant for this section. + +I am going to list the number of combinations that we can have. Let's +start with the trivial ones, QEMU is the same on source and +destination: + +1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 + + This is the latest QEMU with the latest machine type. + This have to work, and if it doesn't work it is a bug. + +2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + Exactly the same case than the previous one, but for 5.1. + Nothing to see here either. + +This are the easiest ones, we will not talk more about them in this +section. + +Now we start with the more interesting cases. Consider the case where +we have the same QEMU version in both sides (qemu-5.2) but we are using +the latest machine type for that version (pc-5.2) but one of an older +QEMU version, in this case pc-5.1. + +3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + It needs to use the definition of pc-5.1 and the devices as they + were configured on 5.1, but this should be easy in the sense that + both sides are the same QEMU and both sides have exactly the same + idea of what the pc-5.1 machine is. + +4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 + + This combination is not possible as the qemu-5.1 doesn't understand + pc-5.2 machine type. So nothing to worry here. + +Now it comes the interesting ones, when both QEMU processes are +different. Notice also that the machine type needs to be pc-5.1, +because we have the limitation than qemu-5.1 doesn't know pc-5.2. So +the possible cases are: + +5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + This migration is known as newer to older. We need to make sure + when we are developing 5.2 we need to take care about not to break + migration to qemu-5.1. Notice that we can't make updates to + qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is + in qemu-5.2 side to make the relevant changes. + +6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + This migration is known as older to newer. We need to make sure + than we are able to receive migrations from qemu-5.1. The problem is + similar to the previous one. + +If qemu-5.1 and qemu-5.2 were the same, there will not be any +compatibility problems. But the reason that we create qemu-5.2 is to +get new features, devices, defaults, etc. + +If we get a device that has a new feature, or change a default value, +we have a problem when we try to migrate between different QEMU +versions. + +So we need a way to tell qemu-5.2 that when we are using machine type +pc-5.1, it needs to **not** use the feature, to be able to migrate to +real qemu-5.1. + +And the equivalent part when migrating from qemu-5.1 to qemu-5.2. +qemu-5.2 has to expect that it is not going to get data for the new +feature, because qemu-5.1 doesn't know about it. + +How do we tell QEMU about these device feature changes? In +hw/core/machine.c:hw_compat_X_Y arrays. + +If we change a default value, we need to put back the old value on +that array. And the device, during initialization needs to look at +that array to see what value it needs to get for that feature. And +what are we going to put in that array, the value of a property. + +To create a property for a device, we need to use one of the +DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the +macros that exist. With it, we set the default value for that +property, and that is what it is going to get in the latest released +version. But if we want a different value for a previous version, we +can change that in the hw_compat_X_Y arrays. + +hw_compat_X_Y is an array of registers that have the format: + +- name_device +- name_property +- value + +Let's see a practical example. + +In qemu-5.2 virtio-blk-device got multi queue support. This is a +change that is not backward compatible. In qemu-5.1 it has one +queue. In qemu-5.2 it has the same number of queues as the number of +cpus in the system. + +When we are doing migration, if we migrate from a device that has 4 +queues to a device that have only one queue, we don't know where to +put the extra information for the other 3 queues, and we fail +migration. + +Similar problem when we migrate from qemu-5.1 that has only one queue +to qemu-5.2, we only sent information for one queue, but destination +has 4, and we have 3 queues that are not properly initialized and +anything can happen. + +So, how can we address this problem. Easy, just convince qemu-5.2 +that when it is running pc-5.1, it needs to set the number of queues +for virtio-blk-devices to 1. + +That way we fix the cases 5 and 6. + +5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 + + qemu-5.2 -M pc-5.1 sets number of queues to be 1. + qemu-5.1 -M pc-5.1 expects number of queues to be 1. + + correct. migration works. + +6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + qemu-5.1 -M pc-5.1 sets number of queues to be 1. + qemu-5.2 -M pc-5.1 expects number of queues to be 1. + + correct. migration works. + +And now the other interesting case, case 3. In this case we have: + +3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 + + Here we have the same QEMU in both sides. So it doesn't matter a + lot if we have set the number of queues to 1 or not, because + they are the same. + + WRONG! + + Think what happens if we do one of this double migrations: + + A -> migrates -> B -> migrates -> C + + where: + + A: qemu-5.1 -M pc-5.1 + B: qemu-5.2 -M pc-5.1 + C: qemu-5.2 -M pc-5.1 + + migration A -> B is case 6, so number of queues needs to be 1. + + migration B -> C is case 3, so we don't care. But actually we + care because we haven't started the guest in qemu-5.2, it came + migrated from qemu-5.1. So to be in the safe place, we need to + always use number of queues 1 when we are using pc-5.1. + +Now, how was this done in reality? The following commit shows how it +was done:: + + commit 9445e1e15e66c19e42bea942ba810db28052cd05 + Author: Stefan Hajnoczi + Date: Tue Aug 18 15:33:47 2020 +0100 + + virtio-blk-pci: default num_queues to -smp N + +The relevant parts for migration are:: + + @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = { + #endif + DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, + true), + - DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), + + DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, + + VIRTIO_BLK_AUTO_NUM_QUEUES), + DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), + +It changes the default value of num_queues. But it fishes it for old +machine types to have the right value:: + + @@ -31,6 +31,7 @@ + GlobalProperty hw_compat_5_1[] = { + ... + + { "virtio-blk-device", "num-queues", "1"}, + ... + }; + +A device with different features on both sides +---------------------------------------------- + +Let's assume that we are using the same QEMU binary on both sides, +just to make the things easier. But we have a device that has +different features on both sides of the migration. That can be +because the devices are different, because the kernel driver of both +devices have different features, whatever. + +How can we get this to work with migration. The way to do that is +"theoretically" easy. You have to get the features that the device +has in the source of the migration. The features that the device has +on the target of the migration, you get the intersection of the +features of both sides, and that is the way that you should launch +QEMU. + +Notice that this is not completely related to QEMU. The most +important thing here is that this should be handled by the managing +application that launches QEMU. If QEMU is configured correctly, the +migration will succeed. + +That said, actually doing it is complicated. Almost all devices are +bad at being able to be launched with only some features enabled. +With one big exception: cpus. + +You can read the documentation for QEMU x86 cpu models here: + +https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html + +See when they talk about migration they recommend that one chooses the +newest cpu model that is supported for all cpus. + +Let's say that we have: + +Host A: + +Device X has the feature Y + +Host B: + +Device X has not the feature Y + +If we try to migrate without any care from host A to host B, it will +fail because when migration tries to load the feature Y on +destination, it will find that the hardware is not there. + +Doing this would be the equivalent of doing with cpus: + +Host A: + +$ qemu-system-x86_64 -cpu host + +Host B: + +$ qemu-system-x86_64 -cpu host + +When both hosts have different cpu features this is guaranteed to +fail. Especially if Host B has less features than host A. If host A +has less features than host B, sometimes it works. Important word of +last sentence is "sometimes". + +So, forgetting about cpu models and continuing with the -cpu host +example, let's see that the differences of the cpus is that Host A and +B have the following features: + +Features: 'pcid' 'stibp' 'taa-no' +Host A: X X +Host B: X + +And we want to migrate between them, the way configure both QEMU cpu +will be: + +Host A: + +$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off + +Host B: + +$ qemu-system-x86_64 -cpu host,taa-no=off + +And you would be able to migrate between them. It is responsibility +of the management application or of the user to make sure that the +configuration is correct. QEMU doesn't know how to look at this kind +of features in general. + +Notice that we don't recommend to use -cpu host for migration. It is +used in this example because it makes the example simpler. + +Other devices have worse control about individual features. If they +want to be able to migrate between hosts that show different features, +the device needs a way to configure which ones it is going to use. + +In this section we have considered that we are using the same QEMU +binary in both sides of the migration. If we use different QEMU +versions process, then we need to have into account all other +differences and the examples become even more complicated. + +How to mitigate when we have a backward compatibility error +----------------------------------------------------------- + +We broke migration for old machine types continuously during +development. But as soon as we find that there is a problem, we fix +it. The problem is what happens when we detect after we have done a +release that something has gone wrong. + +Let see how it worked with one example. + +After the release of qemu-8.0 we found a problem when doing migration +of the machine type pc-7.2. + +- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 + + This migration works + +- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 + + This migration works + +- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 + + This migration fails + +- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 + + This migration fails + +So clearly something fails when migration between qemu-7.2 and +qemu-8.0 with machine type pc-7.2. The error messages, and git bisect +pointed to this commit. + +In qemu-8.0 we got this commit:: + + commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2 + Author: Jonathan Cameron + Date: Thu Mar 2 13:37:02 2023 +0000 + + hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register + + +The relevant bits of the commit for our example are this ones:: + + --- a/hw/pci/pcie_aer.c + +++ b/hw/pci/pcie_aer.c + @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, + + pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, + PCI_ERR_UNC_SUPPORTED); + + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, + + PCI_ERR_UNC_MASK_DEFAULT); + + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, + + PCI_ERR_UNC_SUPPORTED); + + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, + PCI_ERR_UNC_SEVERITY_DEFAULT); + +The patch changes how we configure PCI space for AER. But QEMU fails +when the PCI space configuration is different between source and +destination. + +The following commit shows how this got fixed:: + + commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f + Author: Leonardo Bras + Date: Tue May 2 21:27:02 2023 -0300 + + hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0 + + [...] + +The relevant parts of the fix in QEMU are as follow: + +First, we create a new property for the device to be able to configure +the old behaviour or the new behaviour:: + + diff --git a/hw/pci/pci.c b/hw/pci/pci.c + index 8a87ccc8b0..5153ad63d6 100644 + --- a/hw/pci/pci.c + +++ b/hw/pci/pci.c + @@ -79,6 +79,8 @@ static Property pci_props[] = { + DEFINE_PROP_STRING("failover_pair_id", PCIDevice, + failover_pair_id), + DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0), + + DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present, + + QEMU_PCIE_ERR_UNC_MASK_BITNR, true), + DEFINE_PROP_END_OF_LIST() + }; + +Notice that we enable the feature for new machine types. + +Now we see how the fix is done. This is going to depend on what kind +of breakage happens, but in this case it is quite simple:: + + diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c + index 103667c368..374d593ead 100644 + --- a/hw/pci/pcie_aer.c + +++ b/hw/pci/pcie_aer.c + @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, + uint16_t offset, + + pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, + PCI_ERR_UNC_SUPPORTED); + - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, + - PCI_ERR_UNC_MASK_DEFAULT); + - pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, + - PCI_ERR_UNC_SUPPORTED); + + + + if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) { + + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, + + PCI_ERR_UNC_MASK_DEFAULT); + + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, + + PCI_ERR_UNC_SUPPORTED); + + } + + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, + PCI_ERR_UNC_SEVERITY_DEFAULT); + +I.e. If the property bit is enabled, we configure it as we did for +qemu-8.0. If the property bit is not set, we configure it as it was in 7.2. + +And now, everything that is missing is disabling the feature for old +machine types:: + + diff --git a/hw/core/machine.c b/hw/core/machine.c + index 47a34841a5..07f763eb2e 100644 + --- a/hw/core/machine.c + +++ b/hw/core/machine.c + @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = { + { "e1000e", "migrate-timadj", "off" }, + { "virtio-mem", "x-early-migration", "false" }, + { "migration", "x-preempt-pre-7-2", "true" }, + + { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" }, + }; + const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); + +And now, when qemu-8.0.1 is released with this fix, all combinations +are going to work as supposed. + +- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) +- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) +- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) +- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) + +So the normality has been restored and everything is ok, no? + +Not really, now our matrix is much bigger. We started with the easy +cases, migration from the same version to the same version always +works: + +- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 +- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 +- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 + +Now the interesting ones. When the QEMU processes versions are +different. For the 1st set, their fail and we can do nothing, both +versions are released and we can't change anything. + +- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 +- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 + +This two are the ones that work. The whole point of making the +change in qemu-8.0.1 release was to fix this issue: + +- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 +- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 + +But now we found that qemu-8.0 neither can migrate to qemu-7.2 not +qemu-8.0.1. + +- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 +- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0 -M pc-7.2 + +So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to +anything except to qemu-8.0. + +Can we do better? + +Yeap. If we know that we are going to do this migration: + +- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 + +We can launch the appropriate devices with:: + + --device...,x-pci-e-err-unc-mask=on + +And now we can receive a migration from 8.0. And from now on, we can +do that migration to new machine types if we remember to enable that +property for pc-7.2. Notice that we need to remember, it is not +enough to know that the source of the migration is qemu-8.0. Think of +this example: + +$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2 + +In the second migration, the source is not qemu-8.0, but we still have +that "problem" and have that property enabled. Notice that we need to +continue having this mark/property until we have this machine +rebooted. But it is not a normal reboot (that don't reload QEMU) we +need the machine to poweroff/poweron on a fixed QEMU. And from now +on we can use the proper real machine. diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 2cb701c77ca..7fc02b95206 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -8,5 +8,6 @@ QEMU live migration works. :maxdepth: 2 main + compatibility vfio virtio diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 82cdb420bfd..04194414af4 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -993,522 +993,3 @@ In some cases it may be best to tie specific firmware versions to specific versioned machine types to cut down on the combinations that will need support. This is also useful when newer versions of firmware outgrow the padding. - - -Backwards compatibility -======================= - -How backwards compatibility works ---------------------------------- - -When we do migration, we have two QEMU processes: the source and the -target. There are two cases, they are the same version or they are -different versions. The easy case is when they are the same version. -The difficult one is when they are different versions. - -There are two things that are different, but they have very similar -names and sometimes get confused: - -- QEMU version -- machine type version - -Let's start with a practical example, we start with: - -- qemu-system-x86_64 (v5.2), from now on qemu-5.2. -- qemu-system-x86_64 (v5.1), from now on qemu-5.1. - -Related to this are the "latest" machine types defined on each of -them: - -- pc-q35-5.2 (newer one in qemu-5.2) from now on pc-5.2 -- pc-q35-5.1 (newer one in qemu-5.1) from now on pc-5.1 - -First of all, migration is only supposed to work if you use the same -machine type in both source and destination. The QEMU hardware -configuration needs to be the same also on source and destination. -Most aspects of the backend configuration can be changed at will, -except for a few cases where the backend features influence frontend -device feature exposure. But that is not relevant for this section. - -I am going to list the number of combinations that we can have. Let's -start with the trivial ones, QEMU is the same on source and -destination: - -1 - qemu-5.2 -M pc-5.2 -> migrates to -> qemu-5.2 -M pc-5.2 - - This is the latest QEMU with the latest machine type. - This have to work, and if it doesn't work it is a bug. - -2 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - Exactly the same case than the previous one, but for 5.1. - Nothing to see here either. - -This are the easiest ones, we will not talk more about them in this -section. - -Now we start with the more interesting cases. Consider the case where -we have the same QEMU version in both sides (qemu-5.2) but we are using -the latest machine type for that version (pc-5.2) but one of an older -QEMU version, in this case pc-5.1. - -3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - It needs to use the definition of pc-5.1 and the devices as they - were configured on 5.1, but this should be easy in the sense that - both sides are the same QEMU and both sides have exactly the same - idea of what the pc-5.1 machine is. - -4 - qemu-5.1 -M pc-5.2 -> migrates to -> qemu-5.1 -M pc-5.2 - - This combination is not possible as the qemu-5.1 doesn't understand - pc-5.2 machine type. So nothing to worry here. - -Now it comes the interesting ones, when both QEMU processes are -different. Notice also that the machine type needs to be pc-5.1, -because we have the limitation than qemu-5.1 doesn't know pc-5.2. So -the possible cases are: - -5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - This migration is known as newer to older. We need to make sure - when we are developing 5.2 we need to take care about not to break - migration to qemu-5.1. Notice that we can't make updates to - qemu-5.1 to understand whatever qemu-5.2 decides to change, so it is - in qemu-5.2 side to make the relevant changes. - -6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - This migration is known as older to newer. We need to make sure - than we are able to receive migrations from qemu-5.1. The problem is - similar to the previous one. - -If qemu-5.1 and qemu-5.2 were the same, there will not be any -compatibility problems. But the reason that we create qemu-5.2 is to -get new features, devices, defaults, etc. - -If we get a device that has a new feature, or change a default value, -we have a problem when we try to migrate between different QEMU -versions. - -So we need a way to tell qemu-5.2 that when we are using machine type -pc-5.1, it needs to **not** use the feature, to be able to migrate to -real qemu-5.1. - -And the equivalent part when migrating from qemu-5.1 to qemu-5.2. -qemu-5.2 has to expect that it is not going to get data for the new -feature, because qemu-5.1 doesn't know about it. - -How do we tell QEMU about these device feature changes? In -hw/core/machine.c:hw_compat_X_Y arrays. - -If we change a default value, we need to put back the old value on -that array. And the device, during initialization needs to look at -that array to see what value it needs to get for that feature. And -what are we going to put in that array, the value of a property. - -To create a property for a device, we need to use one of the -DEFINE_PROP_*() macros. See include/hw/qdev-properties.h to find the -macros that exist. With it, we set the default value for that -property, and that is what it is going to get in the latest released -version. But if we want a different value for a previous version, we -can change that in the hw_compat_X_Y arrays. - -hw_compat_X_Y is an array of registers that have the format: - -- name_device -- name_property -- value - -Let's see a practical example. - -In qemu-5.2 virtio-blk-device got multi queue support. This is a -change that is not backward compatible. In qemu-5.1 it has one -queue. In qemu-5.2 it has the same number of queues as the number of -cpus in the system. - -When we are doing migration, if we migrate from a device that has 4 -queues to a device that have only one queue, we don't know where to -put the extra information for the other 3 queues, and we fail -migration. - -Similar problem when we migrate from qemu-5.1 that has only one queue -to qemu-5.2, we only sent information for one queue, but destination -has 4, and we have 3 queues that are not properly initialized and -anything can happen. - -So, how can we address this problem. Easy, just convince qemu-5.2 -that when it is running pc-5.1, it needs to set the number of queues -for virtio-blk-devices to 1. - -That way we fix the cases 5 and 6. - -5 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.1 -M pc-5.1 - - qemu-5.2 -M pc-5.1 sets number of queues to be 1. - qemu-5.1 -M pc-5.1 expects number of queues to be 1. - - correct. migration works. - -6 - qemu-5.1 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - qemu-5.1 -M pc-5.1 sets number of queues to be 1. - qemu-5.2 -M pc-5.1 expects number of queues to be 1. - - correct. migration works. - -And now the other interesting case, case 3. In this case we have: - -3 - qemu-5.2 -M pc-5.1 -> migrates to -> qemu-5.2 -M pc-5.1 - - Here we have the same QEMU in both sides. So it doesn't matter a - lot if we have set the number of queues to 1 or not, because - they are the same. - - WRONG! - - Think what happens if we do one of this double migrations: - - A -> migrates -> B -> migrates -> C - - where: - - A: qemu-5.1 -M pc-5.1 - B: qemu-5.2 -M pc-5.1 - C: qemu-5.2 -M pc-5.1 - - migration A -> B is case 6, so number of queues needs to be 1. - - migration B -> C is case 3, so we don't care. But actually we - care because we haven't started the guest in qemu-5.2, it came - migrated from qemu-5.1. So to be in the safe place, we need to - always use number of queues 1 when we are using pc-5.1. - -Now, how was this done in reality? The following commit shows how it -was done:: - - commit 9445e1e15e66c19e42bea942ba810db28052cd05 - Author: Stefan Hajnoczi - Date: Tue Aug 18 15:33:47 2020 +0100 - - virtio-blk-pci: default num_queues to -smp N - -The relevant parts for migration are:: - - @@ -1281,7 +1284,8 @@ static Property virtio_blk_properties[] = { - #endif - DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0, - true), - - DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1), - + DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, - + VIRTIO_BLK_AUTO_NUM_QUEUES), - DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256), - -It changes the default value of num_queues. But it fishes it for old -machine types to have the right value:: - - @@ -31,6 +31,7 @@ - GlobalProperty hw_compat_5_1[] = { - ... - + { "virtio-blk-device", "num-queues", "1"}, - ... - }; - -A device with different features on both sides ----------------------------------------------- - -Let's assume that we are using the same QEMU binary on both sides, -just to make the things easier. But we have a device that has -different features on both sides of the migration. That can be -because the devices are different, because the kernel driver of both -devices have different features, whatever. - -How can we get this to work with migration. The way to do that is -"theoretically" easy. You have to get the features that the device -has in the source of the migration. The features that the device has -on the target of the migration, you get the intersection of the -features of both sides, and that is the way that you should launch -QEMU. - -Notice that this is not completely related to QEMU. The most -important thing here is that this should be handled by the managing -application that launches QEMU. If QEMU is configured correctly, the -migration will succeed. - -That said, actually doing it is complicated. Almost all devices are -bad at being able to be launched with only some features enabled. -With one big exception: cpus. - -You can read the documentation for QEMU x86 cpu models here: - -https://qemu-project.gitlab.io/qemu/system/qemu-cpu-models.html - -See when they talk about migration they recommend that one chooses the -newest cpu model that is supported for all cpus. - -Let's say that we have: - -Host A: - -Device X has the feature Y - -Host B: - -Device X has not the feature Y - -If we try to migrate without any care from host A to host B, it will -fail because when migration tries to load the feature Y on -destination, it will find that the hardware is not there. - -Doing this would be the equivalent of doing with cpus: - -Host A: - -$ qemu-system-x86_64 -cpu host - -Host B: - -$ qemu-system-x86_64 -cpu host - -When both hosts have different cpu features this is guaranteed to -fail. Especially if Host B has less features than host A. If host A -has less features than host B, sometimes it works. Important word of -last sentence is "sometimes". - -So, forgetting about cpu models and continuing with the -cpu host -example, let's see that the differences of the cpus is that Host A and -B have the following features: - -Features: 'pcid' 'stibp' 'taa-no' -Host A: X X -Host B: X - -And we want to migrate between them, the way configure both QEMU cpu -will be: - -Host A: - -$ qemu-system-x86_64 -cpu host,pcid=off,stibp=off - -Host B: - -$ qemu-system-x86_64 -cpu host,taa-no=off - -And you would be able to migrate between them. It is responsibility -of the management application or of the user to make sure that the -configuration is correct. QEMU doesn't know how to look at this kind -of features in general. - -Notice that we don't recommend to use -cpu host for migration. It is -used in this example because it makes the example simpler. - -Other devices have worse control about individual features. If they -want to be able to migrate between hosts that show different features, -the device needs a way to configure which ones it is going to use. - -In this section we have considered that we are using the same QEMU -binary in both sides of the migration. If we use different QEMU -versions process, then we need to have into account all other -differences and the examples become even more complicated. - -How to mitigate when we have a backward compatibility error ------------------------------------------------------------ - -We broke migration for old machine types continuously during -development. But as soon as we find that there is a problem, we fix -it. The problem is what happens when we detect after we have done a -release that something has gone wrong. - -Let see how it worked with one example. - -After the release of qemu-8.0 we found a problem when doing migration -of the machine type pc-7.2. - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - - This migration works - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - - This migration works - -- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - - This migration fails - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - - This migration fails - -So clearly something fails when migration between qemu-7.2 and -qemu-8.0 with machine type pc-7.2. The error messages, and git bisect -pointed to this commit. - -In qemu-8.0 we got this commit:: - - commit 010746ae1db7f52700cb2e2c46eb94f299cfa0d2 - Author: Jonathan Cameron - Date: Thu Mar 2 13:37:02 2023 +0000 - - hw/pci/aer: Implement PCI_ERR_UNCOR_MASK register - - -The relevant bits of the commit for our example are this ones:: - - --- a/hw/pci/pcie_aer.c - +++ b/hw/pci/pcie_aer.c - @@ -112,6 +112,10 @@ int pcie_aer_init(PCIDevice *dev, - - pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, - PCI_ERR_UNC_SUPPORTED); - + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_MASK_DEFAULT); - + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_SUPPORTED); - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, - PCI_ERR_UNC_SEVERITY_DEFAULT); - -The patch changes how we configure PCI space for AER. But QEMU fails -when the PCI space configuration is different between source and -destination. - -The following commit shows how this got fixed:: - - commit 5ed3dabe57dd9f4c007404345e5f5bf0e347317f - Author: Leonardo Bras - Date: Tue May 2 21:27:02 2023 -0300 - - hw/pci: Disable PCI_ERR_UNCOR_MASK register for machine type < 8.0 - - [...] - -The relevant parts of the fix in QEMU are as follow: - -First, we create a new property for the device to be able to configure -the old behaviour or the new behaviour:: - - diff --git a/hw/pci/pci.c b/hw/pci/pci.c - index 8a87ccc8b0..5153ad63d6 100644 - --- a/hw/pci/pci.c - +++ b/hw/pci/pci.c - @@ -79,6 +79,8 @@ static Property pci_props[] = { - DEFINE_PROP_STRING("failover_pair_id", PCIDevice, - failover_pair_id), - DEFINE_PROP_UINT32("acpi-index", PCIDevice, acpi_index, 0), - + DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present, - + QEMU_PCIE_ERR_UNC_MASK_BITNR, true), - DEFINE_PROP_END_OF_LIST() - }; - -Notice that we enable the feature for new machine types. - -Now we see how the fix is done. This is going to depend on what kind -of breakage happens, but in this case it is quite simple:: - - diff --git a/hw/pci/pcie_aer.c b/hw/pci/pcie_aer.c - index 103667c368..374d593ead 100644 - --- a/hw/pci/pcie_aer.c - +++ b/hw/pci/pcie_aer.c - @@ -112,10 +112,13 @@ int pcie_aer_init(PCIDevice *dev, uint8_t cap_ver, - uint16_t offset, - - pci_set_long(dev->w1cmask + offset + PCI_ERR_UNCOR_STATUS, - PCI_ERR_UNC_SUPPORTED); - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - - PCI_ERR_UNC_MASK_DEFAULT); - - pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - - PCI_ERR_UNC_SUPPORTED); - + - + if (dev->cap_present & QEMU_PCIE_ERR_UNC_MASK) { - + pci_set_long(dev->config + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_MASK_DEFAULT); - + pci_set_long(dev->wmask + offset + PCI_ERR_UNCOR_MASK, - + PCI_ERR_UNC_SUPPORTED); - + } - - pci_set_long(dev->config + offset + PCI_ERR_UNCOR_SEVER, - PCI_ERR_UNC_SEVERITY_DEFAULT); - -I.e. If the property bit is enabled, we configure it as we did for -qemu-8.0. If the property bit is not set, we configure it as it was in 7.2. - -And now, everything that is missing is disabling the feature for old -machine types:: - - diff --git a/hw/core/machine.c b/hw/core/machine.c - index 47a34841a5..07f763eb2e 100644 - --- a/hw/core/machine.c - +++ b/hw/core/machine.c - @@ -48,6 +48,7 @@ GlobalProperty hw_compat_7_2[] = { - { "e1000e", "migrate-timadj", "off" }, - { "virtio-mem", "x-early-migration", "false" }, - { "migration", "x-preempt-pre-7-2", "true" }, - + { TYPE_PCI_DEVICE, "x-pcie-err-unc-mask", "off" }, - }; - const size_t hw_compat_7_2_len = G_N_ELEMENTS(hw_compat_7_2); - -And now, when qemu-8.0.1 is released with this fix, all combinations -are going to work as supposed. - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) -- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 (works) -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 (works) - -So the normality has been restored and everything is ok, no? - -Not really, now our matrix is much bigger. We started with the easy -cases, migration from the same version to the same version always -works: - -- $ qemu-7.2 -M pc-7.2 -> qemu-7.2 -M pc-7.2 -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 - -Now the interesting ones. When the QEMU processes versions are -different. For the 1st set, their fail and we can do nothing, both -versions are released and we can't change anything. - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0 -M pc-7.2 -- $ qemu-8.0 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - -This two are the ones that work. The whole point of making the -change in qemu-8.0.1 release was to fix this issue: - -- $ qemu-7.2 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-7.2 -M pc-7.2 - -But now we found that qemu-8.0 neither can migrate to qemu-7.2 not -qemu-8.0.1. - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -- $ qemu-8.0.1 -M pc-7.2 -> qemu-8.0 -M pc-7.2 - -So, if we start a pc-7.2 machine in qemu-8.0 we can't migrate it to -anything except to qemu-8.0. - -Can we do better? - -Yeap. If we know that we are going to do this migration: - -- $ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 - -We can launch the appropriate devices with:: - - --device...,x-pci-e-err-unc-mask=on - -And now we can receive a migration from 8.0. And from now on, we can -do that migration to new machine types if we remember to enable that -property for pc-7.2. Notice that we need to remember, it is not -enough to know that the source of the migration is qemu-8.0. Think of -this example: - -$ qemu-8.0 -M pc-7.2 -> qemu-8.0.1 -M pc-7.2 -> qemu-8.2 -M pc-7.2 - -In the second migration, the source is not qemu-8.0, but we still have -that "problem" and have that property enabled. Notice that we need to -continue having this mark/property until we have this machine -rebooted. But it is not a normal reboot (that don't reload QEMU) we -need the machine to poweroff/poweron on a fixed QEMU. And from now -on we can use the proper real machine. -- Gitee From fd730b00791f09462bca5562842952ddb1ef9a18 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:23 +0800 Subject: [PATCH 19/90] docs/migration: Split "Debugging" and "Firmware" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 774ad6b53b9449223115ffa8851eb93de92b0ce7 upstream. Move the two sections into a separate file called "best-practices.rst". Add the entry into index. Intel-SIG: commit 774ad6b53b94 docs/migration: Split "Debugging" and "Firmware" Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-6-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/best-practices.rst | 48 +++++++++++++++++++++++++ docs/devel/migration/index.rst | 1 + docs/devel/migration/main.rst | 44 ----------------------- 3 files changed, 49 insertions(+), 44 deletions(-) create mode 100644 docs/devel/migration/best-practices.rst diff --git a/docs/devel/migration/best-practices.rst b/docs/devel/migration/best-practices.rst new file mode 100644 index 00000000000..d7c34a30149 --- /dev/null +++ b/docs/devel/migration/best-practices.rst @@ -0,0 +1,48 @@ +============== +Best practices +============== + +Debugging +========= + +The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``. + +Example usage: + +.. code-block:: shell + + $ qemu-system-x86_64 -display none -monitor stdio + (qemu) migrate "exec:cat > mig" + (qemu) q + $ ./scripts/analyze-migration.py -f mig + { + "ram (3)": { + "section sizes": { + "pc.ram": "0x0000000008000000", + ... + +See also ``analyze-migration.py -h`` help for more options. + +Firmware +======== + +Migration migrates the copies of RAM and ROM, and thus when running +on the destination it includes the firmware from the source. Even after +resetting a VM, the old firmware is used. Only once QEMU has been restarted +is the new firmware in use. + +- Changes in firmware size can cause changes in the required RAMBlock size + to hold the firmware and thus migration can fail. In practice it's best + to pad firmware images to convenient powers of 2 with plenty of space + for growth. + +- Care should be taken with device emulation code so that newer + emulation code can work with older firmware to allow forward migration. + +- Care should be taken with newer firmware so that backward migration + to older systems with older device emulation code will work. + +In some cases it may be best to tie specific firmware versions to specific +versioned machine types to cut down on the combinations that will need +support. This is also useful when newer versions of firmware outgrow +the padding. diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 7fc02b95206..9a8fd1ead75 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -11,3 +11,4 @@ QEMU live migration works. compatibility vfio virtio + best-practices diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 04194414af4..7ca3b4dd3fd 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -52,27 +52,6 @@ All these migration protocols use the same infrastructure to save/restore state devices. This infrastructure is shared with the savevm/loadvm functionality. -Debugging -========= - -The migration stream can be analyzed thanks to ``scripts/analyze-migration.py``. - -Example usage: - -.. code-block:: shell - - $ qemu-system-x86_64 -display none -monitor stdio - (qemu) migrate "exec:cat > mig" - (qemu) q - $ ./scripts/analyze-migration.py -f mig - { - "ram (3)": { - "section sizes": { - "pc.ram": "0x0000000008000000", - ... - -See also ``analyze-migration.py -h`` help for more options. - Common infrastructure ===================== @@ -970,26 +949,3 @@ the background migration channel. Anyone who cares about latencies of page faults during a postcopy migration should enable this feature. By default, it's not enabled. -Firmware -======== - -Migration migrates the copies of RAM and ROM, and thus when running -on the destination it includes the firmware from the source. Even after -resetting a VM, the old firmware is used. Only once QEMU has been restarted -is the new firmware in use. - -- Changes in firmware size can cause changes in the required RAMBlock size - to hold the firmware and thus migration can fail. In practice it's best - to pad firmware images to convenient powers of 2 with plenty of space - for growth. - -- Care should be taken with device emulation code so that newer - emulation code can work with older firmware to allow forward migration. - -- Care should be taken with newer firmware so that backward migration - to older systems with older device emulation code will work. - -In some cases it may be best to tie specific firmware versions to specific -versioned machine types to cut down on the combinations that will need -support. This is also useful when newer versions of firmware outgrow -the padding. -- Gitee From e061fd4fea68d99ec6b84cab4e13c1f5d41b6cea Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:24 +0800 Subject: [PATCH 20/90] docs/migration: Split "Postcopy" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bfb4c7cd99f1c39dedf33381954d03b9f8f244ec upstream. Split postcopy into a separate file. Introduce a head page "features.rst" to keep all the features on top of migration framework. Intel-SIG: commit bfb4c7cd99f1 docs/migration: Split "Postcopy" Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-7-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 9 + docs/devel/migration/index.rst | 1 + docs/devel/migration/main.rst | 305 ------------------------------ docs/devel/migration/postcopy.rst | 304 +++++++++++++++++++++++++++++ 4 files changed, 314 insertions(+), 305 deletions(-) create mode 100644 docs/devel/migration/features.rst create mode 100644 docs/devel/migration/postcopy.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst new file mode 100644 index 00000000000..0054e0c900a --- /dev/null +++ b/docs/devel/migration/features.rst @@ -0,0 +1,9 @@ +Migration features +================== + +Migration has plenty of features to support different use cases. + +.. toctree:: + :maxdepth: 2 + + postcopy diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 9a8fd1ead75..21ad58b1890 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -8,6 +8,7 @@ QEMU live migration works. :maxdepth: 2 main + features compatibility vfio virtio diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 7ca3b4dd3fd..1e98e9e40c5 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -644,308 +644,3 @@ algorithm will restrict virtual CPUs as needed to keep their dirty page rate inside the limit. This leads to more steady reading performance during live migration and can aid in improving large guest responsiveness. -Postcopy -======== - -'Postcopy' migration is a way to deal with migrations that refuse to converge -(or take too long to converge) its plus side is that there is an upper bound on -the amount of migration traffic and time it takes, the down side is that during -the postcopy phase, a failure of *either* side causes the guest to be lost. - -In postcopy the destination CPUs are started before all the memory has been -transferred, and accesses to pages that are yet to be transferred cause -a fault that's translated by QEMU into a request to the source QEMU. - -Postcopy can be combined with precopy (i.e. normal migration) so that if precopy -doesn't finish in a given time the switch is made to postcopy. - -Enabling postcopy ------------------ - -To enable postcopy, issue this command on the monitor (both source and -destination) prior to the start of migration: - -``migrate_set_capability postcopy-ram on`` - -The normal commands are then used to start a migration, which is still -started in precopy mode. Issuing: - -``migrate_start_postcopy`` - -will now cause the transition from precopy to postcopy. -It can be issued immediately after migration is started or any -time later on. Issuing it after the end of a migration is harmless. - -Blocktime is a postcopy live migration metric, intended to show how -long the vCPU was in state of interruptible sleep due to pagefault. -That metric is calculated both for all vCPUs as overlapped value, and -separately for each vCPU. These values are calculated on destination -side. To enable postcopy blocktime calculation, enter following -command on destination monitor: - -``migrate_set_capability postcopy-blocktime on`` - -Postcopy blocktime can be retrieved by query-migrate qmp command. -postcopy-blocktime value of qmp command will show overlapped blocking -time for all vCPU, postcopy-vcpu-blocktime will show list of blocking -time per vCPU. - -.. note:: - During the postcopy phase, the bandwidth limits set using - ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that - the destination is waiting for). - -Postcopy device transfer ------------------------- - -Loading of device data may cause the device emulation to access guest RAM -that may trigger faults that have to be resolved by the source, as such -the migration stream has to be able to respond with page data *during* the -device load, and hence the device data has to be read from the stream completely -before the device load begins to free the stream up. This is achieved by -'packaging' the device data into a blob that's read in one go. - -Source behaviour ----------------- - -Until postcopy is entered the migration stream is identical to normal -precopy, except for the addition of a 'postcopy advise' command at -the beginning, to tell the destination that postcopy might happen. -When postcopy starts the source sends the page discard data and then -forms the 'package' containing: - - - Command: 'postcopy listen' - - The device state - - A series of sections, identical to the precopy streams device state stream - containing everything except postcopiable devices (i.e. RAM) - - Command: 'postcopy run' - -The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the -contents are formatted in the same way as the main migration stream. - -During postcopy the source scans the list of dirty pages and sends them -to the destination without being requested (in much the same way as precopy), -however when a page request is received from the destination, the dirty page -scanning restarts from the requested location. This causes requested pages -to be sent quickly, and also causes pages directly after the requested page -to be sent quickly in the hope that those pages are likely to be used -by the destination soon. - -Destination behaviour ---------------------- - -Initially the destination looks the same as precopy, with a single thread -reading the migration stream; the 'postcopy advise' and 'discard' commands -are processed to change the way RAM is managed, but don't affect the stream -processing. - -:: - - ------------------------------------------------------------------------------ - 1 2 3 4 5 6 7 - main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) - thread | | - | (page request) - | \___ - v \ - listen thread: --- page -- page -- page -- page -- page -- - - a b c - ------------------------------------------------------------------------------ - -- On receipt of ``CMD_PACKAGED`` (1) - - All the data associated with the package - the ( ... ) section in the diagram - - is read into memory, and the main thread recurses into qemu_loadvm_state_main - to process the contents of the package (2) which contains commands (3,6) and - devices (4...) - -- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) - - a new thread (a) is started that takes over servicing the migration stream, - while the main thread carries on loading the package. It loads normal - background page data (b) but if during a device load a fault happens (5) - the returned page (c) is loaded by the listen thread allowing the main - threads device load to carry on. - -- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) - - letting the destination CPUs start running. At the end of the - ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and - is no longer used by migration, while the listen thread carries on servicing - page data until the end of migration. - -Postcopy Recovery ------------------ - -Comparing to precopy, postcopy is special on error handlings. When any -error happens (in this case, mostly network errors), QEMU cannot easily -fail a migration because VM data resides in both source and destination -QEMU instances. On the other hand, when issue happens QEMU on both sides -will go into a paused state. It'll need a recovery phase to continue a -paused postcopy migration. - -The recovery phase normally contains a few steps: - - - When network issue occurs, both QEMU will go into PAUSED state - - - When the network is recovered (or a new network is provided), the admin - can setup the new channel for migration using QMP command - 'migrate-recover' on destination node, preparing for a resume. - - - On source host, the admin can continue the interrupted postcopy - migration using QMP command 'migrate' with resume=true flag set. - - - After the connection is re-established, QEMU will continue the postcopy - migration on both sides. - -During a paused postcopy migration, the VM can logically still continue -running, and it will not be impacted from any page access to pages that -were already migrated to destination VM before the interruption happens. -However, if any of the missing pages got accessed on destination VM, the VM -thread will be halted waiting for the page to be migrated, it means it can -be halted until the recovery is complete. - -The impact of accessing missing pages can be relevant to different -configurations of the guest. For example, when with async page fault -enabled, logically the guest can proactively schedule out the threads -accessing missing pages. - -Postcopy states ---------------- - -Postcopy moves through a series of states (see postcopy_state) from -ADVISE->DISCARD->LISTEN->RUNNING->END - - - Advise - - Set at the start of migration if postcopy is enabled, even - if it hasn't had the start command; here the destination - checks that its OS has the support needed for postcopy, and performs - setup to ensure the RAM mappings are suitable for later postcopy. - The destination will fail early in migration at this point if the - required OS support is not present. - (Triggered by reception of POSTCOPY_ADVISE command) - - - Discard - - Entered on receipt of the first 'discard' command; prior to - the first Discard being performed, hugepages are switched off - (using madvise) to ensure that no new huge pages are created - during the postcopy phase, and to cause any huge pages that - have discards on them to be broken. - - - Listen - - The first command in the package, POSTCOPY_LISTEN, switches - the destination state to Listen, and starts a new thread - (the 'listen thread') which takes over the job of receiving - pages off the migration stream, while the main thread carries - on processing the blob. With this thread able to process page - reception, the destination now 'sensitises' the RAM to detect - any access to missing pages (on Linux using the 'userfault' - system). - - - Running - - POSTCOPY_RUN causes the destination to synchronise all - state and start the CPUs and IO devices running. The main - thread now finishes processing the migration package and - now carries on as it would for normal precopy migration - (although it can't do the cleanup it would do as it - finishes a normal migration). - - - Paused - - Postcopy can run into a paused state (normally on both sides when - happens), where all threads will be temporarily halted mostly due to - network errors. When reaching paused state, migration will make sure - the qemu binary on both sides maintain the data without corrupting - the VM. To continue the migration, the admin needs to fix the - migration channel using the QMP command 'migrate-recover' on the - destination node, then resume the migration using QMP command 'migrate' - again on source node, with resume=true flag set. - - - End - - The listen thread can now quit, and perform the cleanup of migration - state, the migration is now complete. - -Source side page map --------------------- - -The 'migration bitmap' in postcopy is basically the same as in the precopy, -where each of the bit to indicate that page is 'dirty' - i.e. needs -sending. During the precopy phase this is updated as the CPU dirties -pages, however during postcopy the CPUs are stopped and nothing should -dirty anything any more. Instead, dirty bits are cleared when the relevant -pages are sent during postcopy. - -Postcopy with hugepages ------------------------ - -Postcopy now works with hugetlbfs backed memory: - - a) The linux kernel on the destination must support userfault on hugepages. - b) The huge-page configuration on the source and destination VMs must be - identical; i.e. RAMBlocks on both sides must use the same page size. - c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal - RAM if it doesn't have enough hugepages, triggering (b) to fail. - Using ``-mem-prealloc`` enforces the allocation using hugepages. - d) Care should be taken with the size of hugepage used; postcopy with 2MB - hugepages works well, however 1GB hugepages are likely to be problematic - since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, - and until the full page is transferred the destination thread is blocked. - -Postcopy with shared memory ---------------------------- - -Postcopy migration with shared memory needs explicit support from the other -processes that share memory and from QEMU. There are restrictions on the type of -memory that userfault can support shared. - -The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs`` -(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)`` -for hugetlbfs which may be a problem in some configurations). - -The vhost-user code in QEMU supports clients that have Postcopy support, -and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes -to support postcopy. - -The client needs to open a userfaultfd and register the areas -of memory that it maps with userfault. The client must then pass the -userfaultfd back to QEMU together with a mapping table that allows -fault addresses in the clients address space to be converted back to -RAMBlock/offsets. The client's userfaultfd is added to the postcopy -fault-thread and page requests are made on behalf of the client by QEMU. -QEMU performs 'wake' operations on the client's userfaultfd to allow it -to continue after a page has arrived. - -.. note:: - There are two future improvements that would be nice: - a) Some way to make QEMU ignorant of the addresses in the clients - address space - b) Avoiding the need for QEMU to perform ufd-wake calls after the - pages have arrived - -Retro-fitting postcopy to existing clients is possible: - a) A mechanism is needed for the registration with userfault as above, - and the registration needs to be coordinated with the phases of - postcopy. In vhost-user extra messages are added to the existing - control channel. - b) Any thread that can block due to guest memory accesses must be - identified and the implication understood; for example if the - guest memory access is made while holding a lock then all other - threads waiting for that lock will also be blocked. - -Postcopy Preemption Mode ------------------------- - -Postcopy preempt is a new capability introduced in 8.0 QEMU release, it -allows urgent pages (those got page fault requested from destination QEMU -explicitly) to be sent in a separate preempt channel, rather than queued in -the background migration channel. Anyone who cares about latencies of page -faults during a postcopy migration should enable this feature. By default, -it's not enabled. - diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst new file mode 100644 index 00000000000..d60eec06ab7 --- /dev/null +++ b/docs/devel/migration/postcopy.rst @@ -0,0 +1,304 @@ +Postcopy +======== + +'Postcopy' migration is a way to deal with migrations that refuse to converge +(or take too long to converge) its plus side is that there is an upper bound on +the amount of migration traffic and time it takes, the down side is that during +the postcopy phase, a failure of *either* side causes the guest to be lost. + +In postcopy the destination CPUs are started before all the memory has been +transferred, and accesses to pages that are yet to be transferred cause +a fault that's translated by QEMU into a request to the source QEMU. + +Postcopy can be combined with precopy (i.e. normal migration) so that if precopy +doesn't finish in a given time the switch is made to postcopy. + +Enabling postcopy +----------------- + +To enable postcopy, issue this command on the monitor (both source and +destination) prior to the start of migration: + +``migrate_set_capability postcopy-ram on`` + +The normal commands are then used to start a migration, which is still +started in precopy mode. Issuing: + +``migrate_start_postcopy`` + +will now cause the transition from precopy to postcopy. +It can be issued immediately after migration is started or any +time later on. Issuing it after the end of a migration is harmless. + +Blocktime is a postcopy live migration metric, intended to show how +long the vCPU was in state of interruptible sleep due to pagefault. +That metric is calculated both for all vCPUs as overlapped value, and +separately for each vCPU. These values are calculated on destination +side. To enable postcopy blocktime calculation, enter following +command on destination monitor: + +``migrate_set_capability postcopy-blocktime on`` + +Postcopy blocktime can be retrieved by query-migrate qmp command. +postcopy-blocktime value of qmp command will show overlapped blocking +time for all vCPU, postcopy-vcpu-blocktime will show list of blocking +time per vCPU. + +.. note:: + During the postcopy phase, the bandwidth limits set using + ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that + the destination is waiting for). + +Postcopy device transfer +------------------------ + +Loading of device data may cause the device emulation to access guest RAM +that may trigger faults that have to be resolved by the source, as such +the migration stream has to be able to respond with page data *during* the +device load, and hence the device data has to be read from the stream completely +before the device load begins to free the stream up. This is achieved by +'packaging' the device data into a blob that's read in one go. + +Source behaviour +---------------- + +Until postcopy is entered the migration stream is identical to normal +precopy, except for the addition of a 'postcopy advise' command at +the beginning, to tell the destination that postcopy might happen. +When postcopy starts the source sends the page discard data and then +forms the 'package' containing: + + - Command: 'postcopy listen' + - The device state + + A series of sections, identical to the precopy streams device state stream + containing everything except postcopiable devices (i.e. RAM) + - Command: 'postcopy run' + +The 'package' is sent as the data part of a Command: ``CMD_PACKAGED``, and the +contents are formatted in the same way as the main migration stream. + +During postcopy the source scans the list of dirty pages and sends them +to the destination without being requested (in much the same way as precopy), +however when a page request is received from the destination, the dirty page +scanning restarts from the requested location. This causes requested pages +to be sent quickly, and also causes pages directly after the requested page +to be sent quickly in the hope that those pages are likely to be used +by the destination soon. + +Destination behaviour +--------------------- + +Initially the destination looks the same as precopy, with a single thread +reading the migration stream; the 'postcopy advise' and 'discard' commands +are processed to change the way RAM is managed, but don't affect the stream +processing. + +:: + + ------------------------------------------------------------------------------ + 1 2 3 4 5 6 7 + main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) + thread | | + | (page request) + | \___ + v \ + listen thread: --- page -- page -- page -- page -- page -- + + a b c + ------------------------------------------------------------------------------ + +- On receipt of ``CMD_PACKAGED`` (1) + + All the data associated with the package - the ( ... ) section in the diagram - + is read into memory, and the main thread recurses into qemu_loadvm_state_main + to process the contents of the package (2) which contains commands (3,6) and + devices (4...) + +- On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) + + a new thread (a) is started that takes over servicing the migration stream, + while the main thread carries on loading the package. It loads normal + background page data (b) but if during a device load a fault happens (5) + the returned page (c) is loaded by the listen thread allowing the main + threads device load to carry on. + +- The last thing in the ``CMD_PACKAGED`` is a 'RUN' command (6) + + letting the destination CPUs start running. At the end of the + ``CMD_PACKAGED`` (7) the main thread returns to normal running behaviour and + is no longer used by migration, while the listen thread carries on servicing + page data until the end of migration. + +Postcopy Recovery +----------------- + +Comparing to precopy, postcopy is special on error handlings. When any +error happens (in this case, mostly network errors), QEMU cannot easily +fail a migration because VM data resides in both source and destination +QEMU instances. On the other hand, when issue happens QEMU on both sides +will go into a paused state. It'll need a recovery phase to continue a +paused postcopy migration. + +The recovery phase normally contains a few steps: + + - When network issue occurs, both QEMU will go into PAUSED state + + - When the network is recovered (or a new network is provided), the admin + can setup the new channel for migration using QMP command + 'migrate-recover' on destination node, preparing for a resume. + + - On source host, the admin can continue the interrupted postcopy + migration using QMP command 'migrate' with resume=true flag set. + + - After the connection is re-established, QEMU will continue the postcopy + migration on both sides. + +During a paused postcopy migration, the VM can logically still continue +running, and it will not be impacted from any page access to pages that +were already migrated to destination VM before the interruption happens. +However, if any of the missing pages got accessed on destination VM, the VM +thread will be halted waiting for the page to be migrated, it means it can +be halted until the recovery is complete. + +The impact of accessing missing pages can be relevant to different +configurations of the guest. For example, when with async page fault +enabled, logically the guest can proactively schedule out the threads +accessing missing pages. + +Postcopy states +--------------- + +Postcopy moves through a series of states (see postcopy_state) from +ADVISE->DISCARD->LISTEN->RUNNING->END + + - Advise + + Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. + The destination will fail early in migration at this point if the + required OS support is not present. + (Triggered by reception of POSTCOPY_ADVISE command) + + - Discard + + Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. + + - Listen + + The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + - Running + + POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). + + - Paused + + Postcopy can run into a paused state (normally on both sides when + happens), where all threads will be temporarily halted mostly due to + network errors. When reaching paused state, migration will make sure + the qemu binary on both sides maintain the data without corrupting + the VM. To continue the migration, the admin needs to fix the + migration channel using the QMP command 'migrate-recover' on the + destination node, then resume the migration using QMP command 'migrate' + again on source node, with resume=true flag set. + + - End + + The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. + +Source side page map +-------------------- + +The 'migration bitmap' in postcopy is basically the same as in the precopy, +where each of the bit to indicate that page is 'dirty' - i.e. needs +sending. During the precopy phase this is updated as the CPU dirties +pages, however during postcopy the CPUs are stopped and nothing should +dirty anything any more. Instead, dirty bits are cleared when the relevant +pages are sent during postcopy. + +Postcopy with hugepages +----------------------- + +Postcopy now works with hugetlbfs backed memory: + + a) The linux kernel on the destination must support userfault on hugepages. + b) The huge-page configuration on the source and destination VMs must be + identical; i.e. RAMBlocks on both sides must use the same page size. + c) Note that ``-mem-path /dev/hugepages`` will fall back to allocating normal + RAM if it doesn't have enough hugepages, triggering (b) to fail. + Using ``-mem-prealloc`` enforces the allocation using hugepages. + d) Care should be taken with the size of hugepage used; postcopy with 2MB + hugepages works well, however 1GB hugepages are likely to be problematic + since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link, + and until the full page is transferred the destination thread is blocked. + +Postcopy with shared memory +--------------------------- + +Postcopy migration with shared memory needs explicit support from the other +processes that share memory and from QEMU. There are restrictions on the type of +memory that userfault can support shared. + +The Linux kernel userfault support works on ``/dev/shm`` memory and on ``hugetlbfs`` +(although the kernel doesn't provide an equivalent to ``madvise(MADV_DONTNEED)`` +for hugetlbfs which may be a problem in some configurations). + +The vhost-user code in QEMU supports clients that have Postcopy support, +and the ``vhost-user-bridge`` (in ``tests/``) and the DPDK package have changes +to support postcopy. + +The client needs to open a userfaultfd and register the areas +of memory that it maps with userfault. The client must then pass the +userfaultfd back to QEMU together with a mapping table that allows +fault addresses in the clients address space to be converted back to +RAMBlock/offsets. The client's userfaultfd is added to the postcopy +fault-thread and page requests are made on behalf of the client by QEMU. +QEMU performs 'wake' operations on the client's userfaultfd to allow it +to continue after a page has arrived. + +.. note:: + There are two future improvements that would be nice: + a) Some way to make QEMU ignorant of the addresses in the clients + address space + b) Avoiding the need for QEMU to perform ufd-wake calls after the + pages have arrived + +Retro-fitting postcopy to existing clients is possible: + a) A mechanism is needed for the registration with userfault as above, + and the registration needs to be coordinated with the phases of + postcopy. In vhost-user extra messages are added to the existing + control channel. + b) Any thread that can block due to guest memory accesses must be + identified and the implication understood; for example if the + guest memory access is made while holding a lock then all other + threads waiting for that lock will also be blocked. + +Postcopy Preemption Mode +------------------------ + +Postcopy preempt is a new capability introduced in 8.0 QEMU release, it +allows urgent pages (those got page fault requested from destination QEMU +explicitly) to be sent in a separate preempt channel, rather than queued in +the background migration channel. Anyone who cares about latencies of page +faults during a postcopy migration should enable this feature. By default, +it's not enabled. -- Gitee From 1d76d7d35365554c96af2f904b3ad45fca72b000 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:25 +0800 Subject: [PATCH 21/90] docs/migration: Split "dirty limit" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4c6f8a79ae539eeb1f86af6522e4000edde3638b upstream. Split that into a separate file, put under "features". Intel-SIG: commit 4c6f8a79ae53 docs/migration: Split "dirty limit" Cc: Yong Huang Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-8-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/dirty-limit.rst | 71 ++++++++++++++++++++++++++++ docs/devel/migration/features.rst | 1 + docs/devel/migration/main.rst | 71 ---------------------------- 3 files changed, 72 insertions(+), 71 deletions(-) create mode 100644 docs/devel/migration/dirty-limit.rst diff --git a/docs/devel/migration/dirty-limit.rst b/docs/devel/migration/dirty-limit.rst new file mode 100644 index 00000000000..8f32329d5fd --- /dev/null +++ b/docs/devel/migration/dirty-limit.rst @@ -0,0 +1,71 @@ +Dirty limit +=========== + +The dirty limit, short for dirty page rate upper limit, is a new capability +introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM +dirty ring to throttle down the guest during live migration. + +The algorithm framework is as follows: + +:: + + ------------------------------------------------------------------------------ + main --------------> throttle thread ------------> PREPARE(1) <-------- + thread \ | | + \ | | + \ V | + -\ CALCULATE(2) | + \ | | + \ | | + \ V | + \ SET PENALTY(3) ----- + -\ | + \ | + \ V + -> virtual CPU thread -------> ACCEPT PENALTY(4) + ------------------------------------------------------------------------------ + +When the qmp command qmp_set_vcpu_dirty_limit is called for the first time, +the QEMU main thread starts the throttle thread. The throttle thread, once +launched, executes the loop, which consists of three steps: + + - PREPARE (1) + + The entire work of PREPARE (1) is preparation for the second stage, + CALCULATE(2), as the name implies. It involves preparing the dirty + page rate value and the corresponding upper limit of the VM: + The dirty page rate is calculated via the KVM dirty ring mechanism, + which tells QEMU how many dirty pages a virtual CPU has had since the + last KVM_EXIT_DIRTY_RING_FULL exception; The dirty page rate upper + limit is specified by caller, therefore fetch it directly. + + - CALCULATE (2) + + Calculate a suitable sleep period for each virtual CPU, which will be + used to determine the penalty for the target virtual CPU. The + computation must be done carefully in order to reduce the dirty page + rate progressively down to the upper limit without oscillation. To + achieve this, two strategies are provided: the first is to add or + subtract sleep time based on the ratio of the current dirty page rate + to the limit, which is used when the current dirty page rate is far + from the limit; the second is to add or subtract a fixed time when + the current dirty page rate is close to the limit. + + - SET PENALTY (3) + + Set the sleep time for each virtual CPU that should be penalized based + on the results of the calculation supplied by step CALCULATE (2). + +After completing the three above stages, the throttle thread loops back +to step PREPARE (1) until the dirty limit is reached. + +On the other hand, each virtual CPU thread reads the sleep duration and +sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that +is ACCEPT PENALTY (4). Virtual CPUs tied with writing processes will +obviously exit to the path and get penalized, whereas virtual CPUs involved +with read processes will not. + +In summary, thanks to the KVM dirty ring technology, the dirty limit +algorithm will restrict virtual CPUs as needed to keep their dirty page +rate inside the limit. This leads to more steady reading performance during +live migration and can aid in improving large guest responsiveness. diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index 0054e0c900a..e257d0d1009 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -7,3 +7,4 @@ Migration has plenty of features to support different use cases. :maxdepth: 2 postcopy + dirty-limit diff --git a/docs/devel/migration/main.rst b/docs/devel/migration/main.rst index 1e98e9e40c5..396c7c51ca8 100644 --- a/docs/devel/migration/main.rst +++ b/docs/devel/migration/main.rst @@ -573,74 +573,3 @@ path. Return path - opened by main thread, written by main thread AND postcopy thread (protected by rp_mutex) -Dirty limit -===================== -The dirty limit, short for dirty page rate upper limit, is a new capability -introduced in the 8.1 QEMU release that uses a new algorithm based on the KVM -dirty ring to throttle down the guest during live migration. - -The algorithm framework is as follows: - -:: - - ------------------------------------------------------------------------------ - main --------------> throttle thread ------------> PREPARE(1) <-------- - thread \ | | - \ | | - \ V | - -\ CALCULATE(2) | - \ | | - \ | | - \ V | - \ SET PENALTY(3) ----- - -\ | - \ | - \ V - -> virtual CPU thread -------> ACCEPT PENALTY(4) - ------------------------------------------------------------------------------ - -When the qmp command qmp_set_vcpu_dirty_limit is called for the first time, -the QEMU main thread starts the throttle thread. The throttle thread, once -launched, executes the loop, which consists of three steps: - - - PREPARE (1) - - The entire work of PREPARE (1) is preparation for the second stage, - CALCULATE(2), as the name implies. It involves preparing the dirty - page rate value and the corresponding upper limit of the VM: - The dirty page rate is calculated via the KVM dirty ring mechanism, - which tells QEMU how many dirty pages a virtual CPU has had since the - last KVM_EXIT_DIRTY_RING_FULL exception; The dirty page rate upper - limit is specified by caller, therefore fetch it directly. - - - CALCULATE (2) - - Calculate a suitable sleep period for each virtual CPU, which will be - used to determine the penalty for the target virtual CPU. The - computation must be done carefully in order to reduce the dirty page - rate progressively down to the upper limit without oscillation. To - achieve this, two strategies are provided: the first is to add or - subtract sleep time based on the ratio of the current dirty page rate - to the limit, which is used when the current dirty page rate is far - from the limit; the second is to add or subtract a fixed time when - the current dirty page rate is close to the limit. - - - SET PENALTY (3) - - Set the sleep time for each virtual CPU that should be penalized based - on the results of the calculation supplied by step CALCULATE (2). - -After completing the three above stages, the throttle thread loops back -to step PREPARE (1) until the dirty limit is reached. - -On the other hand, each virtual CPU thread reads the sleep duration and -sleeps in the path of the KVM_EXIT_DIRTY_RING_FULL exception handler, that -is ACCEPT PENALTY (4). Virtual CPUs tied with writing processes will -obviously exit to the path and get penalized, whereas virtual CPUs involved -with read processes will not. - -In summary, thanks to the KVM dirty ring technology, the dirty limit -algorithm will restrict virtual CPUs as needed to keep their dirty page -rate inside the limit. This leads to more steady reading performance during -live migration and can aid in improving large guest responsiveness. - -- Gitee From a3e36370e93a1c3ec71b4d43c6c2d6a17733bce0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:26 +0800 Subject: [PATCH 22/90] docs/migration: Organize "Postcopy" page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 21b17cd011c959c3fd3fdad994389410a02df901 upstream. Reorganize the page, moving things around, and add a few headlines ("Postcopy internals", "Postcopy features") to cover sub-areas. Intel-SIG: commit 21b17cd011c9 docs/migration: Organize "Postcopy" page Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-9-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/postcopy.rst | 159 ++++++++++++++++-------------- 1 file changed, 84 insertions(+), 75 deletions(-) diff --git a/docs/devel/migration/postcopy.rst b/docs/devel/migration/postcopy.rst index d60eec06ab7..6c51e96d798 100644 --- a/docs/devel/migration/postcopy.rst +++ b/docs/devel/migration/postcopy.rst @@ -1,6 +1,9 @@ +======== Postcopy ======== +.. contents:: + 'Postcopy' migration is a way to deal with migrations that refuse to converge (or take too long to converge) its plus side is that there is an upper bound on the amount of migration traffic and time it takes, the down side is that during @@ -14,7 +17,7 @@ Postcopy can be combined with precopy (i.e. normal migration) so that if precopy doesn't finish in a given time the switch is made to postcopy. Enabling postcopy ------------------ +================= To enable postcopy, issue this command on the monitor (both source and destination) prior to the start of migration: @@ -49,8 +52,71 @@ time per vCPU. ``migrate_set_parameter`` is ignored (to avoid delaying requested pages that the destination is waiting for). -Postcopy device transfer ------------------------- +Postcopy internals +================== + +State machine +------------- + +Postcopy moves through a series of states (see postcopy_state) from +ADVISE->DISCARD->LISTEN->RUNNING->END + + - Advise + + Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. + The destination will fail early in migration at this point if the + required OS support is not present. + (Triggered by reception of POSTCOPY_ADVISE command) + + - Discard + + Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. + + - Listen + + The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + - Running + + POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). + + - Paused + + Postcopy can run into a paused state (normally on both sides when + happens), where all threads will be temporarily halted mostly due to + network errors. When reaching paused state, migration will make sure + the qemu binary on both sides maintain the data without corrupting + the VM. To continue the migration, the admin needs to fix the + migration channel using the QMP command 'migrate-recover' on the + destination node, then resume the migration using QMP command 'migrate' + again on source node, with resume=true flag set. + + - End + + The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. + +Device transfer +--------------- Loading of device data may cause the device emulation to access guest RAM that may trigger faults that have to be resolved by the source, as such @@ -130,7 +196,20 @@ processing. is no longer used by migration, while the listen thread carries on servicing page data until the end of migration. -Postcopy Recovery +Source side page bitmap +----------------------- + +The 'migration bitmap' in postcopy is basically the same as in the precopy, +where each of the bit to indicate that page is 'dirty' - i.e. needs +sending. During the precopy phase this is updated as the CPU dirties +pages, however during postcopy the CPUs are stopped and nothing should +dirty anything any more. Instead, dirty bits are cleared when the relevant +pages are sent during postcopy. + +Postcopy features +================= + +Postcopy recovery ----------------- Comparing to precopy, postcopy is special on error handlings. When any @@ -166,76 +245,6 @@ configurations of the guest. For example, when with async page fault enabled, logically the guest can proactively schedule out the threads accessing missing pages. -Postcopy states ---------------- - -Postcopy moves through a series of states (see postcopy_state) from -ADVISE->DISCARD->LISTEN->RUNNING->END - - - Advise - - Set at the start of migration if postcopy is enabled, even - if it hasn't had the start command; here the destination - checks that its OS has the support needed for postcopy, and performs - setup to ensure the RAM mappings are suitable for later postcopy. - The destination will fail early in migration at this point if the - required OS support is not present. - (Triggered by reception of POSTCOPY_ADVISE command) - - - Discard - - Entered on receipt of the first 'discard' command; prior to - the first Discard being performed, hugepages are switched off - (using madvise) to ensure that no new huge pages are created - during the postcopy phase, and to cause any huge pages that - have discards on them to be broken. - - - Listen - - The first command in the package, POSTCOPY_LISTEN, switches - the destination state to Listen, and starts a new thread - (the 'listen thread') which takes over the job of receiving - pages off the migration stream, while the main thread carries - on processing the blob. With this thread able to process page - reception, the destination now 'sensitises' the RAM to detect - any access to missing pages (on Linux using the 'userfault' - system). - - - Running - - POSTCOPY_RUN causes the destination to synchronise all - state and start the CPUs and IO devices running. The main - thread now finishes processing the migration package and - now carries on as it would for normal precopy migration - (although it can't do the cleanup it would do as it - finishes a normal migration). - - - Paused - - Postcopy can run into a paused state (normally on both sides when - happens), where all threads will be temporarily halted mostly due to - network errors. When reaching paused state, migration will make sure - the qemu binary on both sides maintain the data without corrupting - the VM. To continue the migration, the admin needs to fix the - migration channel using the QMP command 'migrate-recover' on the - destination node, then resume the migration using QMP command 'migrate' - again on source node, with resume=true flag set. - - - End - - The listen thread can now quit, and perform the cleanup of migration - state, the migration is now complete. - -Source side page map --------------------- - -The 'migration bitmap' in postcopy is basically the same as in the precopy, -where each of the bit to indicate that page is 'dirty' - i.e. needs -sending. During the precopy phase this is updated as the CPU dirties -pages, however during postcopy the CPUs are stopped and nothing should -dirty anything any more. Instead, dirty bits are cleared when the relevant -pages are sent during postcopy. - Postcopy with hugepages ----------------------- @@ -293,7 +302,7 @@ Retro-fitting postcopy to existing clients is possible: guest memory access is made while holding a lock then all other threads waiting for that lock will also be blocked. -Postcopy Preemption Mode +Postcopy preemption mode ------------------------ Postcopy preempt is a new capability introduced in 8.0 QEMU release, it -- Gitee From 434703ce4d118ebcd0238d4711357509191ca69a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:27 +0800 Subject: [PATCH 23/90] docs/migration: Further move vfio to be feature of migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 66fd3b1a7ab02f7d8c84f92eba23e3ddc955204d upstream. Move it one layer down, so taking VFIO-migration as a feature for migration. Intel-SIG: commit 66fd3b1a7ab0 docs/migration: Further move vfio to be feature of migration Cc: Alex Williamson Cc: Cédric Le Goater Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-10-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/index.rst | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index e257d0d1009..dea016f7075 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -8,3 +8,4 @@ Migration has plenty of features to support different use cases. postcopy dirty-limit + vfio diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index 21ad58b1890..b1357309e15 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -10,6 +10,5 @@ QEMU live migration works. main features compatibility - vfio virtio best-practices -- Gitee From cc42a1017d59b7af4b645745d255b7deb8b22408 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 9 Jan 2024 14:46:28 +0800 Subject: [PATCH 24/90] docs/migration: Further move virtio to be feature of migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit eb9f6daae49c06bb91e9660908587cc55265e43a upstream. Move it one layer down, so taking Virtio-migration as a feature for migration. Intel-SIG: commit eb9f6daae49c docs/migration: Further move virtio to be feature of migration Cc: "Michael S. Tsirkin" Cc: Jason Wang Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20240109064628.595453-11-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/index.rst | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index dea016f7075..a9acaf618ee 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -9,3 +9,4 @@ Migration has plenty of features to support different use cases. postcopy dirty-limit vfio + virtio diff --git a/docs/devel/migration/index.rst b/docs/devel/migration/index.rst index b1357309e15..2aa294d6314 100644 --- a/docs/devel/migration/index.rst +++ b/docs/devel/migration/index.rst @@ -10,5 +10,4 @@ QEMU live migration works. main features compatibility - virtio best-practices -- Gitee From 91284e8e9e5f66739b401416f73771099505daa7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:35 +0800 Subject: [PATCH 25/90] migration/multifd: Drop stale comment for multifd zero copy commit 8888a552bf7af200e36ff123772547dfb4f133c4 upstream. We've already done that with multifd_flush_after_each_section, for multifd in general. Drop the stale "TODO-like" comment. Intel-SIG: commit 8888a552bf7a migration/multifd: Drop stale comment for multifd zero copy Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-2-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 25cbc6dc6be..eee2586770e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -598,17 +598,6 @@ int multifd_send_sync_main(void) } } - /* - * When using zero-copy, it's necessary to flush the pages before any of - * the pages can be sent again, so we'll make sure the new version of the - * pages will always arrive _later_ than the old pages. - * - * Currently we achieve this by flushing the zero-page requested writes - * per ram iteration, but in the future we could potentially optimize it - * to be less frequent, e.g. only after we finished one whole scanning of - * all the dirty bitmaps. - */ - flush_zero_copy = migrate_zero_copy_send(); for (i = 0; i < migrate_multifd_channels(); i++) { -- Gitee From fe9e7f18425a314f1371fa73c0ec76a28b7aff39 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:36 +0800 Subject: [PATCH 26/90] migration/multifd: multifd_send_kick_main() commit 48c0f5d56fd2ff0a0cda23301637b742c690f59a upstream. When a multifd sender thread hit errors, it always needs to kick the main thread by kicking all the semaphores that it can be waiting upon. Provide a helper for it and deduplicate the code. Intel-SIG: commit 48c0f5d56fd2 migration/multifd: multifd_send_kick_main() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-3-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index eee2586770e..b8d2c96533c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -372,6 +372,18 @@ struct { MultiFDMethods *ops; } *multifd_send_state; +/* + * The migration thread can wait on either of the two semaphores. This + * function can be used to kick the main thread out of waiting on either of + * them. Should mostly only be called when something wrong happened with + * the current multifd send thread. + */ +static void multifd_send_kick_main(MultiFDSendParams *p) +{ + qemu_sem_post(&p->sem_sync); + qemu_sem_post(&multifd_send_state->channels_ready); +} + /* * How we use multifd_send_state->pages and channel->pages? * @@ -739,8 +751,7 @@ out: assert(local_err); trace_multifd_send_error(p->id); multifd_send_terminate_threads(local_err); - qemu_sem_post(&p->sem_sync); - qemu_sem_post(&multifd_send_state->channels_ready); + multifd_send_kick_main(p); error_free(local_err); } @@ -781,8 +792,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, * is not created, and then tell who pay attention to me. */ p->quit = true; - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); + multifd_send_kick_main(p); error_free(err); } @@ -852,8 +862,7 @@ static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, { migrate_set_error(migrate_get_current(), err); /* Error happen, we need to tell who pay attention to me */ - qemu_sem_post(&multifd_send_state->channels_ready); - qemu_sem_post(&p->sem_sync); + multifd_send_kick_main(p); /* * Although multifd_send_thread is not created, but main migration * thread need to judge whether it is running, so we need to mark -- Gitee From 91bae92961a27269e6420d33fe5cf619ba9bb3f4 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:37 +0800 Subject: [PATCH 27/90] migration/multifd: Drop MultiFDSendParams.quit, cleanup error paths commit 15f3f21d598148895c33b6fc41e29777cf6ad992 upstream. Multifd send side has two fields to indicate error quits: - MultiFDSendParams.quit - &multifd_send_state->exiting Merge them into the global one. The replacement is done by changing all p->quit checks into the global var check. The global check doesn't need any lock. A few more things done on top of this altogether: - multifd_send_terminate_threads() Moving the xchg() of &multifd_send_state->exiting upper, so as to cover the tracepoint, migrate_set_error() and migrate_set_state(). - multifd_send_sync_main() In the 2nd loop, add one more check over the global var to make sure we don't keep the looping if QEMU already decided to quit. - multifd_tls_outgoing_handshake() Use multifd_send_terminate_threads() to set the error state. That has a benefit of updating MigrationState.error to that error too, so we can persist that 1st error we hit in that specific channel. - multifd_new_send_channel_async() Take similar approach like above, drop the migrate_set_error() because multifd_send_terminate_threads() already covers that. Unwrap the helper multifd_new_send_channel_cleanup() along the way; not really needed. Intel-SIG: commit 15f3f21d5981 migration/multifd: Drop MultiFDSendParams.quit, cleanup error paths Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-4-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 85 ++++++++++++++++++--------------------------- migration/multifd.h | 2 -- 2 files changed, 33 insertions(+), 54 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index b8d2c96533c..2c98023d67f 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -372,6 +372,11 @@ struct { MultiFDMethods *ops; } *multifd_send_state; +static bool multifd_send_should_exit(void) +{ + return qatomic_read(&multifd_send_state->exiting); +} + /* * The migration thread can wait on either of the two semaphores. This * function can be used to kick the main thread out of waiting on either of @@ -409,7 +414,7 @@ static int multifd_send_pages(void) MultiFDSendParams *p = NULL; /* make happy gcc */ MultiFDPages_t *pages = multifd_send_state->pages; - if (qatomic_read(&multifd_send_state->exiting)) { + if (multifd_send_should_exit()) { return -1; } @@ -421,14 +426,11 @@ static int multifd_send_pages(void) */ next_channel %= migrate_multifd_channels(); for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { - p = &multifd_send_state->params[i]; - - qemu_mutex_lock(&p->mutex); - if (p->quit) { - error_report("%s: channel %d has already quit!", __func__, i); - qemu_mutex_unlock(&p->mutex); + if (multifd_send_should_exit()) { return -1; } + p = &multifd_send_state->params[i]; + qemu_mutex_lock(&p->mutex); if (!p->pending_job) { p->pending_job++; next_channel = (i + 1) % migrate_multifd_channels(); @@ -483,6 +485,16 @@ static void multifd_send_terminate_threads(Error *err) { int i; + /* + * We don't want to exit each threads twice. Depending on where + * we get the error, or if there are two independent errors in two + * threads at the same time, we can end calling this function + * twice. + */ + if (qatomic_xchg(&multifd_send_state->exiting, 1)) { + return; + } + trace_multifd_send_terminate_threads(err != NULL); if (err) { @@ -497,26 +509,13 @@ static void multifd_send_terminate_threads(Error *err) } } - /* - * We don't want to exit each threads twice. Depending on where - * we get the error, or if there are two independent errors in two - * threads at the same time, we can end calling this function - * twice. - */ - if (qatomic_xchg(&multifd_send_state->exiting, 1)) { - return; - } - for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - qemu_mutex_lock(&p->mutex); - p->quit = true; qemu_sem_post(&p->sem); if (p->c) { qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } - qemu_mutex_unlock(&p->mutex); } } @@ -615,16 +614,13 @@ int multifd_send_sync_main(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - trace_multifd_send_sync_main_signal(p->id); - - qemu_mutex_lock(&p->mutex); - - if (p->quit) { - error_report("%s: channel %d has already quit", __func__, i); - qemu_mutex_unlock(&p->mutex); + if (multifd_send_should_exit()) { return -1; } + trace_multifd_send_sync_main_signal(p->id); + + qemu_mutex_lock(&p->mutex); p->packet_num = multifd_send_state->packet_num++; p->flags |= MULTIFD_FLAG_SYNC; p->pending_job++; @@ -634,6 +630,10 @@ int multifd_send_sync_main(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + if (multifd_send_should_exit()) { + return -1; + } + qemu_sem_wait(&multifd_send_state->channels_ready); trace_multifd_send_sync_main_wait(p->id); qemu_sem_wait(&p->sem_sync); @@ -671,7 +671,7 @@ static void *multifd_send_thread(void *opaque) qemu_sem_post(&multifd_send_state->channels_ready); qemu_sem_wait(&p->sem); - if (qatomic_read(&multifd_send_state->exiting)) { + if (multifd_send_should_exit()) { break; } qemu_mutex_lock(&p->mutex); @@ -786,12 +786,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - migrate_set_error(migrate_get_current(), err); - /* - * Error happen, mark multifd_send_thread status as 'quit' although it - * is not created, and then tell who pay attention to me. - */ - p->quit = true; + multifd_send_terminate_threads(err); multifd_send_kick_main(p); error_free(err); } @@ -857,22 +852,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, return true; } -static void multifd_new_send_channel_cleanup(MultiFDSendParams *p, - QIOChannel *ioc, Error *err) -{ - migrate_set_error(migrate_get_current(), err); - /* Error happen, we need to tell who pay attention to me */ - multifd_send_kick_main(p); - /* - * Although multifd_send_thread is not created, but main migration - * thread need to judge whether it is running, so we need to mark - * its status. - */ - p->quit = true; - object_unref(OBJECT(ioc)); - error_free(err); -} - static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; @@ -889,7 +868,10 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) } trace_multifd_new_send_channel_async_error(p->id, local_err); - multifd_new_send_channel_cleanup(p, ioc, local_err); + multifd_send_terminate_threads(local_err); + multifd_send_kick_main(p); + object_unref(OBJECT(ioc)); + error_free(local_err); } static void multifd_new_send_channel_create(gpointer opaque) @@ -921,7 +903,6 @@ int multifd_save_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); - p->quit = false; p->pending_job = 0; p->id = i; p->pages = multifd_pages_init(page_count); diff --git a/migration/multifd.h b/migration/multifd.h index 35d11f103cd..7c040cb85a8 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -95,8 +95,6 @@ typedef struct { QemuMutex mutex; /* is this channel thread running */ bool running; - /* should this thread finish */ - bool quit; /* multifd flags for each packet */ uint32_t flags; /* global number of generated multifd packets */ -- Gitee From 1f319463b77b489b5a321708e0b2b7f19a0ce8f0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:38 +0800 Subject: [PATCH 28/90] migration/multifd: Postpone reset of MultiFDPages_t commit 836eca47f62f9f6d5b8e9b6fedfc3539775c4e2e upstream. Now we reset MultiFDPages_t object in the multifd sender thread in the middle of the sending job. That's not necessary, because the "*pages" struct will not be reused anyway until pending_job is cleared. Move that to the end after the job is completed, provide a helper to reset a "*pages" object. Use that same helper when free the object too. This prepares us to keep using p->pages in the follow up patches, where we may drop p->normal[]. Intel-SIG: commit 836eca47f62f migration/multifd: Postpone reset of MultiFDPages_t Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-5-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 2c98023d67f..5633ac245a2 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -172,6 +172,17 @@ void multifd_register_ops(int method, MultiFDMethods *ops) multifd_ops[method] = ops; } +/* Reset a MultiFDPages_t* object for the next use */ +static void multifd_pages_reset(MultiFDPages_t *pages) +{ + /* + * We don't need to touch offset[] array, because it will be + * overwritten later when reused. + */ + pages->num = 0; + pages->block = NULL; +} + static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp) { MultiFDInit_t msg = {}; @@ -248,9 +259,8 @@ static MultiFDPages_t *multifd_pages_init(uint32_t n) static void multifd_pages_clear(MultiFDPages_t *pages) { - pages->num = 0; + multifd_pages_reset(pages); pages->allocated = 0; - pages->block = NULL; g_free(pages->offset); pages->offset = NULL; g_free(pages); @@ -704,8 +714,6 @@ static void *multifd_send_thread(void *opaque) p->flags = 0; p->num_packets++; p->total_normal_pages += p->normal_num; - p->pages->num = 0; - p->pages->block = NULL; qemu_mutex_unlock(&p->mutex); trace_multifd_send(p->id, packet_num, p->normal_num, flags, @@ -732,6 +740,8 @@ static void *multifd_send_thread(void *opaque) stat64_add(&mig_stats.multifd_bytes, p->next_packet_size + p->packet_len); + + multifd_pages_reset(p->pages); p->next_packet_size = 0; qemu_mutex_lock(&p->mutex); p->pending_job--; -- Gitee From b207b8bd34f023f816ffd6d89a45dde9cc306d90 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:39 +0800 Subject: [PATCH 29/90] migration/multifd: Drop MultiFDSendParams.normal[] array commit efd8c5439db7eaf00f35adc0fcc4f01d916e8619 upstream. This array is redundant when p->pages exists. Now we extended the life of p->pages to the whole period where pending_job is set, it should be safe to always use p->pages->offset[] rather than p->normal[]. Drop the array. Alongside, the normal_num is also redundant, which is the same to p->pages->num. This doesn't apply to recv side, because there's no extra buffering on recv side, so p->normal[] array is still needed. Intel-SIG: commit efd8c5439db7 migration/multifd: Drop MultiFDSendParams.normal[] array Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-6-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 7 ++++--- migration/multifd-zstd.c | 7 ++++--- migration/multifd.c | 33 +++++++++++++-------------------- migration/multifd.h | 4 ---- 4 files changed, 21 insertions(+), 30 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 37ce48621e7..100809abc16 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -116,17 +116,18 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { + MultiFDPages_t *pages = p->pages; struct zlib_data *z = p->data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; uint32_t i; - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == p->normal_num - 1) { + if (i == pages->num - 1) { flush = Z_SYNC_FLUSH; } @@ -135,7 +136,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) * with compression. zlib does not guarantee that this is safe, * therefore copy the page before calling deflate(). */ - memcpy(z->buf, p->pages->block->host + p->normal[i], p->page_size); + memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size); zs->avail_in = p->page_size; zs->next_in = z->buf; diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index b471daadcd0..2023edd8cc1 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -113,6 +113,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { + MultiFDPages_t *pages = p->pages; struct zstd_data *z = p->data; int ret; uint32_t i; @@ -121,13 +122,13 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == p->normal_num - 1) { + if (i == pages->num - 1) { flush = ZSTD_e_flush; } - z->in.src = p->pages->block->host + p->normal[i]; + z->in.src = p->pages->block->host + pages->offset[i]; z->in.size = p->page_size; z->in.pos = 0; diff --git a/migration/multifd.c b/migration/multifd.c index 5633ac245a2..8bb1fd95cfe 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -90,13 +90,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - for (int i = 0; i < p->normal_num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + p->normal[i]; + for (int i = 0; i < pages->num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; p->iov[p->iovs_num].iov_len = p->page_size; p->iovs_num++; } - p->next_packet_size = p->normal_num * p->page_size; + p->next_packet_size = pages->num * p->page_size; p->flags |= MULTIFD_FLAG_NOCOMP; return 0; } @@ -269,21 +269,22 @@ static void multifd_pages_clear(MultiFDPages_t *pages) static void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; + MultiFDPages_t *pages = p->pages; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(p->normal_num); + packet->normal_pages = cpu_to_be32(pages->num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet->packet_num = cpu_to_be64(p->packet_num); - if (p->pages->block) { - strncpy(packet->ramblock, p->pages->block->idstr, 256); + if (pages->block) { + strncpy(packet->ramblock, pages->block->idstr, 256); } - for (i = 0; i < p->normal_num; i++) { + for (i = 0; i < pages->num; i++) { /* there are architectures where ram_addr_t is 32 bit */ - uint64_t temp = p->normal[i]; + uint64_t temp = pages->offset[i]; packet->offset[i] = cpu_to_be64(temp); } @@ -570,8 +571,6 @@ void multifd_save_cleanup(void) p->packet = NULL; g_free(p->iov); p->iov = NULL; - g_free(p->normal); - p->normal = NULL; multifd_send_state->ops->send_cleanup(p, &local_err); if (local_err) { migrate_set_error(migrate_get_current(), local_err); @@ -688,8 +687,8 @@ static void *multifd_send_thread(void *opaque) if (p->pending_job) { uint64_t packet_num = p->packet_num; + MultiFDPages_t *pages = p->pages; uint32_t flags; - p->normal_num = 0; if (use_zero_copy_send) { p->iovs_num = 0; @@ -697,12 +696,7 @@ static void *multifd_send_thread(void *opaque) p->iovs_num = 1; } - for (int i = 0; i < p->pages->num; i++) { - p->normal[p->normal_num] = p->pages->offset[i]; - p->normal_num++; - } - - if (p->normal_num) { + if (pages->num) { ret = multifd_send_state->ops->send_prepare(p, &local_err); if (ret != 0) { qemu_mutex_unlock(&p->mutex); @@ -713,10 +707,10 @@ static void *multifd_send_thread(void *opaque) flags = p->flags; p->flags = 0; p->num_packets++; - p->total_normal_pages += p->normal_num; + p->total_normal_pages += pages->num; qemu_mutex_unlock(&p->mutex); - trace_multifd_send(p->id, packet_num, p->normal_num, flags, + trace_multifd_send(p->id, packet_num, pages->num, flags, p->next_packet_size); if (use_zero_copy_send) { @@ -924,7 +918,6 @@ int multifd_save_setup(Error **errp) p->name = g_strdup_printf("multifdsend_%d", i); /* We need one extra place for the packet header */ p->iov = g_new0(struct iovec, page_count + 1); - p->normal = g_new0(ram_addr_t, page_count); p->page_size = qemu_target_page_size(); p->page_count = page_count; diff --git a/migration/multifd.h b/migration/multifd.h index 7c040cb85a8..3920bdbcf19 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -122,10 +122,6 @@ typedef struct { struct iovec *iov; /* number of iovs used */ uint32_t iovs_num; - /* Pages that are not zero */ - ram_addr_t *normal; - /* num of non zero pages */ - uint32_t normal_num; /* used for compression methods */ void *data; } MultiFDSendParams; -- Gitee From c18980de7ca5e6445082930b7c6dc012b7946313 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:40 +0800 Subject: [PATCH 30/90] migration/multifd: Separate SYNC request with normal jobs commit f5f48a7891cf6664a920ba52f6f4dea1646049a4 upstream. Multifd provide a threaded model for processing jobs. On sender side, there can be two kinds of job: (1) a list of pages to send, or (2) a sync request. The sync request is a very special kind of job. It never contains a page array, but only a multifd packet telling the dest side to synchronize with sent pages. Before this patch, both requests use the pending_job field, no matter what the request is, it will boost pending_job, while multifd sender thread will decrement it after it finishes one job. However this should be racy, because SYNC is special in that it needs to set p->flags with MULTIFD_FLAG_SYNC, showing that this is a sync request. Consider a sequence of operations where: - migration thread enqueue a job to send some pages, pending_job++ (0->1) - [...before the selected multifd sender thread wakes up...] - migration thread enqueue another job to sync, pending_job++ (1->2), setup p->flags=MULTIFD_FLAG_SYNC - multifd sender thread wakes up, found pending_job==2 - send the 1st packet with MULTIFD_FLAG_SYNC and list of pages - send the 2nd packet with flags==0 and no pages This is not expected, because MULTIFD_FLAG_SYNC should hopefully be done after all the pages are received. Meanwhile, the 2nd packet will be completely useless, which contains zero information. I didn't verify above, but I think this issue is still benign in that at least on the recv side we always receive pages before handling MULTIFD_FLAG_SYNC. However that's not always guaranteed and just tricky. One other reason I want to separate it is using p->flags to communicate between the two threads is also not clearly defined, it's very hard to read and understand why accessing p->flags is always safe; see the current impl of multifd_send_thread() where we tried to cache only p->flags. It doesn't need to be that complicated. This patch introduces pending_sync, a separate flag just to show that the requester needs a sync. Alongside, we remove the tricky caching of p->flags now because after this patch p->flags should only be used by multifd sender thread now, which will be crystal clear. So it is always thread safe to access p->flags. With that, we can also safely convert the pending_job into a boolean, because we don't support >1 pending jobs anyway. Always use atomic ops to access both flags to make sure no cache effect. When at it, drop the initial setting of "pending_job = 0" because it's always allocated using g_new0(). Intel-SIG: commit f5f48a7891cf migration/multifd: Separate SYNC request with normal jobs Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-7-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 39 +++++++++++++++++++++++++-------------- migration/multifd.h | 13 +++++++++++-- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 8bb1fd95cfe..ea25bbe6bd8 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -442,8 +442,8 @@ static int multifd_send_pages(void) } p = &multifd_send_state->params[i]; qemu_mutex_lock(&p->mutex); - if (!p->pending_job) { - p->pending_job++; + if (qatomic_read(&p->pending_job) == false) { + qatomic_set(&p->pending_job, true); next_channel = (i + 1) % migrate_multifd_channels(); break; } @@ -631,8 +631,12 @@ int multifd_send_sync_main(void) qemu_mutex_lock(&p->mutex); p->packet_num = multifd_send_state->packet_num++; - p->flags |= MULTIFD_FLAG_SYNC; - p->pending_job++; + /* + * We should be the only user so far, so not possible to be set by + * others concurrently. + */ + assert(qatomic_read(&p->pending_sync) == false); + qatomic_set(&p->pending_sync, true); qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); } @@ -685,10 +689,9 @@ static void *multifd_send_thread(void *opaque) } qemu_mutex_lock(&p->mutex); - if (p->pending_job) { + if (qatomic_read(&p->pending_job)) { uint64_t packet_num = p->packet_num; MultiFDPages_t *pages = p->pages; - uint32_t flags; if (use_zero_copy_send) { p->iovs_num = 0; @@ -704,13 +707,11 @@ static void *multifd_send_thread(void *opaque) } } multifd_send_fill_packet(p); - flags = p->flags; - p->flags = 0; p->num_packets++; p->total_normal_pages += pages->num; qemu_mutex_unlock(&p->mutex); - trace_multifd_send(p->id, packet_num, pages->num, flags, + trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); if (use_zero_copy_send) { @@ -738,12 +739,23 @@ static void *multifd_send_thread(void *opaque) multifd_pages_reset(p->pages); p->next_packet_size = 0; qemu_mutex_lock(&p->mutex); - p->pending_job--; + qatomic_set(&p->pending_job, false); qemu_mutex_unlock(&p->mutex); - - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&p->sem_sync); + } else if (qatomic_read(&p->pending_sync)) { + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + qemu_mutex_unlock(&p->mutex); + break; } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + p->flags = 0; + qatomic_set(&p->pending_sync, false); + qemu_mutex_unlock(&p->mutex); + qemu_sem_post(&p->sem_sync); } else { qemu_mutex_unlock(&p->mutex); /* sometimes there are spurious wakeups */ @@ -907,7 +919,6 @@ int multifd_save_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); - p->pending_job = 0; p->id = i; p->pages = multifd_pages_init(page_count); p->packet_len = sizeof(MultiFDPacket_t) diff --git a/migration/multifd.h b/migration/multifd.h index 3920bdbcf19..08f26ef3fe3 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -99,8 +99,17 @@ typedef struct { uint32_t flags; /* global number of generated multifd packets */ uint64_t packet_num; - /* thread has work to do */ - int pending_job; + /* + * The sender thread has work to do if either of below boolean is set. + * + * @pending_job: a job is pending + * @pending_sync: a sync request is pending + * + * For both of these fields, they're only set by the requesters, and + * cleared by the multifd sender threads. + */ + bool pending_job; + bool pending_sync; /* array of pages to sent. * The owner of 'pages' depends of 'pending_job' value: * pending_job == 0 -> migration_thread can use it. -- Gitee From 2df63772f0bf31d75c2880e8a19249cdcba94341 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:41 +0800 Subject: [PATCH 31/90] migration/multifd: Simplify locking in sender thread commit e3cce9af10b06c51434ced4e1a6686f1ce43e124 upstream. The sender thread will yield the p->mutex before IO starts, trying to not block the requester thread. This may be unnecessary lock optimizations, because the requester can already read pending_job safely even without the lock, because the requester is currently the only one who can assign a task. Drop that lock complication on both sides: (1) in the sender thread, always take the mutex until job done (2) in the requester thread, check pending_job clear lockless Intel-SIG: commit e3cce9af10b0 migration/multifd: Simplify locking in sender thread Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-8-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index ea25bbe6bd8..4d5a01ed93e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -429,7 +429,9 @@ static int multifd_send_pages(void) return -1; } + /* We wait here, until at least one channel is ready */ qemu_sem_wait(&multifd_send_state->channels_ready); + /* * next_channel can remain from a previous migration that was * using more channels, so ensure it doesn't overflow if the @@ -441,17 +443,26 @@ static int multifd_send_pages(void) return -1; } p = &multifd_send_state->params[i]; - qemu_mutex_lock(&p->mutex); + /* + * Lockless read to p->pending_job is safe, because only multifd + * sender thread can clear it. + */ if (qatomic_read(&p->pending_job) == false) { - qatomic_set(&p->pending_job, true); next_channel = (i + 1) % migrate_multifd_channels(); break; } - qemu_mutex_unlock(&p->mutex); } + + qemu_mutex_lock(&p->mutex); assert(!p->pages->num); assert(!p->pages->block); - + /* + * Double check on pending_job==false with the lock. In the future if + * we can have >1 requester thread, we can replace this with a "goto + * retry", but that is for later. + */ + assert(qatomic_read(&p->pending_job) == false); + qatomic_set(&p->pending_job, true); p->packet_num = multifd_send_state->packet_num++; multifd_send_state->pages = p->pages; p->pages = pages; @@ -709,8 +720,6 @@ static void *multifd_send_thread(void *opaque) multifd_send_fill_packet(p); p->num_packets++; p->total_normal_pages += pages->num; - qemu_mutex_unlock(&p->mutex); - trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); @@ -730,6 +739,7 @@ static void *multifd_send_thread(void *opaque) ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, 0, p->write_flags, &local_err); if (ret != 0) { + qemu_mutex_unlock(&p->mutex); break; } @@ -738,7 +748,6 @@ static void *multifd_send_thread(void *opaque) multifd_pages_reset(p->pages); p->next_packet_size = 0; - qemu_mutex_lock(&p->mutex); qatomic_set(&p->pending_job, false); qemu_mutex_unlock(&p->mutex); } else if (qatomic_read(&p->pending_sync)) { -- Gitee From 8c4fa4234af2781f9479dd48dc197e23a182c184 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:42 +0800 Subject: [PATCH 32/90] migration/multifd: Drop pages->num check in sender thread commit 83c560fb4249ee5698652249e0c1730c3d611a9b upstream. Now with a split SYNC handler, we always have pages->num set for pending_job==true. Assert it instead. Intel-SIG: commit 83c560fb4249 migration/multifd: Drop pages->num check in sender thread Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-9-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 4d5a01ed93e..518f9de7234 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -710,13 +710,14 @@ static void *multifd_send_thread(void *opaque) p->iovs_num = 1; } - if (pages->num) { - ret = multifd_send_state->ops->send_prepare(p, &local_err); - if (ret != 0) { - qemu_mutex_unlock(&p->mutex); - break; - } + assert(pages->num); + + ret = multifd_send_state->ops->send_prepare(p, &local_err); + if (ret != 0) { + qemu_mutex_unlock(&p->mutex); + break; } + multifd_send_fill_packet(p); p->num_packets++; p->total_normal_pages += pages->num; -- Gitee From 8a9fd02ccc266ed47844d10862a1dc47d28aa128 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:43 +0800 Subject: [PATCH 33/90] migration/multifd: Rename p->num_packets and clean it up commit 05b7ec1890158471afb8537a6817a7e0d0a6c938 upstream. This field, no matter whether on src or dest, is only used for debugging purpose. They can even be removed already, unless it still more or less provide some accounting on "how many packets are sent/recved for this thread". The other more important one is called packet_num, which is embeded in the multifd packet headers (MultiFDPacket_t). So let's keep them for now, but make them much easier to understand, by doing below: - Rename both of them to packets_sent / packets_recved, the old name (num_packets) are waaay too confusing when we already have MultiFDPacket_t.packets_num. - Avoid worrying on the "initial packet": we know we will send it, that's good enough. The accounting won't matter a great deal to start with 0 or with 1. - Move them to where we send/recv the packets. They're: - multifd_send_fill_packet() for senders. - multifd_recv_unfill_packet() for receivers. Intel-SIG: commit 05b7ec189015 migration/multifd: Rename p->num_packets and clean it up Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-10-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 13 +++++-------- migration/multifd.h | 6 +++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 518f9de7234..eca76e2c18c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -288,6 +288,8 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) packet->offset[i] = cpu_to_be64(temp); } + + p->packets_sent++; } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -335,6 +337,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); + p->packets_recved++; if (p->normal_num == 0) { return 0; @@ -688,8 +691,6 @@ static void *multifd_send_thread(void *opaque) ret = -1; goto out; } - /* initial packet */ - p->num_packets = 1; while (true) { qemu_sem_post(&multifd_send_state->channels_ready); @@ -719,7 +720,6 @@ static void *multifd_send_thread(void *opaque) } multifd_send_fill_packet(p); - p->num_packets++; p->total_normal_pages += pages->num; trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); @@ -787,7 +787,7 @@ out: rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); return NULL; } @@ -1124,7 +1124,6 @@ static void *multifd_recv_thread(void *opaque) p->flags &= ~MULTIFD_FLAG_SYNC; trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, p->next_packet_size); - p->num_packets++; p->total_normal_pages += p->normal_num; qemu_mutex_unlock(&p->mutex); @@ -1150,7 +1149,7 @@ static void *multifd_recv_thread(void *opaque) qemu_mutex_unlock(&p->mutex); rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->num_packets, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); return NULL; } @@ -1252,8 +1251,6 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) } p->c = ioc; object_ref(OBJECT(ioc)); - /* initial packet */ - p->num_packets = 1; p->running = true; qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, diff --git a/migration/multifd.h b/migration/multifd.h index 08f26ef3fe3..2e4ad0dc56c 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -124,7 +124,7 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; /* packets sent through this channel */ - uint64_t num_packets; + uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; /* buffers to send */ @@ -174,8 +174,8 @@ typedef struct { MultiFDPacket_t *packet; /* size of the next packet that contains pages */ uint32_t next_packet_size; - /* packets sent through this channel */ - uint64_t num_packets; + /* packets received through this channel */ + uint64_t packets_recved; /* ramblock */ RAMBlock *block; /* ramblock host address */ -- Gitee From 4f223b20901aa93ed1f44359287b4505a94ecc6d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:44 +0800 Subject: [PATCH 34/90] migration/multifd: Move total_normal_pages accounting commit db7e1cc5103137743394a939045a17fa2b30a0dc upstream. Just like the previous patch, move the accounting for total_normal_pages on both src/dst sides into the packet fill/unfill procedures. Intel-SIG: commit db7e1cc51031 migration/multifd: Move total_normal_pages accounting Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-11-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index eca76e2c18c..94a01249346 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -290,6 +290,7 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) } p->packets_sent++; + p->total_normal_pages += pages->num; } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -338,6 +339,7 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); p->packets_recved++; + p->total_normal_pages += p->normal_num; if (p->normal_num == 0) { return 0; @@ -720,7 +722,6 @@ static void *multifd_send_thread(void *opaque) } multifd_send_fill_packet(p); - p->total_normal_pages += pages->num; trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); @@ -1124,7 +1125,6 @@ static void *multifd_recv_thread(void *opaque) p->flags &= ~MULTIFD_FLAG_SYNC; trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, p->next_packet_size); - p->total_normal_pages += p->normal_num; qemu_mutex_unlock(&p->mutex); if (p->normal_num) { -- Gitee From 2cc17fa2196a0e288ff2cfe617dca20f9d5ef969 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:45 +0800 Subject: [PATCH 35/90] migration/multifd: Move trace_multifd_send|recv() commit 8a9ef1738037e1d1132f9e1bd3e2f1102bde719f upstream. Move them into fill/unfill of packets. With that, we can further cleanup the send/recv thread procedure, and remove one more temp var. Intel-SIG: commit 8a9ef1738037 migration/multifd: Move trace_multifd_send|recv() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-12-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 94a01249346..44163e4e289 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -291,6 +291,9 @@ static void multifd_send_fill_packet(MultiFDSendParams *p) p->packets_sent++; p->total_normal_pages += pages->num; + + trace_multifd_send(p->id, p->packet_num, pages->num, p->flags, + p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -341,6 +344,9 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->packets_recved++; p->total_normal_pages += p->normal_num; + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, + p->next_packet_size); + if (p->normal_num == 0) { return 0; } @@ -704,7 +710,6 @@ static void *multifd_send_thread(void *opaque) qemu_mutex_lock(&p->mutex); if (qatomic_read(&p->pending_job)) { - uint64_t packet_num = p->packet_num; MultiFDPages_t *pages = p->pages; if (use_zero_copy_send) { @@ -722,8 +727,6 @@ static void *multifd_send_thread(void *opaque) } multifd_send_fill_packet(p); - trace_multifd_send(p->id, packet_num, pages->num, p->flags, - p->next_packet_size); if (use_zero_copy_send) { /* Send header first, without zerocopy */ @@ -1123,8 +1126,6 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, flags, - p->next_packet_size); qemu_mutex_unlock(&p->mutex); if (p->normal_num) { -- Gitee From 1c985de91dc88274c680c714b617a059bbe6de43 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:46 +0800 Subject: [PATCH 36/90] migration/multifd: multifd_send_prepare_header() commit 452b205702335ddd45554aaf0eb37baf50bdfa00 upstream. Introduce a helper multifd_send_prepare_header() to setup the header packet for multifd sender. It's fine to setup the IOV[0] _before_ send_prepare() because the packet buffer is already ready, even if the content is to be filled in. With this helper, we can already slightly clean up the zero copy path. Note that I explicitly put it into multifd.h, because I want it inlined directly into multifd*.c where necessary later. Intel-SIG: commit 452b20570233 migration/multifd: multifd_send_prepare_header() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-13-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 16 ++++++++-------- migration/multifd.h | 8 ++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 44163e4e289..cd4467aff4f 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -712,10 +712,14 @@ static void *multifd_send_thread(void *opaque) if (qatomic_read(&p->pending_job)) { MultiFDPages_t *pages = p->pages; - if (use_zero_copy_send) { - p->iovs_num = 0; - } else { - p->iovs_num = 1; + p->iovs_num = 0; + + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will + * send it separately. + */ + multifd_send_prepare_header(p); } assert(pages->num); @@ -735,10 +739,6 @@ static void *multifd_send_thread(void *opaque) if (ret != 0) { break; } - } else { - /* Send header using the same writev call */ - p->iov[0].iov_len = p->packet_len; - p->iov[0].iov_base = p->packet; } ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, diff --git a/migration/multifd.h b/migration/multifd.h index 2e4ad0dc56c..4ec005f53f7 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -209,5 +209,13 @@ typedef struct { void multifd_register_ops(int method, MultiFDMethods *ops); +static inline void multifd_send_prepare_header(MultiFDSendParams *p) +{ + p->iov[0].iov_len = p->packet_len; + p->iov[0].iov_base = p->packet; + p->iovs_num++; +} + + #endif -- Gitee From a1eeffb89e3c45afe534e1364d1c9499dce467e6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:47 +0800 Subject: [PATCH 37/90] migration/multifd: Move header prepare/fill into send_prepare() commit 25a1f8787597f6906b151b2f73ae6cc92a31de57 upstream. This patch redefines the interfacing of ->send_prepare(). It further simplifies multifd_send_thread() especially on zero copy. Now with the new interface, we require the hook to do all the work for preparing the IOVs to send. After it's completed, the IOVs should be ready to be dumped into the specific multifd QIOChannel later. So now the API looks like: p->pages -----------> send_prepare() -------------> IOVs This also prepares for the case where the input can be extended to even not any p->pages. But that's for later. This patch will achieve similar goal of what Fabiano used to propose here: https://lore.kernel.org/r/20240126221943.26628-1-farosas@suse.de However the send() interface may not be necessary. I'm boldly attaching a "Co-developed-by" for Fabiano. Intel-SIG: commit 25a1f8787597 migration/multifd: Move header prepare/fill into send_prepare() Co-developed-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-14-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 4 +++ migration/multifd-zstd.c | 4 +++ migration/multifd.c | 61 ++++++++++++++++++---------------------- migration/multifd.h | 1 + 4 files changed, 37 insertions(+), 33 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 100809abc16..012e3bdea1d 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -123,6 +123,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; + multifd_send_prepare_header(p); + for (i = 0; i < pages->num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; @@ -172,6 +174,8 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = out_size; p->flags |= MULTIFD_FLAG_ZLIB; + multifd_send_fill_packet(p); + return 0; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 2023edd8cc1..dc8fe43e948 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -118,6 +118,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; + multifd_send_prepare_header(p); + z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; @@ -161,6 +163,8 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = z->out.pos; p->flags |= MULTIFD_FLAG_ZSTD; + multifd_send_fill_packet(p); + return 0; } diff --git a/migration/multifd.c b/migration/multifd.c index cd4467aff4f..6aa44340dec 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -50,15 +50,15 @@ typedef struct { /** * nocomp_send_setup: setup send side * - * For no compression this function does nothing. - * - * Returns 0 for success or -1 for error - * * @p: Params for the channel that we are using * @errp: pointer to an error */ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) { + if (migrate_zero_copy_send()) { + p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; + } + return 0; } @@ -88,7 +88,17 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { + bool use_zero_copy_send = migrate_zero_copy_send(); MultiFDPages_t *pages = p->pages; + int ret; + + if (!use_zero_copy_send) { + /* + * Only !zerocopy needs the header in IOV; zerocopy will + * send it separately. + */ + multifd_send_prepare_header(p); + } for (int i = 0; i < pages->num; i++) { p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; @@ -98,6 +108,18 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) p->next_packet_size = pages->num * p->page_size; p->flags |= MULTIFD_FLAG_NOCOMP; + + multifd_send_fill_packet(p); + + if (use_zero_copy_send) { + /* Send header first, without zerocopy */ + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, errp); + if (ret != 0) { + return -1; + } + } + return 0; } @@ -266,7 +288,7 @@ static void multifd_pages_clear(MultiFDPages_t *pages) g_free(pages); } -static void multifd_send_fill_packet(MultiFDSendParams *p) +void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; @@ -688,7 +710,6 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; - bool use_zero_copy_send = migrate_zero_copy_send(); thread = migration_threads_add(p->name, qemu_get_thread_id()); @@ -713,15 +734,6 @@ static void *multifd_send_thread(void *opaque) MultiFDPages_t *pages = p->pages; p->iovs_num = 0; - - if (!use_zero_copy_send) { - /* - * Only !zerocopy needs the header in IOV; zerocopy will - * send it separately. - */ - multifd_send_prepare_header(p); - } - assert(pages->num); ret = multifd_send_state->ops->send_prepare(p, &local_err); @@ -730,17 +742,6 @@ static void *multifd_send_thread(void *opaque) break; } - multifd_send_fill_packet(p); - - if (use_zero_copy_send) { - /* Send header first, without zerocopy */ - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; - } - } - ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, 0, p->write_flags, &local_err); if (ret != 0) { @@ -945,13 +946,7 @@ int multifd_save_setup(Error **errp) p->iov = g_new0(struct iovec, page_count + 1); p->page_size = qemu_target_page_size(); p->page_count = page_count; - - if (migrate_zero_copy_send()) { - p->write_flags = QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; - } else { - p->write_flags = 0; - } - + p->write_flags = 0; multifd_new_send_channel_create(p); } diff --git a/migration/multifd.h b/migration/multifd.h index 4ec005f53f7..34a2ecb9f42 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -208,6 +208,7 @@ typedef struct { } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); +void multifd_send_fill_packet(MultiFDSendParams *p); static inline void multifd_send_prepare_header(MultiFDSendParams *p) { -- Gitee From a8a64660ed908edaa979ede483762a6a691f9688 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:48 +0800 Subject: [PATCH 38/90] migration/multifd: Forbid spurious wakeups commit 859ebaf346e8b5dece6cf255c604fe953d8ec9ab upstream. Now multifd's logic is designed to have no spurious wakeup. I still remember a talk to Juan and he seems to agree we should drop it now, and if my memory was right it was there because multifd used to hit that when still debugging. Let's drop it and see what can explode; as long as it's not reaching soft-freeze. Intel-SIG: commit 859ebaf346e8 migration/multifd: Forbid spurious wakeups Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-15-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 6aa44340dec..28b54100cd8 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -756,7 +756,9 @@ static void *multifd_send_thread(void *opaque) p->next_packet_size = 0; qatomic_set(&p->pending_job, false); qemu_mutex_unlock(&p->mutex); - } else if (qatomic_read(&p->pending_sync)) { + } else { + /* If not a normal job, must be a sync request */ + assert(qatomic_read(&p->pending_sync)); p->flags = MULTIFD_FLAG_SYNC; multifd_send_fill_packet(p); ret = qio_channel_write_all(p->c, (void *)p->packet, @@ -771,9 +773,6 @@ static void *multifd_send_thread(void *opaque) qatomic_set(&p->pending_sync, false); qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem_sync); - } else { - qemu_mutex_unlock(&p->mutex); - /* sometimes there are spurious wakeups */ } } -- Gitee From c924d002f62e3b4c0499854f28153c6f45855665 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:49 +0800 Subject: [PATCH 39/90] migration/multifd: Split multifd_send_terminate_threads() commit 3ab4441d97af59ea09ee015d68c4770704b2b34f upstream. Split multifd_send_terminate_threads() into two functions: - multifd_send_set_error(): used when an error happened on the sender side, set error and quit state only - multifd_send_terminate_threads(): used only by the main thread to kick all multifd send threads out of sleep, for the last recycling. Use multifd_send_set_error() in the three old call sites where only the error will be set. Use multifd_send_terminate_threads() in the last one where the main thread will kick the multifd threads at last in multifd_save_cleanup(). Both helpers will need to set quitting=1. Intel-SIG: commit 3ab4441d97af migration/multifd: Split multifd_send_terminate_threads() Suggested-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-16-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 27 ++++++++++++++++++--------- migration/trace-events | 2 +- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 28b54100cd8..ba86f9dda53 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -536,10 +536,9 @@ int multifd_queue_page(RAMBlock *block, ram_addr_t offset) return 1; } -static void multifd_send_terminate_threads(Error *err) +/* Multifd send side hit an error; remember it and prepare to quit */ +static void multifd_send_set_error(Error *err) { - int i; - /* * We don't want to exit each threads twice. Depending on where * we get the error, or if there are two independent errors in two @@ -550,8 +549,6 @@ static void multifd_send_terminate_threads(Error *err) return; } - trace_multifd_send_terminate_threads(err != NULL); - if (err) { MigrationState *s = migrate_get_current(); migrate_set_error(s, err); @@ -563,7 +560,19 @@ static void multifd_send_terminate_threads(Error *err) MIGRATION_STATUS_FAILED); } } +} + +static void multifd_send_terminate_threads(void) +{ + int i; + + trace_multifd_send_terminate_threads(); + /* + * Tell everyone we're quitting. No xchg() needed here; we simply + * always set it. + */ + qatomic_set(&multifd_send_state->exiting, 1); for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -586,7 +595,7 @@ void multifd_save_cleanup(void) if (!migrate_multifd()) { return; } - multifd_send_terminate_threads(NULL); + multifd_send_terminate_threads(); for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -780,7 +789,7 @@ out: if (ret) { assert(local_err); trace_multifd_send_error(p->id); - multifd_send_terminate_threads(local_err); + multifd_send_set_error(local_err); multifd_send_kick_main(p); error_free(local_err); } @@ -816,7 +825,7 @@ static void multifd_tls_outgoing_handshake(QIOTask *task, trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - multifd_send_terminate_threads(err); + multifd_send_set_error(err); multifd_send_kick_main(p); error_free(err); } @@ -898,7 +907,7 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) } trace_multifd_new_send_channel_async_error(p->id, local_err); - multifd_send_terminate_threads(local_err); + multifd_send_set_error(local_err); multifd_send_kick_main(p); object_unref(OBJECT(ioc)); error_free(local_err); diff --git a/migration/trace-events b/migration/trace-events index de4a743c8a7..298ad2b0dd6 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -141,7 +141,7 @@ multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" -multifd_send_terminate_threads(bool error) "error %d" +multifd_send_terminate_threads(void) "" multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" -- Gitee From 0d04230bc298e9399a33409232c4cfc3d596ca7b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:50 +0800 Subject: [PATCH 40/90] migration/multifd: Change retval of multifd_queue_page() commit d6556d174a6b9fc443f2320193f18e71eb67052a upstream. Using int is an overkill when there're only two options. Change it to a boolean. Intel-SIG: commit d6556d174a6b migration/multifd: Change retval of multifd_queue_page() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-17-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 9 +++++---- migration/multifd.h | 2 +- migration/ram.c | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index ba86f9dda53..12e587fda80 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -505,7 +505,8 @@ static int multifd_send_pages(void) return 1; } -int multifd_queue_page(RAMBlock *block, ram_addr_t offset) +/* Returns true if enqueue successful, false otherwise */ +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) { MultiFDPages_t *pages = multifd_send_state->pages; bool changed = false; @@ -519,21 +520,21 @@ int multifd_queue_page(RAMBlock *block, ram_addr_t offset) pages->num++; if (pages->num < pages->allocated) { - return 1; + return true; } } else { changed = true; } if (multifd_send_pages() < 0) { - return -1; + return false; } if (changed) { return multifd_queue_page(block, offset); } - return 1; + return true; } /* Multifd send side hit an error; remember it and prepare to quit */ diff --git a/migration/multifd.h b/migration/multifd.h index 34a2ecb9f42..a320c53a6f4 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -22,7 +22,7 @@ bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); int multifd_send_sync_main(void); -int multifd_queue_page(RAMBlock *block, ram_addr_t offset); +bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) diff --git a/migration/ram.c b/migration/ram.c index 4706dcda7dd..9d176281008 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1386,7 +1386,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus *pss) static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) { - if (multifd_queue_page(block, offset) < 0) { + if (!multifd_queue_page(block, offset)) { return -1; } stat64_add(&mig_stats.normal_pages, 1); -- Gitee From 991fc020ce88b835267c05302d05b8b32884a831 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:51 +0800 Subject: [PATCH 41/90] migration/multifd: Change retval of multifd_send_pages() commit 3b40964a863d69121733c8b9794a02347ed0000b upstream. Using int is an overkill when there're only two options. Change it to a boolean. Intel-SIG: commit 3b40964a863d migration/multifd: Change retval of multifd_send_pages() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-18-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 12e587fda80..35d4e8ad1f0 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -449,9 +449,10 @@ static void multifd_send_kick_main(MultiFDSendParams *p) * thread is using the channel mutex when changing it, and the channel * have to had finish with its own, otherwise pending_job can't be * false. + * + * Returns true if succeed, false otherwise. */ - -static int multifd_send_pages(void) +static bool multifd_send_pages(void) { int i; static int next_channel; @@ -459,7 +460,7 @@ static int multifd_send_pages(void) MultiFDPages_t *pages = multifd_send_state->pages; if (multifd_send_should_exit()) { - return -1; + return false; } /* We wait here, until at least one channel is ready */ @@ -473,7 +474,7 @@ static int multifd_send_pages(void) next_channel %= migrate_multifd_channels(); for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) { if (multifd_send_should_exit()) { - return -1; + return false; } p = &multifd_send_state->params[i]; /* @@ -502,7 +503,7 @@ static int multifd_send_pages(void) qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); - return 1; + return true; } /* Returns true if enqueue successful, false otherwise */ @@ -526,7 +527,7 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) changed = true; } - if (multifd_send_pages() < 0) { + if (!multifd_send_pages()) { return false; } @@ -666,7 +667,7 @@ int multifd_send_sync_main(void) return 0; } if (multifd_send_state->pages->num) { - if (multifd_send_pages() < 0) { + if (!multifd_send_pages()) { error_report("%s: multifd_send_pages fail", __func__); return -1; } -- Gitee From ce9aefc58e9b0ab02c4310d1f79dd0002951a728 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:52 +0800 Subject: [PATCH 42/90] migration/multifd: Rewrite multifd_queue_page() commit f88f86c4ee3fe673b34873e27af2de0a16fe01fd upstream. The current multifd_queue_page() is not easy to read and follow. It is not good with a few reasons: - No helper at all to show what exactly does a condition mean; in short, readability is low. - Rely on pages->ramblock being cleared to detect an empty queue. It's slightly an overload of the ramblock pointer, per Fabiano [1], which I also agree. - Contains a self recursion, even if not necessary.. Rewrite this function. We add some comments to make it even clearer on what it does. [1] https://lore.kernel.org/r/87wmrpjzew.fsf@suse.de Intel-SIG: commit f88f86c4ee3f migration/multifd: Rewrite multifd_queue_page() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-19-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 56 ++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 35d4e8ad1f0..4ab8e6eff22 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -506,35 +506,53 @@ static bool multifd_send_pages(void) return true; } +static inline bool multifd_queue_empty(MultiFDPages_t *pages) +{ + return pages->num == 0; +} + +static inline bool multifd_queue_full(MultiFDPages_t *pages) +{ + return pages->num == pages->allocated; +} + +static inline void multifd_enqueue(MultiFDPages_t *pages, ram_addr_t offset) +{ + pages->offset[pages->num++] = offset; +} + /* Returns true if enqueue successful, false otherwise */ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset) { - MultiFDPages_t *pages = multifd_send_state->pages; - bool changed = false; + MultiFDPages_t *pages; + +retry: + pages = multifd_send_state->pages; - if (!pages->block) { + /* If the queue is empty, we can already enqueue now */ + if (multifd_queue_empty(pages)) { pages->block = block; + multifd_enqueue(pages, offset); + return true; } - if (pages->block == block) { - pages->offset[pages->num] = offset; - pages->num++; - - if (pages->num < pages->allocated) { - return true; + /* + * Not empty, meanwhile we need a flush. It can because of either: + * + * (1) The page is not on the same ramblock of previous ones, or, + * (2) The queue is full. + * + * After flush, always retry. + */ + if (pages->block != block || multifd_queue_full(pages)) { + if (!multifd_send_pages()) { + return false; } - } else { - changed = true; - } - - if (!multifd_send_pages()) { - return false; - } - - if (changed) { - return multifd_queue_page(block, offset); + goto retry; } + /* Not empty, and we still have space, do it! */ + multifd_enqueue(pages, offset); return true; } -- Gitee From c429c88457585082c90d424ed6c2783bdee81d4e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:53 +0800 Subject: [PATCH 43/90] migration/multifd: Cleanup multifd_save_cleanup() commit 12808db3b8c22d26c9bc3da6f41756890ce882e4 upstream. Shrink the function by moving relevant works into helpers: move the thread join()s into multifd_send_terminate_threads(), then create two more helpers to cover channel/state cleanups. Add a TODO entry for the thread terminate process because p->running is still buggy. We need to fix it at some point but not yet covered. Intel-SIG: commit 12808db3b8c2 migration/multifd: Cleanup multifd_save_cleanup() Suggested-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-20-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 91 +++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 32 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 4ab8e6eff22..4cb0d2cc173 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -593,6 +593,11 @@ static void multifd_send_terminate_threads(void) * always set it. */ qatomic_set(&multifd_send_state->exiting, 1); + + /* + * Firstly, kick all threads out; no matter whether they are just idle, + * or blocked in an IO system call. + */ for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -601,6 +606,21 @@ static void multifd_send_terminate_threads(void) qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } } + + /* + * Finally recycle all the threads. + * + * TODO: p->running is still buggy, e.g. we can reach here without the + * corresponding multifd_new_send_channel_async() get invoked yet, + * then a new thread can even be created after this function returns. + */ + for (i = 0; i < migrate_multifd_channels(); i++) { + MultiFDSendParams *p = &multifd_send_state->params[i]; + + if (p->running) { + qemu_thread_join(&p->thread); + } + } } static int multifd_send_channel_destroy(QIOChannel *send) @@ -608,6 +628,41 @@ static int multifd_send_channel_destroy(QIOChannel *send) return socket_send_channel_destroy(send); } +static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) +{ + if (p->registered_yank) { + migration_ioc_unregister_yank(p->c); + } + multifd_send_channel_destroy(p->c); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + multifd_pages_clear(p->pages); + p->pages = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; + multifd_send_state->ops->send_cleanup(p, errp); + + return *errp == NULL; +} + +static void multifd_send_cleanup_state(void) +{ + qemu_sem_destroy(&multifd_send_state->channels_ready); + g_free(multifd_send_state->params); + multifd_send_state->params = NULL; + multifd_pages_clear(multifd_send_state->pages); + multifd_send_state->pages = NULL; + g_free(multifd_send_state); + multifd_send_state = NULL; +} + void multifd_save_cleanup(void) { int i; @@ -615,48 +670,20 @@ void multifd_save_cleanup(void) if (!migrate_multifd()) { return; } + multifd_send_terminate_threads(); - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDSendParams *p = &multifd_send_state->params[i]; - if (p->running) { - qemu_thread_join(&p->thread); - } - } for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; Error *local_err = NULL; - if (p->registered_yank) { - migration_ioc_unregister_yank(p->c); - } - multifd_send_channel_destroy(p->c); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - multifd_pages_clear(p->pages); - p->pages = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - multifd_send_state->ops->send_cleanup(p, &local_err); - if (local_err) { + if (!multifd_send_cleanup_channel(p, &local_err)) { migrate_set_error(migrate_get_current(), local_err); error_free(local_err); } } - qemu_sem_destroy(&multifd_send_state->channels_ready); - g_free(multifd_send_state->params); - multifd_send_state->params = NULL; - multifd_pages_clear(multifd_send_state->pages); - multifd_send_state->pages = NULL; - g_free(multifd_send_state); - multifd_send_state = NULL; + + multifd_send_cleanup_state(); } static int multifd_zero_copy_flush(QIOChannel *c) -- Gitee From 8de765d55e7c11a031aca0576c3a289048446e64 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:54 +0800 Subject: [PATCH 44/90] migration/multifd: Cleanup multifd_load_cleanup() commit 5e6ea8a1d64e72e648b5a5277f08ec7fb09c3b8e upstream. Use similar logic to cleanup the recv side. Note that multifd_recv_terminate_threads() may need some similar rework like the sender side, but let's leave that for later. Intel-SIG: commit 5e6ea8a1d64e migration/multifd: Cleanup multifd_load_cleanup() Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-21-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 52 ++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 4cb0d2cc173..e2dd2f6e045 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1070,6 +1070,34 @@ void multifd_load_shutdown(void) } } +static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) +{ + migration_ioc_unregister_yank(p->c); + object_unref(OBJECT(p->c)); + p->c = NULL; + qemu_mutex_destroy(&p->mutex); + qemu_sem_destroy(&p->sem_sync); + g_free(p->name); + p->name = NULL; + p->packet_len = 0; + g_free(p->packet); + p->packet = NULL; + g_free(p->iov); + p->iov = NULL; + g_free(p->normal); + p->normal = NULL; + multifd_recv_state->ops->recv_cleanup(p); +} + +static void multifd_recv_cleanup_state(void) +{ + qemu_sem_destroy(&multifd_recv_state->sem_sync); + g_free(multifd_recv_state->params); + multifd_recv_state->params = NULL; + g_free(multifd_recv_state); + multifd_recv_state = NULL; +} + void multifd_load_cleanup(void) { int i; @@ -1092,29 +1120,9 @@ void multifd_load_cleanup(void) qemu_thread_join(&p->thread); } for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - - migration_ioc_unregister_yank(p->c); - object_unref(OBJECT(p->c)); - p->c = NULL; - qemu_mutex_destroy(&p->mutex); - qemu_sem_destroy(&p->sem_sync); - g_free(p->name); - p->name = NULL; - p->packet_len = 0; - g_free(p->packet); - p->packet = NULL; - g_free(p->iov); - p->iov = NULL; - g_free(p->normal); - p->normal = NULL; - multifd_recv_state->ops->recv_cleanup(p); + multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); } - qemu_sem_destroy(&multifd_recv_state->sem_sync); - g_free(multifd_recv_state->params); - multifd_recv_state->params = NULL; - g_free(multifd_recv_state); - multifd_recv_state = NULL; + multifd_recv_cleanup_state(); } void multifd_recv_sync_main(void) -- Gitee From 89fb0318d215c7f6dd7b2c38cc3c07f2e7c508dc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:55 +0800 Subject: [PATCH 45/90] migration/multifd: Stick with send/recv on function names commit cde85c37ca54e4a2dbee8653181938499887f6be upstream. Most of the multifd code uses send/recv to represent the two sides, but some rare cases use save/load. Since send/recv is the majority, replacing the save/load use cases to use send/recv globally. Now we reach a consensus on the naming. Intel-SIG: commit cde85c37ca54 migration/multifd: Stick with send/recv on function names Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-22-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 12 ++++++------ migration/multifd.c | 10 +++++----- migration/multifd.h | 10 +++++----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 699ba0c8347..0765b515ea2 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -276,7 +276,7 @@ void migration_incoming_state_destroy(void) { struct MigrationIncomingState *mis = migration_incoming_get_current(); - multifd_load_cleanup(); + multifd_recv_cleanup(); compress_threads_load_cleanup(); if (mis->to_src_file) { @@ -629,7 +629,7 @@ static void process_incoming_migration_bh(void *opaque) trace_vmstate_downtime_checkpoint("dst-precopy-bh-announced"); - multifd_load_shutdown(); + multifd_recv_shutdown(); dirty_bitmap_mig_before_vm_start(); @@ -730,7 +730,7 @@ fail: MIGRATION_STATUS_FAILED); qemu_fclose(mis->from_src_file); - multifd_load_cleanup(); + multifd_recv_cleanup(); compress_threads_load_cleanup(); exit(EXIT_FAILURE); @@ -863,7 +863,7 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp) default_channel = !mis->from_src_file; } - if (multifd_load_setup(errp) != 0) { + if (multifd_recv_setup(errp) != 0) { return; } @@ -1315,7 +1315,7 @@ static void migrate_fd_cleanup(MigrationState *s) } qemu_mutex_lock_iothread(); - multifd_save_cleanup(); + multifd_send_shutdown(); qemu_mutex_lock(&s->qemu_file_lock); tmp = s->to_dst_file; s->to_dst_file = NULL; @@ -3650,7 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (multifd_save_setup(&local_err) != 0) { + if (multifd_send_setup(&local_err) != 0) { migrate_set_error(s, local_err); error_report_err(local_err); migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, diff --git a/migration/multifd.c b/migration/multifd.c index e2dd2f6e045..130f86a1fbb 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -663,7 +663,7 @@ static void multifd_send_cleanup_state(void) multifd_send_state = NULL; } -void multifd_save_cleanup(void) +void multifd_send_shutdown(void) { int i; @@ -965,7 +965,7 @@ static void multifd_new_send_channel_create(gpointer opaque) socket_send_channel_create(multifd_new_send_channel_async, opaque); } -int multifd_save_setup(Error **errp) +int multifd_send_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); @@ -1063,7 +1063,7 @@ static void multifd_recv_terminate_threads(Error *err) } } -void multifd_load_shutdown(void) +void multifd_recv_shutdown(void) { if (migrate_multifd()) { multifd_recv_terminate_threads(NULL); @@ -1098,7 +1098,7 @@ static void multifd_recv_cleanup_state(void) multifd_recv_state = NULL; } -void multifd_load_cleanup(void) +void multifd_recv_cleanup(void) { int i; @@ -1213,7 +1213,7 @@ static void *multifd_recv_thread(void *opaque) return NULL; } -int multifd_load_setup(Error **errp) +int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); diff --git a/migration/multifd.h b/migration/multifd.h index a320c53a6f4..9b40a53cb67 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,11 +13,11 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H -int multifd_save_setup(Error **errp); -void multifd_save_cleanup(void); -int multifd_load_setup(Error **errp); -void multifd_load_cleanup(void); -void multifd_load_shutdown(void); +int multifd_send_setup(Error **errp); +void multifd_send_shutdown(void); +int multifd_recv_setup(Error **errp); +void multifd_recv_cleanup(void); +void multifd_recv_shutdown(void); bool multifd_recv_all_channels_created(void); void multifd_recv_new_channel(QIOChannel *ioc, Error **errp); void multifd_recv_sync_main(void); -- Gitee From 8fdd0750061e22c6b83a5d7782034886a592dcb7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:56 +0800 Subject: [PATCH 46/90] migration/multifd: Fix MultiFDSendParams.packet_num race commit 98ea497d8b8a5076be7b6ceb0dcc4a475373eb76 upstream. As reported correctly by Fabiano [1] (while per Fabiano, it sourced back to Elena's initial report in Oct 2023), MultiFDSendParams.packet_num is buggy to be assigned and stored. Consider two consequent operations of: (1) queue a job into multifd send thread X, then (2) queue another sync request to the same send thread X. Then the MultiFDSendParams.packet_num will be assigned twice, and the first assignment can get lost already. To avoid that, we move the packet_num assignment from p->packet_num into where the thread will fill in the packet. Use atomic operations to protect the field, making sure there's no race. Note that atomic fetch_add() may not be good for scaling purposes, however multifd should be fine as number of threads should normally not go beyond 16 threads. Let's leave that concern for later but fix the issue first. There's also a trick on how to make it always work even on 32 bit hosts for uint64_t packet number. Switching to uintptr_t as of now to simply the case. It will cause packet number to overflow easier on 32 bit, but that shouldn't be a major concern for now as 32 bit systems is not the major audience for any performance concerns like what multifd wants to address. We also need to move multifd_send_state definition upper, so that multifd_send_fill_packet() can reference it. [1] https://lore.kernel.org/r/87o7d1jlu5.fsf@suse.de Intel-SIG: commit 98ea497d8b8a migration/multifd: Fix MultiFDSendParams.packet_num race Reported-by: Elena Ufimtseva Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-23-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 56 +++++++++++++++++++++++++++------------------ migration/multifd.h | 2 -- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 130f86a1fbb..b317d57d619 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -45,6 +45,35 @@ typedef struct { uint64_t unused2[4]; /* Reserved for future use */ } __attribute__((packed)) MultiFDInit_t; +struct { + MultiFDSendParams *params; + /* array of pages to sent */ + MultiFDPages_t *pages; + /* + * Global number of generated multifd packets. + * + * Note that we used 'uintptr_t' because it'll naturally support atomic + * operations on both 32bit / 64 bits hosts. It means on 32bit systems + * multifd will overflow the packet_num easier, but that should be + * fine. + * + * Another option is to use QEMU's Stat64 then it'll be 64 bits on all + * hosts, however so far it does not support atomic fetch_add() yet. + * Make it easy for now. + */ + uintptr_t packet_num; + /* send channels ready */ + QemuSemaphore channels_ready; + /* + * Have we already run terminate threads. There is a race when it + * happens that we got one error while we are exiting. + * We will use atomic operations. Only valid values are 0 and 1. + */ + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_send_state; + /* Multifd without compression */ /** @@ -292,13 +321,16 @@ void multifd_send_fill_packet(MultiFDSendParams *p) { MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; + uint64_t packet_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); packet->normal_pages = cpu_to_be32(pages->num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); - packet->packet_num = cpu_to_be64(p->packet_num); + + packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); + packet->packet_num = cpu_to_be64(packet_num); if (pages->block) { strncpy(packet->ramblock, pages->block->idstr, 256); @@ -314,7 +346,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p) p->packets_sent++; p->total_normal_pages += pages->num; - trace_multifd_send(p->id, p->packet_num, pages->num, p->flags, + trace_multifd_send(p->id, packet_num, pages->num, p->flags, p->next_packet_size); } @@ -398,24 +430,6 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) return 0; } -struct { - MultiFDSendParams *params; - /* array of pages to sent */ - MultiFDPages_t *pages; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* send channels ready */ - QemuSemaphore channels_ready; - /* - * Have we already run terminate threads. There is a race when it - * happens that we got one error while we are exiting. - * We will use atomic operations. Only valid values are 0 and 1. - */ - int exiting; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_send_state; - static bool multifd_send_should_exit(void) { return qatomic_read(&multifd_send_state->exiting); @@ -497,7 +511,6 @@ static bool multifd_send_pages(void) */ assert(qatomic_read(&p->pending_job) == false); qatomic_set(&p->pending_job, true); - p->packet_num = multifd_send_state->packet_num++; multifd_send_state->pages = p->pages; p->pages = pages; qemu_mutex_unlock(&p->mutex); @@ -730,7 +743,6 @@ int multifd_send_sync_main(void) trace_multifd_send_sync_main_signal(p->id); qemu_mutex_lock(&p->mutex); - p->packet_num = multifd_send_state->packet_num++; /* * We should be the only user so far, so not possible to be set by * others concurrently. diff --git a/migration/multifd.h b/migration/multifd.h index 9b40a53cb67..98876ff94a6 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -97,8 +97,6 @@ typedef struct { bool running; /* multifd flags for each packet */ uint32_t flags; - /* global number of generated multifd packets */ - uint64_t packet_num; /* * The sender thread has work to do if either of below boolean is set. * -- Gitee From 1488f7ea5ccc6a45e8d4788f3490d642f0c86717 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 2 Feb 2024 18:28:57 +0800 Subject: [PATCH 47/90] migration/multifd: Optimize sender side to be lockless commit 488c84acb465c21b716c3fd14de27ab5ce388c85 upstream. When reviewing my attempt to refactor send_prepare(), Fabiano suggested we try out with dropping the mutex in multifd code [1]. I thought about that before but I never tried to change the code. Now maybe it's time to give it a stab. This only optimizes the sender side. The trick here is multifd has a clear provider/consumer model, that the migration main thread publishes requests (either pending_job/pending_sync), while the multifd sender threads are consumers. Here we don't have a lot of complicated data sharing, and the jobs can logically be submitted lockless. Arm the code with atomic weapons. Two things worth mentioning: - For multifd_send_pages(): we can use qatomic_load_acquire() when trying to find a free channel, but that's expensive if we attach one ACQUIRE per channel. Instead, keep the qatomic_read() on reading the pending_job flag as we do already, meanwhile use one smp_mb_acquire() after the loop to guarantee the memory ordering. - For pending_sync: it doesn't have any extra data required since now p->flags are never touched, it should be safe to not use memory barrier. That's different from pending_job. Provide rich comments for all the lockless operations to state how they are paired. With that, we can remove the mutex. [1] https://lore.kernel.org/r/87o7d1jlu5.fsf@suse.de Intel-SIG: commit 488c84acb465 migration/multifd: Optimize sender side to be lockless Suggested-by: Fabiano Rosas Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240202102857.110210-24-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 51 +++++++++++++++++++++++---------------------- migration/multifd.h | 2 -- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index b317d57d619..fbdb129088d 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -501,19 +501,19 @@ static bool multifd_send_pages(void) } } - qemu_mutex_lock(&p->mutex); - assert(!p->pages->num); - assert(!p->pages->block); /* - * Double check on pending_job==false with the lock. In the future if - * we can have >1 requester thread, we can replace this with a "goto - * retry", but that is for later. + * Make sure we read p->pending_job before all the rest. Pairs with + * qatomic_store_release() in multifd_send_thread(). */ - assert(qatomic_read(&p->pending_job) == false); - qatomic_set(&p->pending_job, true); + smp_mb_acquire(); + assert(!p->pages->num); multifd_send_state->pages = p->pages; p->pages = pages; - qemu_mutex_unlock(&p->mutex); + /* + * Making sure p->pages is setup before marking pending_job=true. Pairs + * with the qatomic_load_acquire() in multifd_send_thread(). + */ + qatomic_store_release(&p->pending_job, true); qemu_sem_post(&p->sem); return true; @@ -648,7 +648,6 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) } multifd_send_channel_destroy(p->c); p->c = NULL; - qemu_mutex_destroy(&p->mutex); qemu_sem_destroy(&p->sem); qemu_sem_destroy(&p->sem_sync); g_free(p->name); @@ -742,14 +741,12 @@ int multifd_send_sync_main(void) trace_multifd_send_sync_main_signal(p->id); - qemu_mutex_lock(&p->mutex); /* * We should be the only user so far, so not possible to be set by * others concurrently. */ assert(qatomic_read(&p->pending_sync) == false); qatomic_set(&p->pending_sync, true); - qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem); } for (i = 0; i < migrate_multifd_channels(); i++) { @@ -796,9 +793,12 @@ static void *multifd_send_thread(void *opaque) if (multifd_send_should_exit()) { break; } - qemu_mutex_lock(&p->mutex); - if (qatomic_read(&p->pending_job)) { + /* + * Read pending_job flag before p->pages. Pairs with the + * qatomic_store_release() in multifd_send_pages(). + */ + if (qatomic_load_acquire(&p->pending_job)) { MultiFDPages_t *pages = p->pages; p->iovs_num = 0; @@ -806,14 +806,12 @@ static void *multifd_send_thread(void *opaque) ret = multifd_send_state->ops->send_prepare(p, &local_err); if (ret != 0) { - qemu_mutex_unlock(&p->mutex); break; } ret = qio_channel_writev_full_all(p->c, p->iov, p->iovs_num, NULL, 0, p->write_flags, &local_err); if (ret != 0) { - qemu_mutex_unlock(&p->mutex); break; } @@ -822,24 +820,31 @@ static void *multifd_send_thread(void *opaque) multifd_pages_reset(p->pages); p->next_packet_size = 0; - qatomic_set(&p->pending_job, false); - qemu_mutex_unlock(&p->mutex); + + /* + * Making sure p->pages is published before saying "we're + * free". Pairs with the smp_mb_acquire() in + * multifd_send_pages(). + */ + qatomic_store_release(&p->pending_job, false); } else { - /* If not a normal job, must be a sync request */ + /* + * If not a normal job, must be a sync request. Note that + * pending_sync is a standalone flag (unlike pending_job), so + * it doesn't require explicit memory barriers. + */ assert(qatomic_read(&p->pending_sync)); p->flags = MULTIFD_FLAG_SYNC; multifd_send_fill_packet(p); ret = qio_channel_write_all(p->c, (void *)p->packet, p->packet_len, &local_err); if (ret != 0) { - qemu_mutex_unlock(&p->mutex); break; } /* p->next_packet_size will always be zero for a SYNC packet */ stat64_add(&mig_stats.multifd_bytes, p->packet_len); p->flags = 0; qatomic_set(&p->pending_sync, false); - qemu_mutex_unlock(&p->mutex); qemu_sem_post(&p->sem_sync); } } @@ -853,10 +858,7 @@ out: error_free(local_err); } - qemu_mutex_lock(&p->mutex); p->running = false; - qemu_mutex_unlock(&p->mutex); - rcu_unregister_thread(); migration_threads_remove(thread); trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); @@ -998,7 +1000,6 @@ int multifd_send_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem, 0); qemu_sem_init(&p->sem_sync, 0); p->id = i; diff --git a/migration/multifd.h b/migration/multifd.h index 98876ff94a6..78a2317263f 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -91,8 +91,6 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; - /* this mutex protects the following parameters */ - QemuMutex mutex; /* is this channel thread running */ bool running; /* multifd flags for each packet */ -- Gitee From 2d9dc310c816f5be1ca1701b60693b02e821fa0f Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:13 -0300 Subject: [PATCH 48/90] migration/multifd: Join the TLS thread commit e1921f10d9afe651f4887284e85f6789b37e67d3 upstream. We're currently leaking the resources of the TLS thread by not joining it and also overwriting the p->thread pointer altogether. Intel-SIG: commit e1921f10d9af migration/multifd: Join the TLS thread Fixes: a1af605bd5 ("migration/multifd: fix hangup with TLS-Multifd due to blocking handshake") Cc: qemu-stable Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 8 +++++++- migration/multifd.h | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index fbdb129088d..5551711a2af 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -630,6 +630,10 @@ static void multifd_send_terminate_threads(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; + if (p->tls_thread_created) { + qemu_thread_join(&p->tls_thread); + } + if (p->running) { qemu_thread_join(&p->thread); } @@ -921,7 +925,9 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); p->c = QIO_CHANNEL(tioc); - qemu_thread_create(&p->thread, "multifd-tls-handshake-worker", + + p->tls_thread_created = true; + qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", multifd_tls_handshake_thread, p, QEMU_THREAD_JOINABLE); return true; diff --git a/migration/multifd.h b/migration/multifd.h index 78a2317263f..720c9d50db7 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -73,6 +73,8 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + QemuThread tls_thread; + bool tls_thread_created; /* communication channel */ QIOChannel *c; /* is the yank function registered */ -- Gitee From 79e5fdf98daac8295771b55ea326a69b1f5cd8b8 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:14 -0300 Subject: [PATCH 49/90] migration/multifd: Remove p->running commit a2a63c4abd52f4e3ff4046dcb67fe44ebf0bb8de upstream. We currently only need p->running to avoid calling qemu_thread_join() on a non existent thread if the thread has never been created. However, there are at least two bugs in this logic: 1) On the sending side, p->running is set too early and qemu_thread_create() can be skipped due to an error during TLS handshake, leaving the flag set and leading to a crash when multifd_send_cleanup() calls qemu_thread_join(). 2) During exit, the multifd thread clears the flag while holding the channel lock. The counterpart at multifd_send_cleanup() reads the flag outside of the lock and might free the mutex while the multifd thread still has it locked. Fix the first issue by setting the flag right before creating the thread. Rename it from p->running to p->thread_created to clarify its usage. Fix the second issue by not clearing the flag at the multifd thread exit. We don't have any use for that. Note that these bugs are straight-forward logic issues and not race conditions. There is still a gap for races to affect this code due to multifd_send_cleanup() being allowed to run concurrently with the thread creation loop. This issue is solved in the next patches. Intel-SIG: commit a2a63c4abd52 migration/multifd: Remove p->running Cc: qemu-stable Fixes: 29647140157a ("migration/tls: add support for multifd tls-handshake") Reported-by: Avihai Horon Reported-by: chenyuhui5@huawei.com Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-3-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 27 ++++++++++++--------------- migration/multifd.h | 7 ++----- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 5551711a2af..e6ac1ad6dce 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -634,7 +634,7 @@ static void multifd_send_terminate_threads(void) qemu_thread_join(&p->tls_thread); } - if (p->running) { + if (p->thread_created) { qemu_thread_join(&p->thread); } } @@ -862,7 +862,6 @@ out: error_free(local_err); } - p->running = false; rcu_unregister_thread(); migration_threads_remove(thread); trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); @@ -953,6 +952,8 @@ static bool multifd_channel_connect(MultiFDSendParams *p, migration_ioc_register_yank(ioc); p->registered_yank = true; p->c = ioc; + + p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, QEMU_THREAD_JOINABLE); return true; @@ -967,7 +968,6 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) trace_multifd_new_send_channel_async(p->id); if (!qio_task_propagate_error(task, &local_err)) { qio_channel_set_delay(ioc, false); - p->running = true; if (multifd_channel_connect(p, ioc, &local_err)) { return; } @@ -1128,15 +1128,15 @@ void multifd_recv_cleanup(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - if (p->running) { - /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. - */ - qemu_sem_post(&p->sem_sync); - } + /* + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. + */ + qemu_sem_post(&p->sem_sync); - qemu_thread_join(&p->thread); + if (p->thread_created) { + qemu_thread_join(&p->thread); + } } for (i = 0; i < migrate_multifd_channels(); i++) { multifd_recv_cleanup_channel(&multifd_recv_state->params[i]); @@ -1222,9 +1222,6 @@ static void *multifd_recv_thread(void *opaque) multifd_recv_terminate_threads(local_err); error_free(local_err); } - qemu_mutex_lock(&p->mutex); - p->running = false; - qemu_mutex_unlock(&p->mutex); rcu_unregister_thread(); trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); @@ -1330,7 +1327,7 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) p->c = ioc; object_ref(OBJECT(ioc)); - p->running = true; + p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p, QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); diff --git a/migration/multifd.h b/migration/multifd.h index 720c9d50db7..7881980ee67 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -73,6 +73,7 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; QemuThread tls_thread; bool tls_thread_created; /* communication channel */ @@ -93,8 +94,6 @@ typedef struct { /* syncs main thread and channels */ QemuSemaphore sem_sync; - /* is this channel thread running */ - bool running; /* multifd flags for each packet */ uint32_t flags; /* @@ -143,6 +142,7 @@ typedef struct { char *name; /* channel thread id */ QemuThread thread; + bool thread_created; /* communication channel */ QIOChannel *c; /* packet allocated len */ @@ -157,8 +157,6 @@ typedef struct { /* this mutex protects the following parameters */ QemuMutex mutex; - /* is this channel thread running */ - bool running; /* should this thread finish */ bool quit; /* multifd flags for each packet */ @@ -217,4 +215,3 @@ static inline void multifd_send_prepare_header(MultiFDSendParams *p) #endif - -- Gitee From 3bd28c76f7d4a6eaa43aa18abde464a62e118694 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:15 -0300 Subject: [PATCH 50/90] migration/multifd: Move multifd_send_setup error handling in to the function commit bd8b0a8f82d8fc17aa285ab963ba75675c2fbe7a upstream. Hide the error handling inside multifd_send_setup to make it cleaner for the next patch to move the function around. Intel-SIG: commit bd8b0a8f82d8 migration/multifd: Move multifd_send_setup error handling in to the function Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-4-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration.c | 6 +----- migration/multifd.c | 24 +++++++++++++++++------- migration/multifd.h | 2 +- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 0765b515ea2..27c7eda2224 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3650,11 +3650,7 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (multifd_send_setup(&local_err) != 0) { - migrate_set_error(s, local_err); - error_report_err(local_err); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); + if (!multifd_send_setup()) { migrate_fd_cleanup(s); return; } diff --git a/migration/multifd.c b/migration/multifd.c index e6ac1ad6dce..cf865edba09 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -985,14 +985,16 @@ static void multifd_new_send_channel_create(gpointer opaque) socket_send_channel_create(multifd_new_send_channel_async, opaque); } -int multifd_send_setup(Error **errp) +bool multifd_send_setup(void) { - int thread_count; + MigrationState *s = migrate_get_current(); + Error *local_err = NULL; + int thread_count, ret = 0; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); uint8_t i; if (!migrate_multifd()) { - return 0; + return true; } thread_count = migrate_multifd_channels(); @@ -1026,14 +1028,22 @@ int multifd_send_setup(Error **errp) for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; - int ret; - ret = multifd_send_state->ops->send_setup(p, errp); + ret = multifd_send_state->ops->send_setup(p, &local_err); if (ret) { - return ret; + break; } } - return 0; + + if (ret) { + migrate_set_error(s, local_err); + error_report_err(local_err); + migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + return false; + } + + return true; } struct { diff --git a/migration/multifd.h b/migration/multifd.h index 7881980ee67..8a1cad09967 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,7 +13,7 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H -int multifd_send_setup(Error **errp); +bool multifd_send_setup(void); void multifd_send_shutdown(void); int multifd_recv_setup(Error **errp); void multifd_recv_cleanup(void); -- Gitee From 7e74a8c4a32e0fcf9a96ffa57540d0a008424421 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:16 -0300 Subject: [PATCH 51/90] migration/multifd: Move multifd_send_setup into migration thread commit dd904bc13f2af0c605c3fe72f118ea4e27a6610c upstream. We currently have an unfavorable situation around multifd channels creation and the migration thread execution. We create the multifd channels with qio_channel_socket_connect_async -> qio_task_run_in_thread, but only connect them at the multifd_new_send_channel_async callback, called from qio_task_complete, which is registered as a glib event. So at multifd_send_setup() we create the channels, but they will only be actually usable after the whole multifd_send_setup() calling stack returns back to the main loop. Which means that the migration thread is already up and running without any possibility for the multifd channels to be ready on time. We currently rely on the channels-ready semaphore blocking multifd_send_sync_main() until channels start to come up and release it. However there have been bugs recently found when a channel's creation fails and multifd_send_cleanup() is allowed to run while other channels are still being created. Let's start to organize this situation by moving the multifd_send_setup() call into the migration thread. That way we unblock the main-loop to dispatch the completion callbacks and actually have a chance of getting the multifd channels ready for when the migration thread needs them. The next patches will deal with the synchronization aspects. Note that this takes multifd_send_setup() out of the BQL. Intel-SIG: commit dd904bc13f2a migration/multifd: Move multifd_send_setup into migration thread Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-5-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/migration.c [bql] [jz: upstream renamed qemu_mutex_lock_iothread() to bql_lock(), while we havenot yet. Resolve context conflict due to this] Signed-off-by: Jason Zeng --- migration/migration.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index 27c7eda2224..e7a513cb182 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -3326,6 +3326,10 @@ static void *migration_thread(void *opaque) object_ref(OBJECT(s)); update_iteration_initial_status(s); + if (!multifd_send_setup()) { + goto out; + } + qemu_mutex_lock_iothread(); qemu_savevm_state_header(s->to_dst_file); qemu_mutex_unlock_iothread(); @@ -3397,6 +3401,7 @@ static void *migration_thread(void *opaque) urgent = migration_rate_limit(); } +out: trace_migration_thread_after_loop(); migration_iteration_finish(s); object_unref(OBJECT(s)); @@ -3650,11 +3655,6 @@ void migrate_fd_connect(MigrationState *s, Error *error_in) return; } - if (!multifd_send_setup()) { - migrate_fd_cleanup(s); - return; - } - if (migrate_background_snapshot()) { qemu_thread_create(&s->thread, "bg_snapshot", bg_migration_thread, s, QEMU_THREAD_JOINABLE); -- Gitee From 388637c4a357128ea21e8f7593ebdbb4b5358007 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:17 -0300 Subject: [PATCH 52/90] migration/multifd: Unify multifd and TLS connection paths commit 2576ae488ef9aa692486157df7d8b410919cd219 upstream. During multifd channel creation (multifd_send_new_channel_async) when TLS is enabled, the multifd_channel_connect function is called twice, once to create the TLS handshake thread and another time after the asynchrounous TLS handshake has finished. This creates a slightly confusing call stack where multifd_channel_connect() is called more times than the number of channels. It also splits error handling between the two callers of multifd_channel_connect() causing some code duplication. Lastly, it gets in the way of having a single point to determine whether all channel creation tasks have been initiated. Refactor the code to move the reentrancy one level up at the multifd_new_send_channel_async() level, de-duplicating the error handling and allowing for the next patch to introduce a synchronization point common to all the multifd channel creation, regardless of TLS. Note that the previous code would never fail once p->c had been set. This patch changes this assumption, which affects refcounting, so add comments around object_unref to explain the situation. Intel-SIG: commit 2576ae488ef9 migration/multifd: Unify multifd and TLS connection paths Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-6-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 83 ++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index cf865edba09..3db18dc79e3 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -869,30 +869,7 @@ out: return NULL; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp); - -static void multifd_tls_outgoing_handshake(QIOTask *task, - gpointer opaque) -{ - MultiFDSendParams *p = opaque; - QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); - Error *err = NULL; - - if (!qio_task_propagate_error(task, &err)) { - trace_multifd_tls_outgoing_handshake_complete(ioc); - if (multifd_channel_connect(p, ioc, &err)) { - return; - } - } - - trace_multifd_tls_outgoing_handshake_error(ioc, error_get_pretty(err)); - - multifd_send_set_error(err); - multifd_send_kick_main(p); - error_free(err); -} +static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); static void *multifd_tls_handshake_thread(void *opaque) { @@ -900,7 +877,7 @@ static void *multifd_tls_handshake_thread(void *opaque) QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); qio_channel_tls_handshake(tioc, - multifd_tls_outgoing_handshake, + multifd_new_send_channel_async, p, NULL, NULL); @@ -920,6 +897,10 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return false; } + /* + * Ownership of the socket channel now transfers to the newly + * created TLS channel, which has already taken a reference. + */ object_unref(OBJECT(ioc)); trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); @@ -936,18 +917,7 @@ static bool multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc, Error **errp) { - trace_multifd_set_outgoing_channel( - ioc, object_get_typename(OBJECT(ioc)), - migrate_get_current()->hostname); - - if (migrate_channel_requires_tls_upgrade(ioc)) { - /* - * tls_channel_connect will call back to this - * function after the TLS handshake, - * so we mustn't call multifd_send_thread until then - */ - return multifd_tls_channel_connect(p, ioc, errp); - } + qio_channel_set_delay(ioc, false); migration_ioc_register_yank(ioc); p->registered_yank = true; @@ -959,24 +929,51 @@ static bool multifd_channel_connect(MultiFDSendParams *p, return true; } +/* + * When TLS is enabled this function is called once to establish the + * TLS connection and a second time after the TLS handshake to create + * the multifd channel. Without TLS it goes straight into the channel + * creation. + */ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) { MultiFDSendParams *p = opaque; QIOChannel *ioc = QIO_CHANNEL(qio_task_get_source(task)); Error *local_err = NULL; + bool ret; trace_multifd_new_send_channel_async(p->id); - if (!qio_task_propagate_error(task, &local_err)) { - qio_channel_set_delay(ioc, false); - if (multifd_channel_connect(p, ioc, &local_err)) { - return; - } + + if (qio_task_propagate_error(task, &local_err)) { + ret = false; + goto out; + } + + trace_multifd_set_outgoing_channel(ioc, object_get_typename(OBJECT(ioc)), + migrate_get_current()->hostname); + + if (migrate_channel_requires_tls_upgrade(ioc)) { + ret = multifd_tls_channel_connect(p, ioc, &local_err); + } else { + ret = multifd_channel_connect(p, ioc, &local_err); } + if (ret) { + return; + } + +out: trace_multifd_new_send_channel_async_error(p->id, local_err); multifd_send_set_error(local_err); multifd_send_kick_main(p); - object_unref(OBJECT(ioc)); + if (!p->c) { + /* + * If no channel has been created, drop the initial + * reference. Otherwise cleanup happens at + * multifd_send_channel_destroy() + */ + object_unref(OBJECT(ioc)); + } error_free(local_err); } -- Gitee From bc72816339b2ea4cc0272913a70037a857b824a3 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 6 Feb 2024 18:51:18 -0300 Subject: [PATCH 53/90] migration/multifd: Add a synchronization point for channel creation commit 93fa9dc2e0522c54b813dee0898a5feb98b624c9 upstream. It is possible that one of the multifd channels fails to be created at multifd_new_send_channel_async() while the rest of the channel creation tasks are still in flight. This could lead to multifd_save_cleanup() executing the qemu_thread_join() loop too early and not waiting for the threads which haven't been created yet, leading to the freeing of resources that the newly created threads will try to access and crash. Add a synchronization point after which there will be no attempts at thread creation and therefore calling multifd_save_cleanup() past that point will ensure it properly waits for the threads. A note about performance: Prior to this patch, if a channel took too long to be established, other channels could finish connecting first and already start taking load. Now we're bounded by the slowest-connecting channel. Intel-SIG: commit 93fa9dc2e052 migration/multifd: Add a synchronization point for channel creation Reported-by: Avihai Horon Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240206215118.6171-7-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3db18dc79e3..adfe8c9a0a6 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -62,6 +62,11 @@ struct { * Make it easy for now. */ uintptr_t packet_num; + /* + * Synchronization point past which no more channels will be + * created. + */ + QemuSemaphore channels_created; /* send channels ready */ QemuSemaphore channels_ready; /* @@ -622,10 +627,6 @@ static void multifd_send_terminate_threads(void) /* * Finally recycle all the threads. - * - * TODO: p->running is still buggy, e.g. we can reach here without the - * corresponding multifd_new_send_channel_async() get invoked yet, - * then a new thread can even be created after this function returns. */ for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; @@ -670,6 +671,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { + qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); g_free(multifd_send_state->params); multifd_send_state->params = NULL; @@ -954,18 +956,26 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) if (migrate_channel_requires_tls_upgrade(ioc)) { ret = multifd_tls_channel_connect(p, ioc, &local_err); + if (ret) { + return; + } } else { ret = multifd_channel_connect(p, ioc, &local_err); } +out: + /* + * Here we're not interested whether creation succeeded, only that + * it happened at all. + */ + qemu_sem_post(&multifd_send_state->channels_created); + if (ret) { return; } -out: trace_multifd_new_send_channel_async_error(p->id, local_err); multifd_send_set_error(local_err); - multifd_send_kick_main(p); if (!p->c) { /* * If no channel has been created, drop the initial @@ -998,6 +1008,7 @@ bool multifd_send_setup(void) multifd_send_state = g_malloc0(sizeof(*multifd_send_state)); multifd_send_state->params = g_new0(MultiFDSendParams, thread_count); multifd_send_state->pages = multifd_pages_init(page_count); + qemu_sem_init(&multifd_send_state->channels_created, 0); qemu_sem_init(&multifd_send_state->channels_ready, 0); qatomic_set(&multifd_send_state->exiting, 0); multifd_send_state->ops = multifd_ops[migrate_multifd_compression()]; @@ -1023,6 +1034,15 @@ bool multifd_send_setup(void) multifd_new_send_channel_create(p); } + /* + * Wait until channel creation has started for all channels. The + * creation can still fail, but no more channels will be created + * past this point. + */ + for (i = 0; i < thread_count; i++) { + qemu_sem_wait(&multifd_send_state->channels_created); + } + for (i = 0; i < thread_count; i++) { MultiFDSendParams *p = &multifd_send_state->params[i]; -- Gitee From f2bff5d8533972d8695069c9dc0b4675245c3e48 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 20 Feb 2024 19:41:08 -0300 Subject: [PATCH 54/90] migration/multifd: Remove p->quit from recv side commit 11dd7be57524d400652cecf8740a016b3d66b53d upstream. Like we did on the sending side, replace the p->quit per-channel flag with a global atomic 'exiting' flag. Intel-SIG: commit 11dd7be57524 migration/multifd: Remove p->quit from recv side Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240220224138.24759-5-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index adfe8c9a0a6..fba00b9e8fb 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -79,6 +79,19 @@ struct { MultiFDMethods *ops; } *multifd_send_state; +struct { + MultiFDRecvParams *params; + /* number of created threads */ + int count; + /* syncs main thread and channels */ + QemuSemaphore sem_sync; + /* global number of generated multifd packets */ + uint64_t packet_num; + int exiting; + /* multifd ops */ + MultiFDMethods *ops; +} *multifd_recv_state; + /* Multifd without compression */ /** @@ -440,6 +453,11 @@ static bool multifd_send_should_exit(void) return qatomic_read(&multifd_send_state->exiting); } +static bool multifd_recv_should_exit(void) +{ + return qatomic_read(&multifd_recv_state->exiting); +} + /* * The migration thread can wait on either of the two semaphores. This * function can be used to kick the main thread out of waiting on either of @@ -1063,24 +1081,16 @@ bool multifd_send_setup(void) return true; } -struct { - MultiFDRecvParams *params; - /* number of created threads */ - int count; - /* syncs main thread and channels */ - QemuSemaphore sem_sync; - /* global number of generated multifd packets */ - uint64_t packet_num; - /* multifd ops */ - MultiFDMethods *ops; -} *multifd_recv_state; - static void multifd_recv_terminate_threads(Error *err) { int i; trace_multifd_recv_terminate_threads(err != NULL); + if (qatomic_xchg(&multifd_recv_state->exiting, 1)) { + return; + } + if (err) { MigrationState *s = migrate_get_current(); migrate_set_error(s, err); @@ -1094,8 +1104,6 @@ static void multifd_recv_terminate_threads(Error *err) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - qemu_mutex_lock(&p->mutex); - p->quit = true; /* * We could arrive here for two reasons: * - normal quit, i.e. everything went fine, just finished @@ -1105,7 +1113,6 @@ static void multifd_recv_terminate_threads(Error *err) if (p->c) { qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); } - qemu_mutex_unlock(&p->mutex); } } @@ -1210,7 +1217,7 @@ static void *multifd_recv_thread(void *opaque) while (true) { uint32_t flags; - if (p->quit) { + if (multifd_recv_should_exit()) { break; } @@ -1274,6 +1281,7 @@ int multifd_recv_setup(Error **errp) multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state)); multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count); qatomic_set(&multifd_recv_state->count, 0); + qatomic_set(&multifd_recv_state->exiting, 0); qemu_sem_init(&multifd_recv_state->sem_sync, 0); multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()]; @@ -1282,7 +1290,6 @@ int multifd_recv_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); - p->quit = false; p->id = i; p->packet_len = sizeof(MultiFDPacket_t) + sizeof(uint64_t) * page_count; -- Gitee From 6bce7e77d775d1679649b9590ed3da1b4bec3c19 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 20 Feb 2024 19:41:09 -0300 Subject: [PATCH 55/90] migration/multifd: Release recv sem_sync earlier commit d13f0026c7a625a5a34a5dea4095a4d9cfa04652 upstream. Now that multifd_recv_terminate_threads() is called only once, release the recv side sem_sync earlier like we do for the send side. Intel-SIG: commit d13f0026c7a6 migration/multifd: Release recv sem_sync earlier Signed-off-by: Fabiano Rosas Reviewed-by: Peter Xu Link: https://lore.kernel.org/r/20240220224138.24759-6-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index fba00b9e8fb..43f0820996c 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1104,6 +1104,12 @@ static void multifd_recv_terminate_threads(Error *err) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; + /* + * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, + * however try to wakeup it without harm in cleanup phase. + */ + qemu_sem_post(&p->sem_sync); + /* * We could arrive here for two reasons: * - normal quit, i.e. everything went fine, just finished @@ -1162,12 +1168,6 @@ void multifd_recv_cleanup(void) for (i = 0; i < migrate_multifd_channels(); i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; - /* - * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, - * however try to wakeup it without harm in cleanup phase. - */ - qemu_sem_post(&p->sem_sync); - if (p->thread_created) { qemu_thread_join(&p->thread); } -- Gitee From 1b8e244b734c3714a4c2fc57ecaf9c6f91baae19 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:52:57 +0800 Subject: [PATCH 56/90] migration/multifd: Cleanup TLS iochannel referencing commit 9221e3c6a237da90ac296adfeb6e99ea9babfc20 upstream. Commit a1af605bd5 ("migration/multifd: fix hangup with TLS-Multifd due to blocking handshake") introduced a thread for TLS channels, which will resolve the issue on blocking the main thread. However in the same commit p->c is slightly abused just to be able to pass over the pointer "p" into the thread. That's the major reason we'll need to conditionally free the io channel in the fault paths. To clean it up, using a separate structure to pass over both "p" and "tioc" in the tls handshake thread. Then we can make it a rule that p->c will never be set until the channel is completely setup. With that, we can drop the tricky conditional unref of the io channel in the error path. Intel-SIG: commit 9221e3c6a237 migration/multifd: Cleanup TLS iochannel referencing Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-2-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 43f0820996c..84a6b9e58f2 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -891,16 +891,22 @@ out: static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque); +typedef struct { + MultiFDSendParams *p; + QIOChannelTLS *tioc; +} MultiFDTLSThreadArgs; + static void *multifd_tls_handshake_thread(void *opaque) { - MultiFDSendParams *p = opaque; - QIOChannelTLS *tioc = QIO_CHANNEL_TLS(p->c); + MultiFDTLSThreadArgs *args = opaque; - qio_channel_tls_handshake(tioc, + qio_channel_tls_handshake(args->tioc, multifd_new_send_channel_async, - p, + args->p, NULL, NULL); + g_free(args); + return NULL; } @@ -910,6 +916,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, { MigrationState *s = migrate_get_current(); const char *hostname = s->hostname; + MultiFDTLSThreadArgs *args; QIOChannelTLS *tioc; tioc = migration_tls_client_create(ioc, hostname, errp); @@ -924,11 +931,14 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, object_unref(OBJECT(ioc)); trace_multifd_tls_outgoing_handshake_start(ioc, tioc, hostname); qio_channel_set_name(QIO_CHANNEL(tioc), "multifd-tls-outgoing"); - p->c = QIO_CHANNEL(tioc); + + args = g_new0(MultiFDTLSThreadArgs, 1); + args->tioc = tioc; + args->p = p; p->tls_thread_created = true; qemu_thread_create(&p->tls_thread, "multifd-tls-handshake-worker", - multifd_tls_handshake_thread, p, + multifd_tls_handshake_thread, args, QEMU_THREAD_JOINABLE); return true; } @@ -941,6 +951,7 @@ static bool multifd_channel_connect(MultiFDSendParams *p, migration_ioc_register_yank(ioc); p->registered_yank = true; + /* Setup p->c only if the channel is completely setup */ p->c = ioc; p->thread_created = true; @@ -994,14 +1005,12 @@ out: trace_multifd_new_send_channel_async_error(p->id, local_err); multifd_send_set_error(local_err); - if (!p->c) { - /* - * If no channel has been created, drop the initial - * reference. Otherwise cleanup happens at - * multifd_send_channel_destroy() - */ - object_unref(OBJECT(ioc)); - } + /* + * For error cases (TLS or non-TLS), IO channel is always freed here + * rather than when cleanup multifd: since p->c is not set, multifd + * cleanup code doesn't even know its existence. + */ + object_unref(OBJECT(ioc)); error_free(local_err); } -- Gitee From 70a84a12f6c7c2bafbb45fa5c8bc2c738d6a75bc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:52:58 +0800 Subject: [PATCH 57/90] migration/multifd: Drop registered_yank commit 0518b5d8d30d3a4d0ea4f45d61527bcdc43044d2 upstream. With a clear definition of p->c protocol, where we only set it up if the channel is fully established (TLS or non-TLS), registered_yank boolean will have equal meaning of "p->c != NULL". Drop registered_yank by checking p->c instead. Intel-SIG: commit 0518b5d8d30d migration/multifd: Drop registered_yank Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-3-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 7 +++---- migration/multifd.h | 2 -- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 84a6b9e58f2..1d039a48408 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -666,11 +666,11 @@ static int multifd_send_channel_destroy(QIOChannel *send) static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) { - if (p->registered_yank) { + if (p->c) { migration_ioc_unregister_yank(p->c); + multifd_send_channel_destroy(p->c); + p->c = NULL; } - multifd_send_channel_destroy(p->c); - p->c = NULL; qemu_sem_destroy(&p->sem); qemu_sem_destroy(&p->sem_sync); g_free(p->name); @@ -950,7 +950,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, qio_channel_set_delay(ioc, false); migration_ioc_register_yank(ioc); - p->registered_yank = true; /* Setup p->c only if the channel is completely setup */ p->c = ioc; diff --git a/migration/multifd.h b/migration/multifd.h index 8a1cad09967..b3fe27ae93a 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -78,8 +78,6 @@ typedef struct { bool tls_thread_created; /* communication channel */ QIOChannel *c; - /* is the yank function registered */ - bool registered_yank; /* packet allocated len */ uint32_t packet_len; /* guest page size */ -- Gitee From 577f2123fa701aa0a9843382125d04afa18b5506 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:52:59 +0800 Subject: [PATCH 58/90] migration/multifd: Make multifd_channel_connect() return void commit 770de49c00fa9eb262473f282c92979b47b7fd22 upstream. It never fails, drop the retval and also the Error**. Intel-SIG: commit 770de49c00fa migration/multifd: Make multifd_channel_connect() return void Suggested-by: Avihai Horon Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-4-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 1d039a48408..af89e059159 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -943,9 +943,7 @@ static bool multifd_tls_channel_connect(MultiFDSendParams *p, return true; } -static bool multifd_channel_connect(MultiFDSendParams *p, - QIOChannel *ioc, - Error **errp) +static void multifd_channel_connect(MultiFDSendParams *p, QIOChannel *ioc) { qio_channel_set_delay(ioc, false); @@ -956,7 +954,6 @@ static bool multifd_channel_connect(MultiFDSendParams *p, p->thread_created = true; qemu_thread_create(&p->thread, p->name, multifd_send_thread, p, QEMU_THREAD_JOINABLE); - return true; } /* @@ -988,7 +985,8 @@ static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque) return; } } else { - ret = multifd_channel_connect(p, ioc, &local_err); + multifd_channel_connect(p, ioc); + ret = true; } out: -- Gitee From 732a6a03c3c0131193d09b5ef9b47c8c33541489 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:53:00 +0800 Subject: [PATCH 59/90] migration/multifd: Cleanup outgoing_args in state destroy commit 72b90b96872acc5d00f9c16dfc196543349361da upstream. outgoing_args is a global cache of socket address to be reused in multifd. Freeing the cache in per-channel destructor is more or less a hack. Move it to multifd_send_cleanup_state() so it only get checked once. Use a small helper to do so because it's internal of socket.c. Intel-SIG: commit 72b90b96872a migration/multifd: Cleanup outgoing_args in state destroy Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-5-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 1 + migration/socket.c | 12 ++++++++---- migration/socket.h | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index af89e059159..fa33fd98b4a 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -689,6 +689,7 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) static void multifd_send_cleanup_state(void) { + socket_cleanup_outgoing_migration(); qemu_sem_destroy(&multifd_send_state->channels_created); qemu_sem_destroy(&multifd_send_state->channels_ready); g_free(multifd_send_state->params); diff --git a/migration/socket.c b/migration/socket.c index 98e3ea15147..3184c7c3c11 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -64,10 +64,6 @@ int socket_send_channel_destroy(QIOChannel *send) { /* Remove channel */ object_unref(OBJECT(send)); - if (outgoing_args.saddr) { - qapi_free_SocketAddress(outgoing_args.saddr); - outgoing_args.saddr = NULL; - } return 0; } @@ -137,6 +133,14 @@ void socket_start_outgoing_migration(MigrationState *s, NULL); } +void socket_cleanup_outgoing_migration(void) +{ + if (outgoing_args.saddr) { + qapi_free_SocketAddress(outgoing_args.saddr); + outgoing_args.saddr = NULL; + } +} + static void socket_accept_incoming_migration(QIONetListener *listener, QIOChannelSocket *cioc, gpointer opaque) diff --git a/migration/socket.h b/migration/socket.h index 5e4c33b8ea5..5f52eddd4cf 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -29,4 +29,6 @@ void socket_start_incoming_migration(SocketAddress *saddr, Error **errp); void socket_start_outgoing_migration(MigrationState *s, SocketAddress *saddr, Error **errp); +void socket_cleanup_outgoing_migration(void); + #endif -- Gitee From c205cc305d3302046412e0cc1f086b3647039add Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 22 Feb 2024 17:53:01 +0800 Subject: [PATCH 60/90] migration/multifd: Drop unnecessary helper to destroy IOC commit c9a7e83c9d64fd5ebc759186789e1b753c919d32 upstream. Both socket_send_channel_destroy() and multifd_send_channel_destroy() are unnecessary wrappers to destroy an IOC, as the only thing to do is to release the final IOC reference. We have plenty of code that destroys an IOC using direct unref() already; keep that style. Intel-SIG: commit c9a7e83c9d64 migration/multifd: Drop unnecessary helper to destroy IOC Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240222095301.171137-6-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 7 +------ migration/socket.c | 7 ------- migration/socket.h | 1 - 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index fa33fd98b4a..6c07f19af17 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -659,16 +659,11 @@ static void multifd_send_terminate_threads(void) } } -static int multifd_send_channel_destroy(QIOChannel *send) -{ - return socket_send_channel_destroy(send); -} - static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) { if (p->c) { migration_ioc_unregister_yank(p->c); - multifd_send_channel_destroy(p->c); + object_unref(OBJECT(p->c)); p->c = NULL; } qemu_sem_destroy(&p->sem); diff --git a/migration/socket.c b/migration/socket.c index 3184c7c3c11..9ab89b1e089 100644 --- a/migration/socket.c +++ b/migration/socket.c @@ -60,13 +60,6 @@ QIOChannel *socket_send_channel_create_sync(Error **errp) return QIO_CHANNEL(sioc); } -int socket_send_channel_destroy(QIOChannel *send) -{ - /* Remove channel */ - object_unref(OBJECT(send)); - return 0; -} - struct SocketConnectData { MigrationState *s; char *hostname; diff --git a/migration/socket.h b/migration/socket.h index 5f52eddd4cf..46c233ecd29 100644 --- a/migration/socket.h +++ b/migration/socket.h @@ -23,7 +23,6 @@ void socket_send_channel_create(QIOTaskFunc f, void *data); QIOChannel *socket_send_channel_create_sync(Error **errp); -int socket_send_channel_destroy(QIOChannel *send); void socket_start_incoming_migration(SocketAddress *saddr, Error **errp); -- Gitee From c2ef9e914afae7f80f0696d62934c04124d1c759 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:29:55 -0300 Subject: [PATCH 61/90] migration/multifd: Cleanup multifd_recv_sync_main commit 4aac6b1e9bd48677c4f24518fe86ffd34c677d5a upstream. Some minor cleanups and documentation for multifd_recv_sync_main. Use thread_count as done in other parts of the code. Remove p->id from the multifd_recv_state sync, since that is global and not tied to a channel. Add documentation for the sync steps. Intel-SIG: commit 4aac6b1e9bd4 migration/multifd: Cleanup multifd_recv_sync_main Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-2-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd.c | 17 +++++++++++++---- migration/trace-events | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 6c07f19af17..c7389bf8339 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1182,18 +1182,27 @@ void multifd_recv_cleanup(void) void multifd_recv_sync_main(void) { + int thread_count = migrate_multifd_channels(); int i; if (!migrate_multifd()) { return; } - for (i = 0; i < migrate_multifd_channels(); i++) { - MultiFDRecvParams *p = &multifd_recv_state->params[i]; - trace_multifd_recv_sync_main_wait(p->id); + /* + * Initiate the synchronization by waiting for all channels. + * For socket-based migration this means each channel has received + * the SYNC packet on the stream. + */ + for (i = 0; i < thread_count; i++) { + trace_multifd_recv_sync_main_wait(i); qemu_sem_wait(&multifd_recv_state->sem_sync); } - for (i = 0; i < migrate_multifd_channels(); i++) { + + /* + * Sync done. Release the channels for the next iteration. + */ + for (i = 0; i < thread_count; i++) { MultiFDRecvParams *p = &multifd_recv_state->params[i]; WITH_QEMU_LOCK_GUARD(&p->mutex) { diff --git a/migration/trace-events b/migration/trace-events index 298ad2b0dd6..bf1a0696325 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -132,7 +132,7 @@ multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uin multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" -multifd_recv_sync_main_wait(uint8_t id) "channel %u" +multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -- Gitee From 17e41c2dcabf5d1c117cc51ea0d383627db07981 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:06 -0300 Subject: [PATCH 62/90] migration/multifd: Rename MultiFDSend|RecvParams::data to compress_data commit 402dd7ac1c3be44f306c903cdfd2583ffec5e2fd upstream. Use a more specific name for the compression data so we can use the generic for the multifd core code. Intel-SIG: commit 402dd7ac1c3b migration/multifd: Rename MultiFDSend|RecvParams::data to compress_data Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-13-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 20 ++++++++++---------- migration/multifd-zstd.c | 20 ++++++++++---------- migration/multifd.h | 4 ++-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 012e3bdea1d..2a8f5fc9a67 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -69,7 +69,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) err_msg = "out of memory for buf"; goto err_free_zbuff; } - p->data = z; + p->compress_data = z; return 0; err_free_zbuff: @@ -92,15 +92,15 @@ err_free_z: */ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; deflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; g_free(z->buf); z->buf = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -117,7 +117,7 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t out_size = 0; int ret; @@ -194,7 +194,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) struct zlib_data *z = g_new0(struct zlib_data, 1); z_stream *zs = &z->zs; - p->data = z; + p->compress_data = z; zs->zalloc = Z_NULL; zs->zfree = Z_NULL; zs->opaque = Z_NULL; @@ -224,13 +224,13 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zlib_recv_cleanup(MultiFDRecvParams *p) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; inflateEnd(&z->zs); g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -246,7 +246,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) */ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) { - struct zlib_data *z = p->data; + struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; uint32_t in_size = p->next_packet_size; /* we measure the change of total_out */ diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index dc8fe43e948..593cf290ad8 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -52,7 +52,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int res; - p->data = z; + p->compress_data = z; z->zcs = ZSTD_createCStream(); if (!z->zcs) { g_free(z); @@ -90,14 +90,14 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) */ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeCStream(z->zcs); z->zcs = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -114,7 +114,7 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) { MultiFDPages_t *pages = p->pages; - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; int ret; uint32_t i; @@ -183,7 +183,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int ret; - p->data = z; + p->compress_data = z; z->zds = ZSTD_createDStream(); if (!z->zds) { g_free(z); @@ -221,14 +221,14 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void zstd_recv_cleanup(MultiFDRecvParams *p) { - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; ZSTD_freeDStream(z->zds); z->zds = NULL; g_free(z->zbuff); z->zbuff = NULL; - g_free(p->data); - p->data = NULL; + g_free(p->compress_data); + p->compress_data = NULL; } /** @@ -248,7 +248,7 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) uint32_t out_size = 0; uint32_t expected_size = p->normal_num * p->page_size; uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; - struct zstd_data *z = p->data; + struct zstd_data *z = p->compress_data; int ret; int i; diff --git a/migration/multifd.h b/migration/multifd.h index b3fe27ae93a..adccd3532fc 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -127,7 +127,7 @@ typedef struct { /* number of iovs used */ uint32_t iovs_num; /* used for compression methods */ - void *data; + void *compress_data; } MultiFDSendParams; typedef struct { @@ -183,7 +183,7 @@ typedef struct { /* num of non zero pages */ uint32_t normal_num; /* used for de-compression methods */ - void *data; + void *compress_data; } MultiFDRecvParams; typedef struct { -- Gitee From e202cc24b70c2e9b7ee5c60ee27afea940fee667 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:07 -0300 Subject: [PATCH 63/90] migration/multifd: Decouple recv method from pages commit 9db191251381c75e57201f7b07330ca982a55d1e upstream. Next patches will abstract the type of data being received by the channels, so do some cleanup now to remove references to pages and dependency on 'normal_num'. Intel-SIG: commit 9db191251381 migration/multifd: Decouple recv method from pages Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-14-farosas@suse.de Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 6 +++--- migration/multifd-zstd.c | 6 +++--- migration/multifd.c | 13 ++++++++----- migration/multifd.h | 4 ++-- 4 files changed, 16 insertions(+), 13 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 2a8f5fc9a67..6120faad65a 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -234,7 +234,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) } /** - * zlib_recv_pages: read the data from the channel into actual pages + * zlib_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. @@ -244,7 +244,7 @@ static void zlib_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zlib_recv(MultiFDRecvParams *p, Error **errp) { struct zlib_data *z = p->compress_data; z_stream *zs = &z->zs; @@ -319,7 +319,7 @@ static MultiFDMethods multifd_zlib_ops = { .send_prepare = zlib_send_prepare, .recv_setup = zlib_recv_setup, .recv_cleanup = zlib_recv_cleanup, - .recv_pages = zlib_recv_pages + .recv = zlib_recv }; static void multifd_zlib_register(void) diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 593cf290ad8..cac236833d0 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -232,7 +232,7 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) } /** - * zstd_recv_pages: read the data from the channel into actual pages + * zstd_recv: read the data from the channel into actual pages * * Read the compressed buffer, and uncompress it into the actual * pages. @@ -242,7 +242,7 @@ static void zstd_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp) +static int zstd_recv(MultiFDRecvParams *p, Error **errp) { uint32_t in_size = p->next_packet_size; uint32_t out_size = 0; @@ -310,7 +310,7 @@ static MultiFDMethods multifd_zstd_ops = { .send_prepare = zstd_send_prepare, .recv_setup = zstd_recv_setup, .recv_cleanup = zstd_recv_cleanup, - .recv_pages = zstd_recv_pages + .recv = zstd_recv }; static void multifd_zstd_register(void) diff --git a/migration/multifd.c b/migration/multifd.c index c7389bf8339..3a8520097b2 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -197,7 +197,7 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) } /** - * nocomp_recv_pages: read the data from the channel into actual pages + * nocomp_recv: read the data from the channel * * For no compression we just need to read things into the correct place. * @@ -206,7 +206,7 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) * @p: Params for the channel that we are using * @errp: pointer to an error */ -static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp) +static int nocomp_recv(MultiFDRecvParams *p, Error **errp) { uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; @@ -228,7 +228,7 @@ static MultiFDMethods multifd_nocomp_ops = { .send_prepare = nocomp_send_prepare, .recv_setup = nocomp_recv_setup, .recv_cleanup = nocomp_recv_cleanup, - .recv_pages = nocomp_recv_pages + .recv = nocomp_recv }; static MultiFDMethods *multifd_ops[MULTIFD_COMPRESSION__MAX] = { @@ -1227,6 +1227,8 @@ static void *multifd_recv_thread(void *opaque) while (true) { uint32_t flags; + bool has_data = false; + p->normal_num = 0; if (multifd_recv_should_exit()) { break; @@ -1248,10 +1250,11 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; + has_data = !!p->normal_num; qemu_mutex_unlock(&p->mutex); - if (p->normal_num) { - ret = multifd_recv_state->ops->recv_pages(p, &local_err); + if (has_data) { + ret = multifd_recv_state->ops->recv(p, &local_err); if (ret != 0) { break; } diff --git a/migration/multifd.h b/migration/multifd.h index adccd3532fc..6a54377cc14 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -197,8 +197,8 @@ typedef struct { int (*recv_setup)(MultiFDRecvParams *p, Error **errp); /* Cleanup for receiving side */ void (*recv_cleanup)(MultiFDRecvParams *p); - /* Read all pages */ - int (*recv_pages)(MultiFDRecvParams *p, Error **errp); + /* Read all data */ + int (*recv)(MultiFDRecvParams *p, Error **errp); } MultiFDMethods; void multifd_register_ops(int method, MultiFDMethods *ops); -- Gitee From c1a7d223d0484752b23efe6d4e456680d2164031 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 29 Feb 2024 12:30:08 -0300 Subject: [PATCH 64/90] migration/multifd: Allow multifd without packets commit 06833d83f8978139395da0f1d6a9fad81b9dd024 upstream. For the upcoming support to the new 'mapped-ram' migration stream format, we cannot use multifd packets because each write into the ramblock section in the migration file is expected to contain only the guest pages. They are written at their respective offsets relative to the ramblock section header. There is no space for the packet information and the expected gains from the new approach come partly from being able to write the pages sequentially without extraneous data in between. The new format also simply doesn't need the packets and all necessary information can be taken from the standard migration headers with some (future) changes to multifd code. Use the presence of the mapped-ram capability to decide whether to send packets. This only moves code under multifd_use_packets(), it has no effect for now as mapped-ram cannot yet be enabled with multifd. Intel-SIG: commit 06833d83f897 migration/multifd: Allow multifd without packets Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240229153017.2221-15-farosas@suse.de Signed-off-by: Peter Xu [jz: make multifd_use_packet to always return true, since mapped-ram is not backported] Signed-off-by: Jason Zeng --- migration/multifd.c | 175 +++++++++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 61 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index 3a8520097b2..9e617b608ed 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -92,6 +92,11 @@ struct { MultiFDMethods *ops; } *multifd_recv_state; +static bool multifd_use_packets(void) +{ + return true; +} + /* Multifd without compression */ /** @@ -122,6 +127,19 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) return; } +static void multifd_send_prepare_iovs(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + + for (int i = 0; i < pages->num; i++) { + p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; + p->iov[p->iovs_num].iov_len = p->page_size; + p->iovs_num++; + } + + p->next_packet_size = pages->num * p->page_size; +} + /** * nocomp_send_prepare: prepare date to be able to send * @@ -136,9 +154,13 @@ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) { bool use_zero_copy_send = migrate_zero_copy_send(); - MultiFDPages_t *pages = p->pages; int ret; + if (!multifd_use_packets()) { + multifd_send_prepare_iovs(p); + return 0; + } + if (!use_zero_copy_send) { /* * Only !zerocopy needs the header in IOV; zerocopy will @@ -147,13 +169,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) multifd_send_prepare_header(p); } - for (int i = 0; i < pages->num; i++) { - p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; - p->iov[p->iovs_num].iov_len = p->page_size; - p->iovs_num++; - } - - p->next_packet_size = pages->num * p->page_size; + multifd_send_prepare_iovs(p); p->flags |= MULTIFD_FLAG_NOCOMP; multifd_send_fill_packet(p); @@ -208,7 +224,13 @@ static void nocomp_recv_cleanup(MultiFDRecvParams *p) */ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) { - uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t flags; + + if (!multifd_use_packets()) { + return 0; + } + + flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; if (flags != MULTIFD_FLAG_NOCOMP) { error_setg(errp, "multifd %u: flags received %x flags expected %x", @@ -795,15 +817,18 @@ static void *multifd_send_thread(void *opaque) MigrationThread *thread = NULL; Error *local_err = NULL; int ret = 0; + bool use_packets = multifd_use_packets(); thread = migration_threads_add(p->name, qemu_get_thread_id()); trace_multifd_send_thread_start(p->id); rcu_register_thread(); - if (multifd_send_initial_packet(p, &local_err) < 0) { - ret = -1; - goto out; + if (use_packets) { + if (multifd_send_initial_packet(p, &local_err) < 0) { + ret = -1; + goto out; + } } while (true) { @@ -854,16 +879,20 @@ static void *multifd_send_thread(void *opaque) * it doesn't require explicit memory barriers. */ assert(qatomic_read(&p->pending_sync)); - p->flags = MULTIFD_FLAG_SYNC; - multifd_send_fill_packet(p); - ret = qio_channel_write_all(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret != 0) { - break; + + if (use_packets) { + p->flags = MULTIFD_FLAG_SYNC; + multifd_send_fill_packet(p); + ret = qio_channel_write_all(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret != 0) { + break; + } + /* p->next_packet_size will always be zero for a SYNC packet */ + stat64_add(&mig_stats.multifd_bytes, p->packet_len); + p->flags = 0; } - /* p->next_packet_size will always be zero for a SYNC packet */ - stat64_add(&mig_stats.multifd_bytes, p->packet_len); - p->flags = 0; + qatomic_set(&p->pending_sync, false); qemu_sem_post(&p->sem_sync); } @@ -1018,6 +1047,7 @@ bool multifd_send_setup(void) Error *local_err = NULL; int thread_count, ret = 0; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; if (!migrate_multifd()) { @@ -1040,14 +1070,20 @@ bool multifd_send_setup(void) qemu_sem_init(&p->sem_sync, 0); p->id = i; p->pages = multifd_pages_init(page_count); - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); - p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); - p->packet->version = cpu_to_be32(MULTIFD_VERSION); + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); + p->packet->version = cpu_to_be32(MULTIFD_VERSION); + + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, page_count + 1); + } else { + p->iov = g_new0(struct iovec, page_count); + } p->name = g_strdup_printf("multifdsend_%d", i); - /* We need one extra place for the packet header */ - p->iov = g_new0(struct iovec, page_count + 1); p->page_size = qemu_target_page_size(); p->page_count = page_count; p->write_flags = 0; @@ -1110,7 +1146,9 @@ static void multifd_recv_terminate_threads(Error *err) * multifd_recv_thread may hung at MULTIFD_FLAG_SYNC handle code, * however try to wakeup it without harm in cleanup phase. */ - qemu_sem_post(&p->sem_sync); + if (multifd_use_packets()) { + qemu_sem_post(&p->sem_sync); + } /* * We could arrive here for two reasons: @@ -1185,7 +1223,7 @@ void multifd_recv_sync_main(void) int thread_count = migrate_multifd_channels(); int i; - if (!migrate_multifd()) { + if (!migrate_multifd() || !multifd_use_packets()) { return; } @@ -1220,13 +1258,14 @@ static void *multifd_recv_thread(void *opaque) { MultiFDRecvParams *p = opaque; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int ret; trace_multifd_recv_thread_start(p->id); rcu_register_thread(); while (true) { - uint32_t flags; + uint32_t flags = 0; bool has_data = false; p->normal_num = 0; @@ -1234,25 +1273,27 @@ static void *multifd_recv_thread(void *opaque) break; } - ret = qio_channel_read_all_eof(p->c, (void *)p->packet, - p->packet_len, &local_err); - if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ - break; - } + if (use_packets) { + ret = qio_channel_read_all_eof(p->c, (void *)p->packet, + p->packet_len, &local_err); + if (ret == 0 || ret == -1) { /* 0: EOF -1: Error */ + break; + } - qemu_mutex_lock(&p->mutex); - ret = multifd_recv_unfill_packet(p, &local_err); - if (ret) { + qemu_mutex_lock(&p->mutex); + ret = multifd_recv_unfill_packet(p, &local_err); + if (ret) { + qemu_mutex_unlock(&p->mutex); + break; + } + + flags = p->flags; + /* recv methods don't know how to handle the SYNC flag */ + p->flags &= ~MULTIFD_FLAG_SYNC; + has_data = !!p->normal_num; qemu_mutex_unlock(&p->mutex); - break; } - flags = p->flags; - /* recv methods don't know how to handle the SYNC flag */ - p->flags &= ~MULTIFD_FLAG_SYNC; - has_data = !!p->normal_num; - qemu_mutex_unlock(&p->mutex); - if (has_data) { ret = multifd_recv_state->ops->recv(p, &local_err); if (ret != 0) { @@ -1260,9 +1301,11 @@ static void *multifd_recv_thread(void *opaque) } } - if (flags & MULTIFD_FLAG_SYNC) { - qemu_sem_post(&multifd_recv_state->sem_sync); - qemu_sem_wait(&p->sem_sync); + if (use_packets) { + if (flags & MULTIFD_FLAG_SYNC) { + qemu_sem_post(&multifd_recv_state->sem_sync); + qemu_sem_wait(&p->sem_sync); + } } } @@ -1281,6 +1324,7 @@ int multifd_recv_setup(Error **errp) { int thread_count; uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size(); + bool use_packets = multifd_use_packets(); uint8_t i; /* @@ -1305,9 +1349,12 @@ int multifd_recv_setup(Error **errp) qemu_mutex_init(&p->mutex); qemu_sem_init(&p->sem_sync, 0); p->id = i; - p->packet_len = sizeof(MultiFDPacket_t) - + sizeof(uint64_t) * page_count; - p->packet = g_malloc0(p->packet_len); + + if (use_packets) { + p->packet_len = sizeof(MultiFDPacket_t) + + sizeof(uint64_t) * page_count; + p->packet = g_malloc0(p->packet_len); + } p->name = g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); @@ -1351,18 +1398,24 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) { MultiFDRecvParams *p; Error *local_err = NULL; + bool use_packets = multifd_use_packets(); int id; - id = multifd_recv_initial_packet(ioc, &local_err); - if (id < 0) { - multifd_recv_terminate_threads(local_err); - error_propagate_prepend(errp, local_err, - "failed to receive packet" - " via multifd channel %d: ", - qatomic_read(&multifd_recv_state->count)); - return; + if (use_packets) { + id = multifd_recv_initial_packet(ioc, &local_err); + if (id < 0) { + multifd_recv_terminate_threads(local_err); + error_propagate_prepend(errp, local_err, + "failed to receive packet" + " via multifd channel %d: ", + qatomic_read(&multifd_recv_state->count)); + return; + } + trace_multifd_recv_new_channel(id); + } else { + /* next patch gives this a meaningful value */ + id = 0; } - trace_multifd_recv_new_channel(id); p = &multifd_recv_state->params[id]; if (p->c != NULL) { -- Gitee From f26d12b6de65f24e8bb065896dd20f45f0c1a558 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:11 +0000 Subject: [PATCH 65/90] migration/multifd: Add new migration option zero-page-detection. commit 5fdbb1dfccfd59661c95cae760b8e276c5b8e65c upstream. This new parameter controls where the zero page checking is running. 1. If this parameter is set to 'legacy', zero page checking is done in the migration main thread. 2. If this parameter is set to 'none', zero page checking is disabled. Intel-SIG: commit 5fdbb1dfccfd migration/multifd: Add new migration option zero-page-detection. Signed-off-by: Hao Xiang Reviewed-by: Peter Xu Acked-by: Markus Armbruster Link: https://lore.kernel.org/r/20240311180015.3359271-4-hao.xiang@linux.dev Signed-off-by: Peter Xu Conflicts: hw/core/qdev-properties-system.c include/hw/qdev-properties-system.h [jz: resolve simple context conflicts] Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 10 ++++++++++ include/hw/qdev-properties-system.h | 4 ++++ migration/migration-hmp-cmds.c | 9 +++++++++ migration/options.c | 21 +++++++++++++++++++++ migration/options.h | 1 + migration/ram.c | 4 ++++ qapi/migration.json | 28 +++++++++++++++++++++++++++- 7 files changed, 76 insertions(+), 1 deletion(-) diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 1473ab3d5e9..4171dd104f9 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -687,6 +687,16 @@ const PropertyInfo qdev_prop_mig_mode = { .set_default_value = qdev_propinfo_set_default_value_enum, }; +const PropertyInfo qdev_prop_zero_page_detection = { + .name = "ZeroPageDetection", + .description = "zero_page_detection values, " + "none,legacy", + .enum_table = &ZeroPageDetection_lookup, + .get = qdev_propinfo_get_enum, + .set = qdev_propinfo_set_enum, + .set_default_value = qdev_propinfo_set_default_value_enum, +}; + /* --- Reserved Region --- */ /* diff --git a/include/hw/qdev-properties-system.h b/include/hw/qdev-properties-system.h index 91f7a2452d9..85d4a033897 100644 --- a/include/hw/qdev-properties-system.h +++ b/include/hw/qdev-properties-system.h @@ -8,6 +8,7 @@ extern const PropertyInfo qdev_prop_macaddr; extern const PropertyInfo qdev_prop_reserved_region; extern const PropertyInfo qdev_prop_multifd_compression; extern const PropertyInfo qdev_prop_mig_mode; +extern const PropertyInfo qdev_prop_zero_page_detection; extern const PropertyInfo qdev_prop_losttickpolicy; extern const PropertyInfo qdev_prop_blockdev_on_error; extern const PropertyInfo qdev_prop_bios_chs_trans; @@ -46,6 +47,9 @@ extern const PropertyInfo qdev_prop_cpus390entitlement; #define DEFINE_PROP_MIG_MODE(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_mig_mode, \ MigMode) +#define DEFINE_PROP_ZERO_PAGE_DETECTION(_n, _s, _f, _d) \ + DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_zero_page_detection, \ + ZeroPageDetection) #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \ DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_losttickpolicy, \ LostTickPolicy) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index a2675518e06..16070c5eb3f 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -345,6 +345,11 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict) monitor_printf(mon, "%s: %s\n", MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_COMPRESSION), MultiFDCompression_str(params->multifd_compression)); + assert(params->has_zero_page_detection); + monitor_printf(mon, "%s: %s\n", + MigrationParameter_str(MIGRATION_PARAMETER_ZERO_PAGE_DETECTION), + qapi_enum_lookup(&ZeroPageDetection_lookup, + params->zero_page_detection)); monitor_printf(mon, "%s: %" PRIu64 " bytes\n", MigrationParameter_str(MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE), params->xbzrle_cache_size); @@ -651,6 +656,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); break; + case MIGRATION_PARAMETER_ZERO_PAGE_DETECTION: + p->has_zero_page_detection = true; + visit_type_ZeroPageDetection(v, param, &p->zero_page_detection, &err); + break; case MIGRATION_PARAMETER_XBZRLE_CACHE_SIZE: p->has_xbzrle_cache_size = true; if (!visit_type_size(v, param, &cache_size, &err)) { diff --git a/migration/options.c b/migration/options.c index 70f6beb83c0..38403bf7459 100644 --- a/migration/options.c +++ b/migration/options.c @@ -179,6 +179,9 @@ Property migration_properties[] = { DEFINE_PROP_MIG_MODE("mode", MigrationState, parameters.mode, MIG_MODE_NORMAL), + DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState, + parameters.zero_page_detection, + ZERO_PAGE_DETECTION_LEGACY), DEFINE_PROP_STRING("sev-pdh", MigrationState, parameters.sev_pdh), DEFINE_PROP_STRING("sev-plat-cert", MigrationState, parameters.sev_plat_cert), DEFINE_PROP_STRING("sev-amd-cert", MigrationState, parameters.sev_amd_cert), @@ -904,6 +907,13 @@ uint64_t migrate_xbzrle_cache_size(void) return s->parameters.xbzrle_cache_size; } +ZeroPageDetection migrate_zero_page_detection(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.zero_page_detection; +} + /* parameter setters */ void migrate_set_block_incremental(bool value) @@ -1017,6 +1027,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit; params->has_mode = true; params->mode = s->parameters.mode; + params->has_zero_page_detection = true; + params->zero_page_detection = s->parameters.zero_page_detection; return params; } @@ -1053,6 +1065,7 @@ void migrate_params_init(MigrationParameters *params) params->has_x_vcpu_dirty_limit_period = true; params->has_vcpu_dirty_limit = true; params->has_mode = true; + params->has_zero_page_detection = true; params->sev_pdh = g_strdup(""); params->sev_plat_cert = g_strdup(""); @@ -1359,6 +1372,10 @@ static void migrate_params_test_apply(MigrateSetParameters *params, dest->mode = params->mode; } + if (params->has_zero_page_detection) { + dest->zero_page_detection = params->zero_page_detection; + } + if (params->sev_pdh) { assert(params->sev_pdh->type == QTYPE_QSTRING); dest->sev_pdh = params->sev_pdh->u.s; @@ -1516,6 +1533,10 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) s->parameters.mode = params->mode; } + if (params->has_zero_page_detection) { + s->parameters.zero_page_detection = params->zero_page_detection; + } + if (params->sev_pdh) { g_free(s->parameters.sev_pdh); assert(params->sev_pdh->type == QTYPE_QSTRING); diff --git a/migration/options.h b/migration/options.h index 246c160aeee..b7c4fb38615 100644 --- a/migration/options.h +++ b/migration/options.h @@ -93,6 +93,7 @@ const char *migrate_tls_authz(void); const char *migrate_tls_creds(void); const char *migrate_tls_hostname(void); uint64_t migrate_xbzrle_cache_size(void); +ZeroPageDetection migrate_zero_page_detection(void); /* parameters setters */ diff --git a/migration/ram.c b/migration/ram.c index 9d176281008..b350a1e8d63 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1138,6 +1138,10 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss, QEMUFile *file = pss->pss_channel; int len = 0; + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_NONE) { + return 0; + } + if (!buffer_is_zero(p, TARGET_PAGE_SIZE)) { return 0; } diff --git a/qapi/migration.json b/qapi/migration.json index 3c4724db1b2..af0cefce045 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -653,6 +653,18 @@ { 'enum': 'MigMode', 'data': [ 'normal', 'cpr-reboot' ] } +## +# @ZeroPageDetection: +# +# @none: Do not perform zero page checking. +# +# @legacy: Perform zero page checking in main migration thread. +# +# Since: 9.0 +## +{ 'enum': 'ZeroPageDetection', + 'data': [ 'none', 'legacy' ] } + ## # @BitmapMigrationBitmapAliasTransform: # @@ -874,6 +886,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or # pdh filename for hygon # (Since 4.2) @@ -919,6 +935,7 @@ { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, 'vcpu-dirty-limit', 'mode', + 'zero-page-detection', 'sev-pdh', 'sev-plat-cert', 'sev-amd-cert'] } ## @@ -1074,6 +1091,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or # pdh filename for hygon # (Since 4.2) @@ -1139,11 +1160,11 @@ 'features': [ 'unstable' ] }, '*vcpu-dirty-limit': 'uint64', '*mode': 'MigMode', + '*zero-page-detection': 'ZeroPageDetection', '*sev-pdh': 'StrOrNull', '*sev-plat-cert': 'StrOrNull', '*sev-amd-cert' : 'StrOrNull' } } - ## # @migrate-set-parameters: # @@ -1317,6 +1338,10 @@ # @mode: Migration mode. See description in @MigMode. Default is 'normal'. # (Since 8.2) # +# @zero-page-detection: Whether and how to detect zero pages. +# See description in @ZeroPageDetection. Default is 'legacy'. +# (since 9.0) +# # @sev-pdh: The target host platform diffie-hellman key encoded in base64, or # pdh filename for hygon # (Since 4.2) @@ -1379,6 +1404,7 @@ 'features': [ 'unstable' ] }, '*vcpu-dirty-limit': 'uint64', '*mode': 'MigMode', + '*zero-page-detection': 'ZeroPageDetection', '*sev-pdh': 'str', '*sev-plat-cert': 'str', '*sev-amd-cert' : 'str'} } -- Gitee From 9bd737ed2bf81eb40326d597220e2dc03d796cc1 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:12 +0000 Subject: [PATCH 66/90] migration/multifd: Implement zero page transmission on the multifd thread. commit 303e6f54f9657be76ee060006ee2d4cacff263a0 upstream. 1. Add zero_pages field in MultiFDPacket_t. 2. Implements the zero page detection and handling on the multifd threads for non-compression, zlib and zstd compression backends. 3. Added a new value 'multifd' in ZeroPageDetection enumeration. 4. Adds zero page counters and updates multifd send/receive tracing format to track the newly added counters. Intel-SIG: commit 303e6f54f965 migration/multifd: Implement zero page transmission on the multifd thread. Signed-off-by: Hao Xiang Acked-by: Markus Armbruster Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240311180015.3359271-5-hao.xiang@linux.dev Signed-off-by: Peter Xu Conflicts: migration/multifd.c [jz: there is no multifd_set_file_bitmap() because we didn't backport mapped-ram, so abandon changes in multifd_set_file_bitmap()] Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-zero-page.c | 87 ++++++++++++++++++++++++++++++++ migration/multifd-zlib.c | 21 ++++++-- migration/multifd-zstd.c | 20 ++++++-- migration/multifd.c | 83 +++++++++++++++++++++++++----- migration/multifd.h | 23 ++++++++- migration/ram.c | 1 - migration/trace-events | 8 +-- qapi/migration.json | 7 ++- 10 files changed, 222 insertions(+), 31 deletions(-) create mode 100644 migration/multifd-zero-page.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 4171dd104f9..67d40c526c5 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -690,7 +690,7 @@ const PropertyInfo qdev_prop_mig_mode = { const PropertyInfo qdev_prop_zero_page_detection = { .name = "ZeroPageDetection", .description = "zero_page_detection values, " - "none,legacy", + "none,legacy,multifd", .enum_table = &ZeroPageDetection_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index 92b1cc42978..1eeb915ff63 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -22,6 +22,7 @@ system_ss.add(files( 'migration.c', 'multifd.c', 'multifd-zlib.c', + 'multifd-zero-page.c', 'ram-compress.c', 'options.c', 'postcopy-ram.c', diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c new file mode 100644 index 00000000000..1ba38be6361 --- /dev/null +++ b/migration/multifd-zero-page.c @@ -0,0 +1,87 @@ +/* + * Multifd zero page detection implementation. + * + * Copyright (c) 2024 Bytedance Inc + * + * Authors: + * Hao Xiang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "exec/ramblock.h" +#include "migration.h" +#include "multifd.h" +#include "options.h" +#include "ram.h" + +static bool multifd_zero_page_enabled(void) +{ + return migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD; +} + +static void swap_page_offset(ram_addr_t *pages_offset, int a, int b) +{ + ram_addr_t temp; + + if (a == b) { + return; + } + + temp = pages_offset[a]; + pages_offset[a] = pages_offset[b]; + pages_offset[b] = temp; +} + +/** + * multifd_send_zero_page_detect: Perform zero page detection on all pages. + * + * Sorts normal pages before zero pages in p->pages->offset and updates + * p->pages->normal_num. + * + * @param p A pointer to the send params. + */ +void multifd_send_zero_page_detect(MultiFDSendParams *p) +{ + MultiFDPages_t *pages = p->pages; + RAMBlock *rb = pages->block; + int i = 0; + int j = pages->num - 1; + + if (!multifd_zero_page_enabled()) { + pages->normal_num = pages->num; + return; + } + + /* + * Sort the page offset array by moving all normal pages to + * the left and all zero pages to the right of the array. + */ + while (i <= j) { + uint64_t offset = pages->offset[i]; + + if (!buffer_is_zero(rb->host + offset, p->page_size)) { + i++; + continue; + } + + swap_page_offset(pages->offset, i, j); + ram_release_page(rb->idstr, offset); + j--; + } + + pages->normal_num = i; +} + +void multifd_recv_zero_page_process(MultiFDRecvParams *p) +{ + for (int i = 0; i < p->zero_num; i++) { + void *page = p->host + p->zero[i]; + if (!buffer_is_zero(page, p->page_size)) { + memset(page, 0, p->page_size); + } + } +} diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 6120faad65a..83c03743801 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -123,13 +123,15 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; - multifd_send_prepare_header(p); + if (!multifd_send_prepare_common(p)) { + goto out; + } - for (i = 0; i < pages->num; i++) { + for (i = 0; i < pages->normal_num; i++) { uint32_t available = z->zbuff_len - out_size; int flush = Z_NO_FLUSH; - if (i == pages->num - 1) { + if (i == pages->normal_num - 1) { flush = Z_SYNC_FLUSH; } @@ -172,10 +174,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = out_size; p->iovs_num++; p->next_packet_size = out_size; - p->flags |= MULTIFD_FLAG_ZLIB; +out: + p->flags |= MULTIFD_FLAG_ZLIB; multifd_send_fill_packet(p); - return 0; } @@ -261,6 +263,14 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZLIB); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { @@ -310,6 +320,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) p->id, out_size, expected_size); return -1; } + return 0; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index cac236833d0..02112255adc 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -118,16 +118,18 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) int ret; uint32_t i; - multifd_send_prepare_header(p); + if (!multifd_send_prepare_common(p)) { + goto out; + } z->out.dst = z->zbuff; z->out.size = z->zbuff_len; z->out.pos = 0; - for (i = 0; i < pages->num; i++) { + for (i = 0; i < pages->normal_num; i++) { ZSTD_EndDirective flush = ZSTD_e_continue; - if (i == pages->num - 1) { + if (i == pages->normal_num - 1) { flush = ZSTD_e_flush; } z->in.src = p->pages->block->host + pages->offset[i]; @@ -161,10 +163,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) p->iov[p->iovs_num].iov_len = z->out.pos; p->iovs_num++; p->next_packet_size = z->out.pos; - p->flags |= MULTIFD_FLAG_ZSTD; +out: + p->flags |= MULTIFD_FLAG_ZSTD; multifd_send_fill_packet(p); - return 0; } @@ -257,6 +259,14 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_ZSTD); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp); if (ret != 0) { diff --git a/migration/multifd.c b/migration/multifd.c index 9e617b608ed..14f25277089 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -11,6 +11,7 @@ */ #include "qemu/osdep.h" +#include "qemu/cutils.h" #include "qemu/rcu.h" #include "exec/target_page.h" #include "sysemu/sysemu.h" @@ -131,13 +132,13 @@ static void multifd_send_prepare_iovs(MultiFDSendParams *p) { MultiFDPages_t *pages = p->pages; - for (int i = 0; i < pages->num; i++) { + for (int i = 0; i < pages->normal_num; i++) { p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i]; p->iov[p->iovs_num].iov_len = p->page_size; p->iovs_num++; } - p->next_packet_size = pages->num * p->page_size; + p->next_packet_size = pages->normal_num * p->page_size; } /** @@ -156,6 +157,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) bool use_zero_copy_send = migrate_zero_copy_send(); int ret; + multifd_send_zero_page_detect(p); + if (!multifd_use_packets()) { multifd_send_prepare_iovs(p); return 0; @@ -237,6 +240,13 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) p->id, flags, MULTIFD_FLAG_NOCOMP); return -1; } + + multifd_recv_zero_page_process(p); + + if (!p->normal_num) { + return 0; + } + for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; p->iov[i].iov_len = p->page_size; @@ -271,6 +281,7 @@ static void multifd_pages_reset(MultiFDPages_t *pages) * overwritten later when reused. */ pages->num = 0; + pages->normal_num = 0; pages->block = NULL; } @@ -362,11 +373,13 @@ void multifd_send_fill_packet(MultiFDSendParams *p) MultiFDPacket_t *packet = p->packet; MultiFDPages_t *pages = p->pages; uint64_t packet_num; + uint32_t zero_num = pages->num - pages->normal_num; int i; packet->flags = cpu_to_be32(p->flags); packet->pages_alloc = cpu_to_be32(p->pages->allocated); - packet->normal_pages = cpu_to_be32(pages->num); + packet->normal_pages = cpu_to_be32(pages->normal_num); + packet->zero_pages = cpu_to_be32(zero_num); packet->next_packet_size = cpu_to_be32(p->next_packet_size); packet_num = qatomic_fetch_inc(&multifd_send_state->packet_num); @@ -384,10 +397,11 @@ void multifd_send_fill_packet(MultiFDSendParams *p) } p->packets_sent++; - p->total_normal_pages += pages->num; + p->total_normal_pages += pages->normal_num; + p->total_zero_pages += zero_num; - trace_multifd_send(p->id, packet_num, pages->num, p->flags, - p->next_packet_size); + trace_multifd_send(p->id, packet_num, pages->normal_num, zero_num, + p->flags, p->next_packet_size); } static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) @@ -428,20 +442,29 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal_num = be32_to_cpu(packet->normal_pages); if (p->normal_num > packet->pages_alloc) { error_setg(errp, "multifd: received packet " - "with %u pages and expected maximum pages are %u", + "with %u normal pages and expected maximum pages are %u", p->normal_num, packet->pages_alloc) ; return -1; } + p->zero_num = be32_to_cpu(packet->zero_pages); + if (p->zero_num > packet->pages_alloc - p->normal_num) { + error_setg(errp, "multifd: received packet " + "with %u zero pages and expected maximum zero pages are %u", + p->zero_num, packet->pages_alloc - p->normal_num) ; + return -1; + } + p->next_packet_size = be32_to_cpu(packet->next_packet_size); p->packet_num = be64_to_cpu(packet->packet_num); p->packets_recved++; p->total_normal_pages += p->normal_num; + p->total_zero_pages += p->zero_num; - trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->flags, - p->next_packet_size); + trace_multifd_recv(p->id, p->packet_num, p->normal_num, p->zero_num, + p->flags, p->next_packet_size); - if (p->normal_num == 0) { + if (p->normal_num == 0 && p->zero_num == 0) { return 0; } @@ -467,6 +490,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp) p->normal[i] = offset; } + for (i = 0; i < p->zero_num; i++) { + uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]); + + if (offset > (p->block->used_length - p->page_size)) { + error_setg(errp, "multifd: offset too long %" PRIu64 + " (max " RAM_ADDR_FMT ")", + offset, p->block->used_length); + return -1; + } + p->zero[i] = offset; + } + return 0; } @@ -862,6 +897,8 @@ static void *multifd_send_thread(void *opaque) stat64_add(&mig_stats.multifd_bytes, p->next_packet_size + p->packet_len); + stat64_add(&mig_stats.normal_pages, pages->normal_num); + stat64_add(&mig_stats.zero_pages, pages->num - pages->normal_num); multifd_pages_reset(p->pages); p->next_packet_size = 0; @@ -909,7 +946,8 @@ out: rcu_unregister_thread(); migration_threads_remove(thread); - trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages); + trace_multifd_send_thread_end(p->id, p->packets_sent, p->total_normal_pages, + p->total_zero_pages); return NULL; } @@ -1185,6 +1223,8 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->iov = NULL; g_free(p->normal); p->normal = NULL; + g_free(p->zero); + p->zero = NULL; multifd_recv_state->ops->recv_cleanup(p); } @@ -1290,7 +1330,7 @@ static void *multifd_recv_thread(void *opaque) flags = p->flags; /* recv methods don't know how to handle the SYNC flag */ p->flags &= ~MULTIFD_FLAG_SYNC; - has_data = !!p->normal_num; + has_data = p->normal_num || p->zero_num; qemu_mutex_unlock(&p->mutex); } @@ -1315,7 +1355,9 @@ static void *multifd_recv_thread(void *opaque) } rcu_unregister_thread(); - trace_multifd_recv_thread_end(p->id, p->packets_recved, p->total_normal_pages); + trace_multifd_recv_thread_end(p->id, p->packets_recved, + p->total_normal_pages, + p->total_zero_pages); return NULL; } @@ -1358,6 +1400,7 @@ int multifd_recv_setup(Error **errp) p->name = g_strdup_printf("multifdrecv_%d", i); p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); + p->zero = g_new0(ram_addr_t, page_count); p->page_count = page_count; p->page_size = qemu_target_page_size(); } @@ -1433,3 +1476,17 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) QEMU_THREAD_JOINABLE); qatomic_inc(&multifd_recv_state->count); } + +bool multifd_send_prepare_common(MultiFDSendParams *p) +{ + multifd_send_zero_page_detect(p); + + if (!p->pages->normal_num) { + p->next_packet_size = 0; + return false; + } + + multifd_send_prepare_header(p); + + return true; +} diff --git a/migration/multifd.h b/migration/multifd.h index 6a54377cc14..d99603c6a4a 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -48,14 +48,24 @@ typedef struct { /* size of the next packet that contains pages */ uint32_t next_packet_size; uint64_t packet_num; - uint64_t unused[4]; /* Reserved for future use */ + /* zero pages */ + uint32_t zero_pages; + uint32_t unused32[1]; /* Reserved for future use */ + uint64_t unused64[3]; /* Reserved for future use */ char ramblock[256]; + /* + * This array contains the pointers to: + * - normal pages (initial normal_pages entries) + * - zero pages (following zero_pages entries) + */ uint64_t offset[]; } __attribute__((packed)) MultiFDPacket_t; typedef struct { /* number of used pages */ uint32_t num; + /* number of normal pages */ + uint32_t normal_num; /* number of allocated pages */ uint32_t allocated; /* offset of each page */ @@ -122,6 +132,8 @@ typedef struct { uint64_t packets_sent; /* non zero pages sent through this channel */ uint64_t total_normal_pages; + /* zero pages sent through this channel */ + uint64_t total_zero_pages; /* buffers to send */ struct iovec *iov; /* number of iovs used */ @@ -176,12 +188,18 @@ typedef struct { uint8_t *host; /* non zero pages recv through this channel */ uint64_t total_normal_pages; + /* zero pages recv through this channel */ + uint64_t total_zero_pages; /* buffers to recv */ struct iovec *iov; /* Pages that are not zero */ ram_addr_t *normal; /* num of non zero pages */ uint32_t normal_num; + /* Pages that are zero */ + ram_addr_t *zero; + /* num of zero pages */ + uint32_t zero_num; /* used for de-compression methods */ void *compress_data; } MultiFDRecvParams; @@ -203,6 +221,9 @@ typedef struct { void multifd_register_ops(int method, MultiFDMethods *ops); void multifd_send_fill_packet(MultiFDSendParams *p); +bool multifd_send_prepare_common(MultiFDSendParams *p); +void multifd_send_zero_page_detect(MultiFDSendParams *p); +void multifd_recv_zero_page_process(MultiFDRecvParams *p); static inline void multifd_send_prepare_header(MultiFDSendParams *p) { diff --git a/migration/ram.c b/migration/ram.c index b350a1e8d63..273ccec35be 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -1393,7 +1393,6 @@ static int ram_save_multifd_page(RAMBlock *block, ram_addr_t offset) if (!multifd_queue_page(block, offset)) { return -1; } - stat64_add(&mig_stats.normal_pages, 1); return 1; } diff --git a/migration/trace-events b/migration/trace-events index bf1a0696325..f0e1cb80c75 100644 --- a/migration/trace-events +++ b/migration/trace-events @@ -128,21 +128,21 @@ postcopy_preempt_reset_channel(void) "" # multifd.c multifd_new_send_channel_async(uint8_t id) "channel %u" multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p" -multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " pages %u flags 0x%x next packet size %u" +multifd_recv(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t zero, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_recv_new_channel(uint8_t id) "channel %u" multifd_recv_sync_main(long packet_num) "packet num %ld" multifd_recv_sync_main_signal(uint8_t id) "channel %u" multifd_recv_sync_main_wait(uint8_t id) "iter %u" multifd_recv_terminate_threads(bool error) "error %d" -multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t pages) "channel %u packets %" PRIu64 " pages %" PRIu64 +multifd_recv_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_recv_thread_start(uint8_t id) "%u" -multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u flags 0x%x next packet size %u" +multifd_send(uint8_t id, uint64_t packet_num, uint32_t normal_pages, uint32_t zero_pages, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " normal pages %u zero pages %u flags 0x%x next packet size %u" multifd_send_error(uint8_t id) "channel %u" multifd_send_sync_main(long packet_num) "packet num %ld" multifd_send_sync_main_signal(uint8_t id) "channel %u" multifd_send_sync_main_wait(uint8_t id) "channel %u" multifd_send_terminate_threads(void) "" -multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 +multifd_send_thread_end(uint8_t id, uint64_t packets, uint64_t normal_pages, uint64_t zero_pages) "channel %u packets %" PRIu64 " normal pages %" PRIu64 " zero pages %" PRIu64 multifd_send_thread_start(uint8_t id) "%u" multifd_tls_outgoing_handshake_start(void *ioc, void *tioc, const char *hostname) "ioc=%p tioc=%p hostname=%s" multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s" diff --git a/qapi/migration.json b/qapi/migration.json index af0cefce045..e7249f24f84 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -660,10 +660,15 @@ # # @legacy: Perform zero page checking in main migration thread. # +# @multifd: Perform zero page checking in multifd sender thread if +# multifd migration is enabled, else in the main migration +# thread as for @legacy. +# # Since: 9.0 +# ## { 'enum': 'ZeroPageDetection', - 'data': [ 'none', 'legacy' ] } + 'data': [ 'none', 'legacy', 'multifd' ] } ## # @BitmapMigrationBitmapAliasTransform: -- Gitee From 081831f9c7e7f66ec0f6348d3bb4d989e9ef8802 Mon Sep 17 00:00:00 2001 From: Hao Xiang Date: Mon, 11 Mar 2024 18:00:13 +0000 Subject: [PATCH 67/90] migration/multifd: Implement ram_save_target_page_multifd to handle multifd version of MigrationOps::ram_save_target_page. commit 9ae90f73e623c8b8c7ec1fccd8ca493805df8fbd upstream. 1. Add a dedicated handler for MigrationOps::ram_save_target_page in multifd live migration. 2. Refactor ram_save_target_page_legacy so that the legacy and multifd handlers don't have internal functions calling into each other. Intel-SIG: commit 9ae90f73e623 migration/multifd: Implement ram_save_target_page_multifd to handle multifd version of MigrationOps::ram_save_target_page. Signed-off-by: Hao Xiang Reviewed-by: Fabiano Rosas Message-Id: <20240226195654.934709-4-hao.xiang@bytedance.com> Link: https://lore.kernel.org/r/20240311180015.3359271-6-hao.xiang@linux.dev Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/ram.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/migration/ram.c b/migration/ram.c index 273ccec35be..0e08d86a466 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -2230,7 +2230,6 @@ static bool encrypted_test_list(RAMState *rs, RAMBlock *block, */ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) { - RAMBlock *block = pss->block; ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; int res; @@ -2257,17 +2256,33 @@ static int ram_save_target_page_legacy(RAMState *rs, PageSearchStatus *pss) return 1; } + return ram_save_page(rs, pss); +} + +/** + * ram_save_target_page_multifd: send one target page to multifd workers + * + * Returns 1 if the page was queued, -1 otherwise. + * + * @rs: current RAM state + * @pss: data about the page we want to send + */ +static int ram_save_target_page_multifd(RAMState *rs, PageSearchStatus *pss) +{ + RAMBlock *block = pss->block; + ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS; + /* - * Do not use multifd in postcopy as one whole host page should be - * placed. Meanwhile postcopy requires atomic update of pages, so even - * if host page size == guest page size the dest guest during run may - * still see partially copied pages which is data corruption. + * While using multifd live migration, we still need to handle zero + * page checking on the migration main thread. */ - if (migrate_multifd() && !migration_in_postcopy()) { - return ram_save_multifd_page(block, offset); + if (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_LEGACY) { + if (save_zero_page(rs, pss, offset)) { + return 1; + } } - return ram_save_page(rs, pss); + return ram_save_multifd_page(block, offset); } /* Should be called before sending a host page */ @@ -3403,7 +3418,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) } migration_ops = g_malloc0(sizeof(MigrationOps)); - migration_ops->ram_save_target_page = ram_save_target_page_legacy; + + if (migrate_multifd()) { + migration_ops->ram_save_target_page = ram_save_target_page_multifd; + } else { + migration_ops->ram_save_target_page = ram_save_target_page_legacy; + } qemu_mutex_unlock_iothread(); ret = multifd_send_sync_main(); -- Gitee From 1a5e7d9d1c12ff5322921383c7b31d1269a698b1 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 1 Apr 2024 23:41:10 +0800 Subject: [PATCH 68/90] migration/multifd: solve zero page causing multiple page faults commit 5ef7e26bdb7eda10d6d5e1b77121be9945e5e550 upstream. Implemented recvbitmap tracking of received pages in multifd. If the zero page appears for the first time in the recvbitmap, this page is not checked and set. If the zero page has already appeared in the recvbitmap, there is no need to check the data but directly set the data to 0, because it is unlikely that the zero page will be migrated multiple times. Intel-SIG: commit 5ef7e26bdb7e migration/multifd: solve zero page causing multiple page faults Signed-off-by: Yuan Liu Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240401154110.2028453-2-yuan1.liu@intel.com [peterx: touch up the comment, as the bitmap is used outside postcopy now] Signed-off-by: Peter Xu Conflicts: include/exec/ramblock.h [jz: resolve context conflict due to mmaped-ram which is not backported] Signed-off-by: Jason Zeng --- include/exec/ramblock.h | 2 +- migration/multifd-zero-page.c | 4 +++- migration/multifd-zlib.c | 1 + migration/multifd-zstd.c | 1 + migration/multifd.c | 1 + migration/ram.c | 4 ++++ migration/ram.h | 1 + 7 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index 69c6a539029..8f9579ed705 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -44,7 +44,7 @@ struct RAMBlock { size_t page_size; /* dirty bitmap used during migration */ unsigned long *bmap; - /* bitmap of already received pages in postcopy */ + /* Bitmap of already received pages. Only used on destination side. */ unsigned long *receivedmap; /* diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c index 1ba38be6361..e1b8370f88e 100644 --- a/migration/multifd-zero-page.c +++ b/migration/multifd-zero-page.c @@ -80,8 +80,10 @@ void multifd_recv_zero_page_process(MultiFDRecvParams *p) { for (int i = 0; i < p->zero_num; i++) { void *page = p->host + p->zero[i]; - if (!buffer_is_zero(page, p->page_size)) { + if (ramblock_recv_bitmap_test_byte_offset(p->block, p->zero[i])) { memset(page, 0, p->page_size); + } else { + ramblock_recv_bitmap_set_offset(p->block, p->zero[i]); } } } diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index 83c03743801..b210725f6e7 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -284,6 +284,7 @@ static int zlib_recv(MultiFDRecvParams *p, Error **errp) int flush = Z_NO_FLUSH; unsigned long start = zs->total_out; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); if (i == p->normal_num - 1) { flush = Z_SYNC_FLUSH; } diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 02112255adc..256858df0a0 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -278,6 +278,7 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) z->in.pos = 0; for (i = 0; i < p->normal_num; i++) { + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); z->out.dst = p->host + p->normal[i]; z->out.size = p->page_size; z->out.pos = 0; diff --git a/migration/multifd.c b/migration/multifd.c index 14f25277089..fbab8c5b720 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -250,6 +250,7 @@ static int nocomp_recv(MultiFDRecvParams *p, Error **errp) for (int i = 0; i < p->normal_num; i++) { p->iov[i].iov_base = p->host + p->normal[i]; p->iov[i].iov_len = p->page_size; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } return qio_channel_readv_all(p->c, p->iov, p->normal_num, errp); } diff --git a/migration/ram.c b/migration/ram.c index 0e08d86a466..9d49765bcf8 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -272,6 +272,10 @@ void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, nr); } +void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset) +{ + set_bit_atomic(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap); +} #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL) /* diff --git a/migration/ram.h b/migration/ram.h index 9b937a446b7..cd263df0266 100644 --- a/migration/ram.h +++ b/migration/ram.h @@ -69,6 +69,7 @@ int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr); bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset); void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr); void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr, size_t nr); +void ramblock_recv_bitmap_set_offset(RAMBlock *rb, uint64_t byte_offset); int64_t ramblock_recv_bitmap_send(QEMUFile *file, const char *block_name); bool ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb, Error **errp); -- Gitee From a594a8ade977e0294ae4be9f6bba1a9f83937028 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:04 +0800 Subject: [PATCH 69/90] docs/migration: add qpl compression feature commit 0d40b3d76ced77c1c82c77a636af703fabdb407c upstream. add Intel Query Processing Library (QPL) compression method introduction Intel-SIG: commit 0d40b3d76ced docs/migration: add qpl compression feature Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Acked-by: Peter Xu Signed-off-by: Fabiano Rosas Conflicts: docs/devel/migration/features.rst [jz: resolve simple context conflict] Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/qpl-compression.rst | 260 +++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 docs/devel/migration/qpl-compression.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index a9acaf618ee..9819393c12f 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -10,3 +10,4 @@ Migration has plenty of features to support different use cases. dirty-limit vfio virtio + qpl-compression diff --git a/docs/devel/migration/qpl-compression.rst b/docs/devel/migration/qpl-compression.rst new file mode 100644 index 00000000000..990992d7865 --- /dev/null +++ b/docs/devel/migration/qpl-compression.rst @@ -0,0 +1,260 @@ +=============== +QPL Compression +=============== +The Intel Query Processing Library (Intel ``QPL``) is an open-source library to +provide compression and decompression features and it is based on deflate +compression algorithm (RFC 1951). + +The ``QPL`` compression relies on Intel In-Memory Analytics Accelerator(``IAA``) +and Shared Virtual Memory(``SVM``) technology, they are new features supported +from Intel 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids +processor(``SPR``). + +For more ``QPL`` introduction, please refer to `QPL Introduction +`_ + +QPL Compression Framework +========================= + +:: + + +----------------+ +------------------+ + | MultiFD Thread | |accel-config tool | + +-------+--------+ +--------+---------+ + | | + | | + |compress/decompress | + +-------+--------+ | Setup IAA + | QPL library | | Resources + +-------+---+----+ | + | | | + | +-------------+-------+ + | Open IAA | + | Devices +-----+-----+ + | |idxd driver| + | +-----+-----+ + | | + | | + | +-----+-----+ + +-----------+IAA Devices| + Submit jobs +-----------+ + via enqcmd + + +QPL Build And Installation +-------------------------- + +.. code-block:: shell + + $git clone --recursive https://github.com/intel/qpl.git qpl + $mkdir qpl/build + $cd qpl/build + $cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DQPL_LIBRARY_TYPE=SHARED .. + $sudo cmake --build . --target install + +For more details about ``QPL`` installation, please refer to `QPL Installation +`_ + +IAA Device Management +--------------------- + +The number of ``IAA`` devices will vary depending on the Xeon product model. +On a ``SPR`` server, there can be a maximum of 8 ``IAA`` devices, with up to +4 devices per socket. + +By default, all ``IAA`` devices are disabled and need to be configured and +enabled by users manually. + +Check the number of devices through the following command + +.. code-block:: shell + + #lspci -d 8086:0cfe + 6a:02.0 System peripheral: Intel Corporation Device 0cfe + 6f:02.0 System peripheral: Intel Corporation Device 0cfe + 74:02.0 System peripheral: Intel Corporation Device 0cfe + 79:02.0 System peripheral: Intel Corporation Device 0cfe + e7:02.0 System peripheral: Intel Corporation Device 0cfe + ec:02.0 System peripheral: Intel Corporation Device 0cfe + f1:02.0 System peripheral: Intel Corporation Device 0cfe + f6:02.0 System peripheral: Intel Corporation Device 0cfe + +IAA Device Configuration And Enabling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``accel-config`` tool is used to enable ``IAA`` devices and configure +``IAA`` hardware resources(work queues and engines). One ``IAA`` device +has 8 work queues and 8 processing engines, multiple engines can be assigned +to a work queue via ``group`` attribute. + +For ``accel-config`` installation, please refer to `accel-config installation +`_ + +One example of configuring and enabling an ``IAA`` device. + +.. code-block:: shell + + #accel-config config-engine iax1/engine1.0 -g 0 + #accel-config config-engine iax1/engine1.1 -g 0 + #accel-config config-engine iax1/engine1.2 -g 0 + #accel-config config-engine iax1/engine1.3 -g 0 + #accel-config config-engine iax1/engine1.4 -g 0 + #accel-config config-engine iax1/engine1.5 -g 0 + #accel-config config-engine iax1/engine1.6 -g 0 + #accel-config config-engine iax1/engine1.7 -g 0 + #accel-config config-wq iax1/wq1.0 -g 0 -s 128 -p 10 -b 1 -t 128 -m shared -y user -n app1 -d user + #accel-config enable-device iax1 + #accel-config enable-wq iax1/wq1.0 + +.. note:: + IAX is an early name for IAA + +- The ``IAA`` device index is 1, use ``ls -lh /sys/bus/dsa/devices/iax*`` + command to query the ``IAA`` device index. + +- 8 engines and 1 work queue are configured in group 0, so all compression jobs + submitted to this work queue can be processed by all engines at the same time. + +- Set work queue attributes including the work mode, work queue size and so on. + +- Enable the ``IAA1`` device and work queue 1.0 + +.. note:: + + Set work queue mode to shared mode, since ``QPL`` library only supports + shared mode + +For more detailed configuration, please refer to `IAA Configuration Samples +`_ + +IAA Unit Test +^^^^^^^^^^^^^ + +- Enabling ``IAA`` devices for Xeon platform, please refer to `IAA User Guide + `_ + +- ``IAA`` device driver is Intel Data Accelerator Driver (idxd), it is + recommended that the minimum version of Linux kernel is 5.18. + +- Add ``"intel_iommu=on,sm_on"`` parameter to kernel command line + for ``SVM`` feature enabling. + +Here is an easy way to verify ``IAA`` device driver and ``SVM`` with `iaa_test +`_ + +.. code-block:: shell + + #./test/iaa_test + [ info] alloc wq 0 shared size 128 addr 0x7f26cebe5000 batch sz 0xfffffffe xfer sz 0x80000000 + [ info] test noop: tflags 0x1 num_desc 1 + [ info] preparing descriptor for noop + [ info] Submitted all noop jobs + [ info] verifying task result for 0x16f7e20 + [ info] test with op 0 passed + + +IAA Resources Allocation For Migration +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There is no ``IAA`` resource configuration parameters for migration and +``accel-config`` tool configuration cannot directly specify the ``IAA`` +resources used for migration. + +The multifd migration with ``QPL`` compression method will use all work +queues that are enabled and shared mode. + +.. note:: + + Accessing IAA resources requires ``sudo`` command or ``root`` privileges + by default. Administrators can modify the IAA device node ownership + so that QEMU can use IAA with specified user permissions. + + For example + + #chown -R qemu /dev/iax + +Shared Virtual Memory(SVM) Introduction +======================================= + +An ability for an accelerator I/O device to operate in the same virtual +memory space of applications on host processors. It also implies the +ability to operate from pageable memory, avoiding functional requirements +to pin memory for DMA operations. + +When using ``SVM`` technology, users do not need to reserve memory for the +``IAA`` device and perform pin memory operation. The ``IAA`` device can +directly access data using the virtual address of the process. + +For more ``SVM`` technology, please refer to +`Shared Virtual Addressing (SVA) with ENQCMD +`_ + + +How To Use QPL Compression In Migration +======================================= + +1 - Installation of ``QPL`` library and ``accel-config`` library if using IAA + +2 - Configure and enable ``IAA`` devices and work queues via ``accel-config`` + +3 - Build ``QEMU`` with ``--enable-qpl`` parameter + + E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qpl`` + +4 - Enable ``QPL`` compression during migration + + Set ``migrate_set_parameter multifd-compression qpl`` when migrating, the + ``QPL`` compression does not support configuring the compression level, it + only supports one compression level. + +The Difference Between QPL And ZLIB +=================================== + +Although both ``QPL`` and ``ZLIB`` are based on the deflate compression +algorithm, and ``QPL`` can support the header and tail of ``ZLIB``, ``QPL`` +is still not fully compatible with the ``ZLIB`` compression in the migration. + +``QPL`` only supports 4K history buffer, and ``ZLIB`` is 32K by default. +``ZLIB`` compresses data that ``QPL`` may not decompress correctly and +vice versa. + +``QPL`` does not support the ``Z_SYNC_FLUSH`` operation in ``ZLIB`` streaming +compression, current ``ZLIB`` implementation uses ``Z_SYNC_FLUSH``, so each +``multifd`` thread has a ``ZLIB`` streaming context, and all page compression +and decompression are based on this stream. ``QPL`` cannot decompress such data +and vice versa. + +The introduction for ``Z_SYNC_FLUSH``, please refer to `Zlib Manual +`_ + +The Best Practices +================== +When user enables the IAA device for ``QPL`` compression, it is recommended +to add ``-mem-prealloc`` parameter to the destination boot parameters. This +parameter can avoid the occurrence of I/O page fault and reduce the overhead +of IAA compression and decompression. + +The example of booting with ``-mem-prealloc`` parameter + +.. code-block:: shell + + $qemu-system-x86_64 --enable-kvm -cpu host --mem-prealloc ... + + +An example about I/O page fault measurement of destination without +``-mem-prealloc``, the ``svm_prq`` indicates the number of I/O page fault +occurrences and processing time. + +.. code-block:: shell + + #echo 1 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 2 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 3 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #echo 4 > /sys/kernel/debug/iommu/intel/dmar_perf_latency + #cat /sys/kernel/debug/iommu/intel/dmar_perf_latency + IOMMU: dmar18 Register Base Address: c87fc000 + <0.1us 0.1us-1us 1us-10us 10us-100us 100us-1ms 1ms-10ms >=10ms min(us) max(us) average(us) + inv_iotlb 0 286 123 0 0 0 0 0 1 0 + inv_devtlb 0 276 133 0 0 0 0 0 2 0 + inv_iec 0 0 0 0 0 0 0 0 0 0 + svm_prq 0 0 25206 364 395 0 0 1 556 9 -- Gitee From 28b1cc22b9518823f69ded8baa9bf11bb0412125 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:05 +0800 Subject: [PATCH 70/90] migration/multifd: put IOV initialization into compression method commit d9d3e4f243214f742425d9d8360f0794bb05c999 upstream. Different compression methods may require different numbers of IOVs. Based on streaming compression of zlib and zstd, all pages will be compressed to a data block, so two IOVs are needed for packet header and compressed data block. Intel-SIG: commit d9d3e4f24321 migration/multifd: put IOV initialization into compression method Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-zlib.c | 7 +++++++ migration/multifd-zstd.c | 8 +++++++- migration/multifd.c | 22 ++++++++++++---------- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c index b210725f6e7..2df49837802 100644 --- a/migration/multifd-zlib.c +++ b/migration/multifd-zlib.c @@ -70,6 +70,10 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp) goto err_free_zbuff; } p->compress_data = z; + + /* Needs 2 IOVs, one for packet header and one for compressed data */ + p->iov = g_new0(struct iovec, 2); + return 0; err_free_zbuff: @@ -101,6 +105,9 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp) z->buf = NULL; g_free(p->compress_data); p->compress_data = NULL; + + g_free(p->iov); + p->iov = NULL; } /** diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index 256858df0a0..ca17b7e310f 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -52,7 +52,6 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) struct zstd_data *z = g_new0(struct zstd_data, 1); int res; - p->compress_data = z; z->zcs = ZSTD_createCStream(); if (!z->zcs) { g_free(z); @@ -77,6 +76,10 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp) error_setg(errp, "multifd %u: out of memory for zbuff", p->id); return -1; } + p->compress_data = z; + + /* Needs 2 IOVs, one for packet header and one for compressed data */ + p->iov = g_new0(struct iovec, 2); return 0; } @@ -98,6 +101,9 @@ static void zstd_send_cleanup(MultiFDSendParams *p, Error **errp) z->zbuff = NULL; g_free(p->compress_data); p->compress_data = NULL; + + g_free(p->iov); + p->iov = NULL; } /** diff --git a/migration/multifd.c b/migration/multifd.c index fbab8c5b720..eae70400399 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -112,6 +112,13 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) p->write_flags |= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY; } + if (multifd_use_packets()) { + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, p->page_count + 1); + } else { + p->iov = g_new0(struct iovec, p->page_count); + } + return 0; } @@ -125,6 +132,8 @@ static int nocomp_send_setup(MultiFDSendParams *p, Error **errp) */ static void nocomp_send_cleanup(MultiFDSendParams *p, Error **errp) { + g_free(p->iov); + p->iov = NULL; return; } @@ -201,6 +210,7 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp) */ static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) { + p->iov = g_new0(struct iovec, p->page_count); return 0; } @@ -213,6 +223,8 @@ static int nocomp_recv_setup(MultiFDRecvParams *p, Error **errp) */ static void nocomp_recv_cleanup(MultiFDRecvParams *p) { + g_free(p->iov); + p->iov = NULL; } /** @@ -733,8 +745,6 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams *p, Error **errp) p->packet_len = 0; g_free(p->packet); p->packet = NULL; - g_free(p->iov); - p->iov = NULL; multifd_send_state->ops->send_cleanup(p, errp); return *errp == NULL; @@ -1116,11 +1126,6 @@ bool multifd_send_setup(void) p->packet = g_malloc0(p->packet_len); p->packet->magic = cpu_to_be32(MULTIFD_MAGIC); p->packet->version = cpu_to_be32(MULTIFD_VERSION); - - /* We need one extra place for the packet header */ - p->iov = g_new0(struct iovec, page_count + 1); - } else { - p->iov = g_new0(struct iovec, page_count); } p->name = g_strdup_printf("multifdsend_%d", i); p->page_size = qemu_target_page_size(); @@ -1220,8 +1225,6 @@ static void multifd_recv_cleanup_channel(MultiFDRecvParams *p) p->packet_len = 0; g_free(p->packet); p->packet = NULL; - g_free(p->iov); - p->iov = NULL; g_free(p->normal); p->normal = NULL; g_free(p->zero); @@ -1399,7 +1402,6 @@ int multifd_recv_setup(Error **errp) p->packet = g_malloc0(p->packet_len); } p->name = g_strdup_printf("multifdrecv_%d", i); - p->iov = g_new0(struct iovec, page_count); p->normal = g_new0(ram_addr_t, page_count); p->zero = g_new0(ram_addr_t, page_count); p->page_count = page_count; -- Gitee From 3bb89f82818476ee9d8cb5e5c36608145ffc7201 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:06 +0800 Subject: [PATCH 71/90] configure: add --enable-qpl build option commit b844a2c7cc7f7c7756a27d372e64f6688d67c4eb upstream. add --enable-qpl and --disable-qpl options to enable and disable the QPL compression method for multifd migration. The Query Processing Library (QPL) is an open-source library that supports data compression and decompression features. It is based on the deflate compression algorithm and use Intel In-Memory Analytics Accelerator(IAA) hardware for compression and decompression acceleration. For more live migration with IAA, please refer to the document docs/devel/migration/qpl-compression.rst Intel-SIG: commit b844a2c7cc7f configure: add --enable-qpl build option Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- meson.build | 8 ++++++++ meson_options.txt | 2 ++ scripts/meson-buildoptions.sh | 3 +++ 3 files changed, 13 insertions(+) diff --git a/meson.build b/meson.build index d92384c23a4..c833cd2d47a 100644 --- a/meson.build +++ b/meson.build @@ -1051,6 +1051,12 @@ if not get_option('zstd').auto() or have_block required: get_option('zstd'), method: 'pkg-config') endif +qpl = not_found +if not get_option('qpl').auto() or have_system + qpl = dependency('qpl', version: '>=1.5.0', + required: get_option('qpl'), + method: 'pkg-config') +endif virgl = not_found have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() @@ -2218,6 +2224,7 @@ config_host_data.set('CONFIG_MALLOC_TRIM', has_malloc_trim) config_host_data.set('CONFIG_STATX', has_statx) config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) config_host_data.set('CONFIG_ZSTD', zstd.found()) +config_host_data.set('CONFIG_QPL', qpl.found()) config_host_data.set('CONFIG_FUSE', fuse.found()) config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) @@ -4393,6 +4400,7 @@ summary_info += {'snappy support': snappy} summary_info += {'bzip2 support': libbzip2} summary_info += {'lzfse support': liblzfse} summary_info += {'zstd support': zstd} +summary_info += {'Query Processing Library support': qpl} summary_info += {'NUMA host support': numa} summary_info += {'capstone': capstone} summary_info += {'libpmem support': libpmem} diff --git a/meson_options.txt b/meson_options.txt index c9baeda6395..e16aa8c823b 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -259,6 +259,8 @@ option('xkbcommon', type : 'feature', value : 'auto', description: 'xkbcommon support') option('zstd', type : 'feature', value : 'auto', description: 'zstd compression support') +option('qpl', type : 'feature', value : 'auto', + description: 'Query Processing Library support') option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 680fa3f581d..784f74fde90 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -222,6 +222,7 @@ meson_options_help() { printf "%s\n" ' Xen PCI passthrough support' printf "%s\n" ' xkbcommon xkbcommon support' printf "%s\n" ' zstd zstd compression support' + printf "%s\n" ' qpl Query Processing Library support' } _meson_option_parse() { case $1 in @@ -562,6 +563,8 @@ _meson_option_parse() { --disable-xkbcommon) printf "%s" -Dxkbcommon=disabled ;; --enable-zstd) printf "%s" -Dzstd=enabled ;; --disable-zstd) printf "%s" -Dzstd=disabled ;; + --enable-qpl) printf "%s" -Dqpl=enabled ;; + --disable-qpl) printf "%s" -Dqpl=disabled ;; *) return 1 ;; esac } -- Gitee From 9748d06f1636c348787f29bf68886dd781190e1c Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:07 +0800 Subject: [PATCH 72/90] migration/multifd: add qpl compression method commit 354cac2859e48ec5f7ee72a2a071da6c60a462d0 upstream. add the Query Processing Library (QPL) compression method Introduce the qpl as a new multifd migration compression method, it can use In-Memory Analytics Accelerator(IAA) to accelerate compression and decompression, which can not only reduce network bandwidth requirement but also reduce host compression and decompression CPU overhead. How to enable qpl compression during migration: migrate_set_parameter multifd-compression qpl There is no qpl compression level parameter added since it only supports level one, users do not need to specify the qpl compression level. Intel-SIG: commit 354cac2859e4 migration/multifd: add qpl compression method Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Peter Xu Reviewed-by: Fabiano Rosas [fixed docs spacing in migration.json] Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-qpl.c | 20 ++++++++++++++++++++ migration/multifd.h | 1 + qapi/migration.json | 8 +++++++- 5 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 migration/multifd-qpl.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 67d40c526c5..523b3a9bae4 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -666,7 +666,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { const PropertyInfo qdev_prop_multifd_compression = { .name = "MultiFDCompression", .description = "multifd_compression values, " - "none/zlib/zstd", + "none/zlib/zstd/qpl", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index 1eeb915ff63..cb177de1d2f 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -41,6 +41,7 @@ if get_option('live_block_migration').allowed() system_ss.add(files('block.c')) endif system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) +system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c new file mode 100644 index 00000000000..056a68a060a --- /dev/null +++ b/migration/multifd-qpl.c @@ -0,0 +1,20 @@ +/* + * Multifd qpl compression accelerator implementation + * + * Copyright (c) 2023 Intel Corporation + * + * Authors: + * Yuan Liu + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "qemu/module.h" + +static void multifd_qpl_register(void) +{ + /* noop */ +} + +migration_init(multifd_qpl_register); diff --git a/migration/multifd.h b/migration/multifd.h index d99603c6a4a..11f05dd6d55 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -33,6 +33,7 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) +#define MULTIFD_FLAG_QPL (4 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) diff --git a/qapi/migration.json b/qapi/migration.json index e7249f24f84..720988007dc 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -625,11 +625,17 @@ # # @zstd: use zstd compression method. # +# @qpl: use qpl compression method. Query Processing Library(qpl) is +# based on the deflate compression algorithm and use the Intel +# In-Memory Analytics Accelerator(IAA) accelerated compression +# and decompression. (Since 9.1) +# # Since: 5.0 ## { 'enum': 'MultiFDCompression', 'data': [ 'none', 'zlib', - { 'name': 'zstd', 'if': 'CONFIG_ZSTD' } ] } + { 'name': 'zstd', 'if': 'CONFIG_ZSTD' }, + { 'name': 'qpl', 'if': 'CONFIG_QPL' } ] } ## # @MigMode: -- Gitee From bef363809a5464b9de660b8438c966dbf73311b0 Mon Sep 17 00:00:00 2001 From: Jason Zeng Date: Wed, 2 Apr 2025 18:09:21 +0800 Subject: [PATCH 73/90] migration/multifd: include ram.h in multifd.h Header file ram.h was included by multifd.h when mapped-ram was introduced in upstream code. This inclusion is needed by qpl when multifd-qpl.c includes multifd.h. Since we don't backport mapped-ram, add this inclusion separately. Intel-SIG: commit - migration/multifd: include ram.h in multifd.h Signed-off-by: Jason Zeng --- migration/multifd.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/migration/multifd.h b/migration/multifd.h index 11f05dd6d55..41965df7a94 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -13,6 +13,8 @@ #ifndef QEMU_MIGRATION_MULTIFD_H #define QEMU_MIGRATION_MULTIFD_H +#include "ram.h" + bool multifd_send_setup(void); void multifd_send_shutdown(void); int multifd_recv_setup(Error **errp); -- Gitee From 478f52635c850c181819cb8c03ab38a2fcab0854 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:08 +0800 Subject: [PATCH 74/90] migration/multifd: implement initialization of qpl compression commit 34e104b897da6e144a5f34e7c5eebf8a4c4d9d59 upstream. during initialization, a software job is allocated to each channel for software path fallabck when the IAA hardware is unavailable or the hardware job submission fails. If the IAA hardware is available, multiple hardware jobs are allocated for batch processing. Intel-SIG: commit 34e104b897da migration/multifd: implement initialization of qpl compression Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-qpl.c | 328 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 327 insertions(+), 1 deletion(-) diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 056a68a060a..6791a204d55 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -9,12 +9,338 @@ * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. */ + #include "qemu/osdep.h" #include "qemu/module.h" +#include "qapi/error.h" +#include "multifd.h" +#include "qpl/qpl.h" + +typedef struct { + /* the QPL hardware path job */ + qpl_job *job; + /* indicates if fallback to software path is required */ + bool fallback_sw_path; + /* output data from the software path */ + uint8_t *sw_output; + /* output data length from the software path */ + uint32_t sw_output_len; +} QplHwJob; + +typedef struct { + /* array of hardware jobs, the number of jobs equals the number pages */ + QplHwJob *hw_jobs; + /* the QPL software job for the slow path and software fallback */ + qpl_job *sw_job; + /* the number of pages that the QPL needs to process at one time */ + uint32_t page_num; + /* array of compressed page buffers */ + uint8_t *zbuf; + /* array of compressed page lengths */ + uint32_t *zlen; + /* the status of the hardware device */ + bool hw_avail; +} QplData; + +/** + * check_hw_avail: check if IAA hardware is available + * + * If the IAA hardware does not exist or is unavailable, + * the QPL hardware job initialization will fail. + * + * Returns true if IAA hardware is available, otherwise false. + * + * @job_size: indicates the hardware job size if hardware is available + */ +static bool check_hw_avail(uint32_t *job_size) +{ + qpl_path_t path = qpl_path_hardware; + uint32_t size = 0; + qpl_job *job; + + if (qpl_get_job_size(path, &size) != QPL_STS_OK) { + return false; + } + assert(size > 0); + job = g_malloc0(size); + if (qpl_init_job(path, job) != QPL_STS_OK) { + g_free(job); + return false; + } + g_free(job); + *job_size = size; + return true; +} + +/** + * multifd_qpl_free_sw_job: clean up software job + * + * Free the software job resources. + * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_free_sw_job(QplData *qpl) +{ + assert(qpl); + if (qpl->sw_job) { + qpl_fini_job(qpl->sw_job); + g_free(qpl->sw_job); + qpl->sw_job = NULL; + } +} + +/** + * multifd_qpl_free_jobs: clean up hardware jobs + * + * Free all hardware job resources. + * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_free_hw_job(QplData *qpl) +{ + assert(qpl); + if (qpl->hw_jobs) { + for (int i = 0; i < qpl->page_num; i++) { + qpl_fini_job(qpl->hw_jobs[i].job); + g_free(qpl->hw_jobs[i].job); + qpl->hw_jobs[i].job = NULL; + } + g_free(qpl->hw_jobs); + qpl->hw_jobs = NULL; + } +} + +/** + * multifd_qpl_init_sw_job: initialize a software job + * + * Use the QPL software path to initialize a job + * + * @qpl: pointer to the QplData structure + * @errp: pointer to an error + */ +static int multifd_qpl_init_sw_job(QplData *qpl, Error **errp) +{ + qpl_path_t path = qpl_path_software; + uint32_t size = 0; + qpl_job *job = NULL; + qpl_status status; + + status = qpl_get_job_size(path, &size); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl_get_job_size failed with error %d", status); + return -1; + } + job = g_malloc0(size); + status = qpl_init_job(path, job); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl_init_job failed with error %d", status); + g_free(job); + return -1; + } + qpl->sw_job = job; + return 0; +} + +/** + * multifd_qpl_init_jobs: initialize hardware jobs + * + * Use the QPL hardware path to initialize jobs + * + * @qpl: pointer to the QplData structure + * @size: the size of QPL hardware path job + * @errp: pointer to an error + */ +static void multifd_qpl_init_hw_job(QplData *qpl, uint32_t size, Error **errp) +{ + qpl_path_t path = qpl_path_hardware; + qpl_job *job = NULL; + qpl_status status; + + qpl->hw_jobs = g_new0(QplHwJob, qpl->page_num); + for (int i = 0; i < qpl->page_num; i++) { + job = g_malloc0(size); + status = qpl_init_job(path, job); + /* the job initialization should succeed after check_hw_avail */ + assert(status == QPL_STS_OK); + qpl->hw_jobs[i].job = job; + } +} + +/** + * multifd_qpl_init: initialize QplData structure + * + * Allocate and initialize a QplData structure + * + * Returns a QplData pointer on success or NULL on error + * + * @num: the number of pages + * @size: the page size + * @errp: pointer to an error + */ +static QplData *multifd_qpl_init(uint32_t num, uint32_t size, Error **errp) +{ + uint32_t job_size = 0; + QplData *qpl; + + qpl = g_new0(QplData, 1); + qpl->page_num = num; + if (multifd_qpl_init_sw_job(qpl, errp) != 0) { + g_free(qpl); + return NULL; + } + qpl->hw_avail = check_hw_avail(&job_size); + if (qpl->hw_avail) { + multifd_qpl_init_hw_job(qpl, job_size, errp); + } + qpl->zbuf = g_malloc0(size * num); + qpl->zlen = g_new0(uint32_t, num); + return qpl; +} + +/** + * multifd_qpl_deinit: clean up QplData structure + * + * Free jobs, buffers and the QplData structure + * + * @qpl: pointer to the QplData structure + */ +static void multifd_qpl_deinit(QplData *qpl) +{ + if (qpl) { + multifd_qpl_free_sw_job(qpl); + multifd_qpl_free_hw_job(qpl); + g_free(qpl->zbuf); + g_free(qpl->zlen); + g_free(qpl); + } +} + +/** + * multifd_qpl_send_setup: set up send side + * + * Set up the channel with QPL compression. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_send_setup(MultiFDSendParams *p, Error **errp) +{ + QplData *qpl; + + qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + if (!qpl) { + return -1; + } + p->compress_data = qpl; + + /* + * the page will be compressed independently and sent using an IOV. The + * additional two IOVs are used to store packet header and compressed data + * length + */ + p->iov = g_new0(struct iovec, p->page_count + 2); + return 0; +} + +/** + * multifd_qpl_send_cleanup: clean up send side + * + * Close the channel and free memory. + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + multifd_qpl_deinit(p->compress_data); + p->compress_data = NULL; + g_free(p->iov); + p->iov = NULL; +} + +/** + * multifd_qpl_send_prepare: prepare data to be able to send + * + * Create a compressed buffer with all the pages that we are going to + * send. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) +{ + /* Implement in next patch */ + return -1; +} + +/** + * multifd_qpl_recv_setup: set up receive side + * + * Create the compressed channel and buffer. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl; + + qpl = multifd_qpl_init(p->page_count, p->page_size, errp); + if (!qpl) { + return -1; + } + p->compress_data = qpl; + return 0; +} + +/** + * multifd_qpl_recv_cleanup: set up receive side + * + * Close the channel and free memory. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) +{ + multifd_qpl_deinit(p->compress_data); + p->compress_data = NULL; +} + +/** + * multifd_qpl_recv: read the data from the channel into actual pages + * + * Read the compressed buffer, and uncompress it into the actual + * pages. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) +{ + /* Implement in next patch */ + return -1; +} + +static MultiFDMethods multifd_qpl_ops = { + .send_setup = multifd_qpl_send_setup, + .send_cleanup = multifd_qpl_send_cleanup, + .send_prepare = multifd_qpl_send_prepare, + .recv_setup = multifd_qpl_recv_setup, + .recv_cleanup = multifd_qpl_recv_cleanup, + .recv = multifd_qpl_recv, +}; static void multifd_qpl_register(void) { - /* noop */ + multifd_register_ops(MULTIFD_COMPRESSION_QPL, &multifd_qpl_ops); } migration_init(multifd_qpl_register); -- Gitee From c072955a8f8dbf41bcd4be0cf956a8a13255fd20 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:09 +0800 Subject: [PATCH 75/90] migration/multifd: implement qpl compression and decompression commit f6fe9fea995249ecc2cd72975d803fbf4d512c02 upstream. QPL compression and decompression will use IAA hardware path if the IAA hardware is available. Otherwise the QPL library software path is used. The hardware path will automatically fall back to QPL software path if the IAA queues are busy. In some scenarios, this may happen frequently, such as configuring 4 channels but only one IAA device is available. In the case of insufficient IAA hardware resources, retry and fallback can help optimize performance: 1. Retry + SW fallback: total time: 14649 ms downtime: 25 ms throughput: 17666.57 mbps pages-per-second: 1509647 2. No fallback, always wait for work queues to become available total time: 18381 ms downtime: 25 ms throughput: 13698.65 mbps pages-per-second: 859607 If both the hardware and software paths fail, the uncompressed page is sent directly. Intel-SIG: commit f6fe9fea9952 migration/multifd: implement qpl compression and decompression Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-qpl.c | 424 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 420 insertions(+), 4 deletions(-) diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 6791a204d55..9265098ee79 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -13,9 +13,14 @@ #include "qemu/osdep.h" #include "qemu/module.h" #include "qapi/error.h" +#include "qapi/qapi-types-migration.h" +#include "exec/ramblock.h" #include "multifd.h" #include "qpl/qpl.h" +/* Maximum number of retries to resubmit a job if IAA work queues are full */ +#define MAX_SUBMIT_RETRY_NUM (3) + typedef struct { /* the QPL hardware path job */ qpl_job *job; @@ -260,6 +265,225 @@ static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) p->iov = NULL; } +/** + * multifd_qpl_prepare_job: prepare the job + * + * Set the QPL job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @is_compression: indicates compression and decompression + * @input: pointer to the input data buffer + * @input_len: the length of the input data + * @output: pointer to the output data buffer + * @output_len: the length of the output data + */ +static void multifd_qpl_prepare_job(qpl_job *job, bool is_compression, + uint8_t *input, uint32_t input_len, + uint8_t *output, uint32_t output_len) +{ + job->op = is_compression ? qpl_op_compress : qpl_op_decompress; + job->next_in_ptr = input; + job->next_out_ptr = output; + job->available_in = input_len; + job->available_out = output_len; + job->flags = QPL_FLAG_FIRST | QPL_FLAG_LAST | QPL_FLAG_OMIT_VERIFY; + /* only supports compression level 1 */ + job->level = 1; +} + +/** + * multifd_qpl_prepare_comp_job: prepare the compression job + * + * Set the compression job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @input: pointer to the input data buffer + * @output: pointer to the output data buffer + * @size: the page size + */ +static void multifd_qpl_prepare_comp_job(qpl_job *job, uint8_t *input, + uint8_t *output, uint32_t size) +{ + /* + * Set output length to less than the page size to force the job to + * fail in case it compresses to a larger size. We'll send that page + * without compression and skip the decompression operation on the + * destination. + */ + multifd_qpl_prepare_job(job, true, input, size, output, size - 1); +} + +/** + * multifd_qpl_prepare_decomp_job: prepare the decompression job + * + * Set the decompression job parameters and properties. + * + * @job: pointer to the qpl_job structure + * @input: pointer to the input data buffer + * @len: the length of the input data + * @output: pointer to the output data buffer + * @size: the page size + */ +static void multifd_qpl_prepare_decomp_job(qpl_job *job, uint8_t *input, + uint32_t len, uint8_t *output, + uint32_t size) +{ + multifd_qpl_prepare_job(job, false, input, len, output, size); +} + +/** + * multifd_qpl_fill_iov: fill in the IOV + * + * Fill in the QPL packet IOV + * + * @p: Params for the channel being used + * @data: pointer to the IOV data + * @len: The length of the IOV data + */ +static void multifd_qpl_fill_iov(MultiFDSendParams *p, uint8_t *data, + uint32_t len) +{ + p->iov[p->iovs_num].iov_base = data; + p->iov[p->iovs_num].iov_len = len; + p->iovs_num++; + p->next_packet_size += len; +} + +/** + * multifd_qpl_fill_packet: fill the compressed page into the QPL packet + * + * Fill the compressed page length and IOV into the QPL packet + * + * @idx: The index of the compressed length array + * @p: Params for the channel being used + * @data: pointer to the compressed page buffer + * @len: The length of the compressed page + */ +static void multifd_qpl_fill_packet(uint32_t idx, MultiFDSendParams *p, + uint8_t *data, uint32_t len) +{ + QplData *qpl = p->compress_data; + + qpl->zlen[idx] = cpu_to_be32(len); + multifd_qpl_fill_iov(p, data, len); +} + +/** + * multifd_qpl_submit_job: submit a job to the hardware + * + * Submit a QPL hardware job to the IAA device + * + * Returns true if the job is submitted successfully, otherwise false. + * + * @job: pointer to the qpl_job structure + */ +static bool multifd_qpl_submit_job(qpl_job *job) +{ + qpl_status status; + uint32_t num = 0; + +retry: + status = qpl_submit_job(job); + if (status == QPL_STS_QUEUES_ARE_BUSY_ERR) { + if (num < MAX_SUBMIT_RETRY_NUM) { + num++; + goto retry; + } + } + return (status == QPL_STS_OK); +} + +/** + * multifd_qpl_compress_pages_slow_path: compress pages using slow path + * + * Compress the pages using software. If compression fails, the uncompressed + * page will be sent. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_compress_pages_slow_path(MultiFDSendParams *p) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + qpl_job *job = qpl->sw_job; + uint8_t *zbuf = qpl->zbuf; + uint8_t *buf; + + for (int i = 0; i < p->pages->normal_num; i++) { + buf = p->pages->block->host + p->pages->offset[i]; + multifd_qpl_prepare_comp_job(job, buf, zbuf, size); + if (qpl_execute_job(job) == QPL_STS_OK) { + multifd_qpl_fill_packet(i, p, zbuf, job->total_out); + } else { + /* send the uncompressed page */ + multifd_qpl_fill_packet(i, p, buf, size); + } + zbuf += size; + } +} + +/** + * multifd_qpl_compress_pages: compress pages + * + * Submit the pages to the IAA hardware for compression. If hardware + * compression fails, it falls back to software compression. If software + * compression also fails, the uncompressed page is sent. + * + * @p: Params for the channel being used + */ +static void multifd_qpl_compress_pages(MultiFDSendParams *p) +{ + QplData *qpl = p->compress_data; + MultiFDPages_t *pages = p->pages; + uint32_t size = p->page_size; + QplHwJob *hw_job; + uint8_t *buf; + uint8_t *zbuf; + + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; + zbuf = qpl->zbuf + (size * i); + hw_job = &qpl->hw_jobs[i]; + multifd_qpl_prepare_comp_job(hw_job->job, buf, zbuf, size); + if (multifd_qpl_submit_job(hw_job->job)) { + hw_job->fallback_sw_path = false; + } else { + /* + * The IAA work queue is full, any immediate subsequent job + * submission is likely to fail, sending the page via the QPL + * software path at this point gives us a better chance of + * finding the queue open for the next pages. + */ + hw_job->fallback_sw_path = true; + multifd_qpl_prepare_comp_job(qpl->sw_job, buf, zbuf, size); + if (qpl_execute_job(qpl->sw_job) == QPL_STS_OK) { + hw_job->sw_output = zbuf; + hw_job->sw_output_len = qpl->sw_job->total_out; + } else { + hw_job->sw_output = buf; + hw_job->sw_output_len = size; + } + } + } + + for (int i = 0; i < pages->normal_num; i++) { + buf = pages->block->host + pages->offset[i]; + zbuf = qpl->zbuf + (size * i); + hw_job = &qpl->hw_jobs[i]; + if (hw_job->fallback_sw_path) { + multifd_qpl_fill_packet(i, p, hw_job->sw_output, + hw_job->sw_output_len); + continue; + } + if (qpl_wait_job(hw_job->job) == QPL_STS_OK) { + multifd_qpl_fill_packet(i, p, zbuf, hw_job->job->total_out); + } else { + /* send the uncompressed page */ + multifd_qpl_fill_packet(i, p, buf, size); + } + } +} + /** * multifd_qpl_send_prepare: prepare data to be able to send * @@ -273,8 +497,26 @@ static void multifd_qpl_send_cleanup(MultiFDSendParams *p, Error **errp) */ static int multifd_qpl_send_prepare(MultiFDSendParams *p, Error **errp) { - /* Implement in next patch */ - return -1; + QplData *qpl = p->compress_data; + uint32_t len = 0; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* The first IOV is used to store the compressed page lengths */ + len = p->pages->normal_num * sizeof(uint32_t); + multifd_qpl_fill_iov(p, (uint8_t *) qpl->zlen, len); + if (qpl->hw_avail) { + multifd_qpl_compress_pages(p); + } else { + multifd_qpl_compress_pages_slow_path(p); + } + +out: + p->flags |= MULTIFD_FLAG_QPL; + multifd_send_fill_packet(p); + return 0; } /** @@ -312,6 +554,140 @@ static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) p->compress_data = NULL; } +/** + * multifd_qpl_process_and_check_job: process and check a QPL job + * + * Process the job and check whether the job output length is the + * same as the specified length + * + * Returns true if the job execution succeeded and the output length + * is equal to the specified length, otherwise false. + * + * @job: pointer to the qpl_job structure + * @is_hardware: indicates whether the job is a hardware job + * @len: Specified output length + * @errp: pointer to an error + */ +static bool multifd_qpl_process_and_check_job(qpl_job *job, bool is_hardware, + uint32_t len, Error **errp) +{ + qpl_status status; + + status = (is_hardware ? qpl_wait_job(job) : qpl_execute_job(job)); + if (status != QPL_STS_OK) { + error_setg(errp, "qpl job failed with error %d", status); + return false; + } + if (job->total_out != len) { + error_setg(errp, "qpl decompressed len %u, expected len %u", + job->total_out, len); + return false; + } + return true; +} + +/** + * multifd_qpl_decompress_pages_slow_path: decompress pages using slow path + * + * Decompress the pages using software + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_decompress_pages_slow_path(MultiFDRecvParams *p, + Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + qpl_job *job = qpl->sw_job; + uint8_t *zbuf = qpl->zbuf; + uint8_t *addr; + uint32_t len; + + for (int i = 0; i < p->normal_num; i++) { + len = qpl->zlen[i]; + addr = p->host + p->normal[i]; + /* the page is uncompressed, load it */ + if (len == size) { + memcpy(addr, zbuf, size); + zbuf += size; + continue; + } + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { + return -1; + } + zbuf += len; + } + return 0; +} + +/** + * multifd_qpl_decompress_pages: decompress pages + * + * Decompress the pages using the IAA hardware. If hardware + * decompression fails, it falls back to software decompression. + * + * Returns 0 on success or -1 on error + * + * @p: Params for the channel being used + * @errp: pointer to an error + */ +static int multifd_qpl_decompress_pages(MultiFDRecvParams *p, Error **errp) +{ + QplData *qpl = p->compress_data; + uint32_t size = p->page_size; + uint8_t *zbuf = qpl->zbuf; + uint8_t *addr; + uint32_t len; + qpl_job *job; + + for (int i = 0; i < p->normal_num; i++) { + addr = p->host + p->normal[i]; + len = qpl->zlen[i]; + /* the page is uncompressed if received length equals the page size */ + if (len == size) { + memcpy(addr, zbuf, size); + zbuf += size; + continue; + } + + job = qpl->hw_jobs[i].job; + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (multifd_qpl_submit_job(job)) { + qpl->hw_jobs[i].fallback_sw_path = false; + } else { + /* + * The IAA work queue is full, any immediate subsequent job + * submission is likely to fail, sending the page via the QPL + * software path at this point gives us a better chance of + * finding the queue open for the next pages. + */ + qpl->hw_jobs[i].fallback_sw_path = true; + job = qpl->sw_job; + multifd_qpl_prepare_decomp_job(job, zbuf, len, addr, size); + if (!multifd_qpl_process_and_check_job(job, false, size, errp)) { + return -1; + } + } + zbuf += len; + } + + for (int i = 0; i < p->normal_num; i++) { + /* ignore pages that have already been processed */ + if (qpl->zlen[i] == size || qpl->hw_jobs[i].fallback_sw_path) { + continue; + } + + job = qpl->hw_jobs[i].job; + if (!multifd_qpl_process_and_check_job(job, true, size, errp)) { + return -1; + } + } + return 0; +} /** * multifd_qpl_recv: read the data from the channel into actual pages * @@ -325,8 +701,48 @@ static void multifd_qpl_recv_cleanup(MultiFDRecvParams *p) */ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) { - /* Implement in next patch */ - return -1; + QplData *qpl = p->compress_data; + uint32_t in_size = p->next_packet_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + uint32_t len = 0; + uint32_t zbuf_len = 0; + int ret; + + if (flags != MULTIFD_FLAG_QPL) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QPL); + return -1; + } + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + /* read compressed page lengths */ + len = p->normal_num * sizeof(uint32_t); + assert(len < in_size); + ret = qio_channel_read_all(p->c, (void *) qpl->zlen, len, errp); + if (ret != 0) { + return ret; + } + for (int i = 0; i < p->normal_num; i++) { + qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); + assert(qpl->zlen[i] <= p->page_size); + zbuf_len += qpl->zlen[i]; + } + + /* read compressed pages */ + assert(in_size == len + zbuf_len); + ret = qio_channel_read_all(p->c, (void *) qpl->zbuf, zbuf_len, errp); + if (ret != 0) { + return ret; + } + + if (qpl->hw_avail) { + return multifd_qpl_decompress_pages(p, errp); + } + return multifd_qpl_decompress_pages_slow_path(p, errp); } static MultiFDMethods multifd_qpl_ops = { -- Gitee From 6970d3b0616b21f7afacdb17cad67354875c25d2 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Mon, 10 Jun 2024 18:21:10 +0800 Subject: [PATCH 76/90] tests/migration-test: add qpl compression test commit 08b82d207d138173ddd334c91b387213508a6e13 upstream. add qpl to compression method test for multifd migration the qpl compression supports software path and hardware path(IAA device), and the hardware path is used first by default. If the hardware path is unavailable, it will automatically fallback to the software path for testing. Intel-SIG: commit 08b82d207d13 tests/migration-test: add qpl compression test Signed-off-by: Yuan Liu Reviewed-by: Nanhai Zou Reviewed-by: Peter Xu Reviewed-by: Fabiano Rosas Signed-off-by: Fabiano Rosas Conflicts: tests/qtest/migration-test.c [jz: resolve simple context conflict] Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 4bdf397828d..3ccf4f7ecf2 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2572,6 +2572,15 @@ test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, } #endif /* CONFIG_ZSTD */ +#ifdef CONFIG_QPL +static void * +test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, + QTestState *to) +{ + return test_migrate_precopy_tcp_multifd_start_common(from, to, "qpl"); +} +#endif /* CONFIG_QPL */ + static void test_multifd_tcp_none(void) { MigrateCommon args = { @@ -2607,6 +2616,17 @@ static void test_multifd_tcp_zstd(void) } #endif +#ifdef CONFIG_QPL +static void test_multifd_tcp_qpl(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_qpl_start, + }; + test_precopy_common(&args); +} +#endif + #ifdef CONFIG_GNUTLS static void * test_migrate_multifd_tcp_tls_psk_start_match(QTestState *from, @@ -3483,6 +3503,10 @@ int main(int argc, char **argv) migration_test_add("/migration/multifd/tcp/plain/zstd", test_multifd_tcp_zstd); #endif +#ifdef CONFIG_QPL + migration_test_add("/migration/multifd/tcp/plain/qpl", + test_multifd_tcp_qpl); +#endif #ifdef CONFIG_GNUTLS migration_test_add("/migration/multifd/tcp/tls/psk/match", test_multifd_tcp_tls_psk_match); -- Gitee From 0ed3c717c0c1efb1d2c2397958c53422a173e740 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 1 Mar 2024 03:59:00 +0000 Subject: [PATCH 77/90] migration: Properly apply migration compression level parameters commit b4014a2bf57ce08e2f6458cd82e9f968facf25c8 upstream. Some glue code was missing, so that using `qmp_migrate_set_parameters` to set `multifd-zstd-level` or `multifd-zlib-level` did not work. This commit adds the glue code to fix that. Intel-SIG: commit b4014a2bf57c migration: Properly apply migration compression level parameters Signed-off-by: Bryan Zhang Link: https://lore.kernel.org/r/20240301035901.4006936-2-bryan.zhang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/options.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/migration/options.c b/migration/options.c index 38403bf7459..68ff81b8853 100644 --- a/migration/options.c +++ b/migration/options.c @@ -1333,6 +1333,12 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_zlib_level) { + dest->multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + dest->multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { dest->xbzrle_cache_size = params->xbzrle_cache_size; } @@ -1485,6 +1491,12 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_zlib_level) { + s->parameters.multifd_zlib_level = params->multifd_zlib_level; + } + if (params->has_multifd_zstd_level) { + s->parameters.multifd_zstd_level = params->multifd_zstd_level; + } if (params->has_xbzrle_cache_size) { s->parameters.xbzrle_cache_size = params->xbzrle_cache_size; xbzrle_cache_resize(params->xbzrle_cache_size, errp); -- Gitee From d09991c7096a2e02be2f2b18d39aa8309fcf50c1 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 1 Mar 2024 03:59:01 +0000 Subject: [PATCH 78/90] tests/migration: Set compression level in migration tests commit 2b571432314ab42da742fbb578f4174166ecd7f5 upstream. Adds calls to set compression level for `zstd` and `zlib` migration tests, just to make sure that the calls work. Intel-SIG: commit 2b571432314a tests/migration: Set compression level in migration tests Signed-off-by: Bryan Zhang Link: https://lore.kernel.org/r/20240301035901.4006936-3-bryan.zhang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 3ccf4f7ecf2..10a3f99d6c5 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2560,6 +2560,13 @@ static void * test_migrate_precopy_tcp_multifd_zlib_start(QTestState *from, QTestState *to) { + /* + * Overloading this test to also check that set_parameter does not error. + * This is also done in the tests for the other compression methods. + */ + migrate_set_parameter_int(from, "multifd-zlib-level", 2); + migrate_set_parameter_int(to, "multifd-zlib-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zlib"); } @@ -2568,6 +2575,9 @@ static void * test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, QTestState *to) { + migrate_set_parameter_int(from, "multifd-zstd-level", 2); + migrate_set_parameter_int(to, "multifd-zstd-level", 2); + return test_migrate_precopy_tcp_multifd_start_common(from, to, "zstd"); } #endif /* CONFIG_ZSTD */ -- Gitee From 72a68fd234f46f923b03f6b78185b8a9735be8c5 Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Fri, 30 Aug 2024 16:27:18 -0700 Subject: [PATCH 79/90] docs/migration: add qatzip compression feature commit 85da4cbe6e5eb6ba6f31c8b30ee4582625546da7 upstream. add Intel QATzip compression method introduction Intel-SIG: commit 85da4cbe6e5e docs/migration: add qatzip compression feature Reviewed-by: Nanhai Zou Reviewed-by: Peter Xu Reviewed-by: Fabiano Rosas Signed-off-by: Yuan Liu Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-2-yichen.wang@bytedance.com Signed-off-by: Peter Xu Conflicts: docs/devel/migration/features.rst [jz: resolve context conflict due to uadk which is not backported] Signed-off-by: Jason Zeng --- docs/devel/migration/features.rst | 1 + docs/devel/migration/qatzip-compression.rst | 165 ++++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 docs/devel/migration/qatzip-compression.rst diff --git a/docs/devel/migration/features.rst b/docs/devel/migration/features.rst index 9819393c12f..9ba25882ac9 100644 --- a/docs/devel/migration/features.rst +++ b/docs/devel/migration/features.rst @@ -11,3 +11,4 @@ Migration has plenty of features to support different use cases. vfio virtio qpl-compression + qatzip-compression diff --git a/docs/devel/migration/qatzip-compression.rst b/docs/devel/migration/qatzip-compression.rst new file mode 100644 index 00000000000..862b3831647 --- /dev/null +++ b/docs/devel/migration/qatzip-compression.rst @@ -0,0 +1,165 @@ +================== +QATzip Compression +================== +In scenarios with limited network bandwidth, the ``QATzip`` solution can help +users save a lot of host CPU resources by accelerating compression and +decompression through the Intel QuickAssist Technology(``QAT``) hardware. + + +The following test was conducted using 8 multifd channels and 10Gbps network +bandwidth. The results show that, compared to zstd, ``QATzip`` significantly +saves CPU resources on the sender and reduces migration time. Compared to the +uncompressed solution, ``QATzip`` greatly improves the dirty page processing +capability, indicated by the Pages per Second metric, and also reduces the +total migration time. + +:: + + VM Configuration: 16 vCPU and 64G memory + VM Workload: all vCPUs are idle and 54G memory is filled with Silesia data. + QAT Devices: 4 + |-----------|--------|---------|----------|----------|------|------| + |8 Channels |Total |down |throughput|pages per | send | recv | + | |time(ms)|time(ms) |(mbps) |second | cpu %| cpu% | + |-----------|--------|---------|----------|----------|------|------| + |qatzip | 16630| 28| 10467| 2940235| 160| 360| + |-----------|--------|---------|----------|----------|------|------| + |zstd | 20165| 24| 8579| 2391465| 810| 340| + |-----------|--------|---------|----------|----------|------|------| + |none | 46063| 40| 10848| 330240| 45| 85| + |-----------|--------|---------|----------|----------|------|------| + + +QATzip Compression Framework +============================ + +``QATzip`` is a user space library which builds on top of the Intel QuickAssist +Technology to provide extended accelerated compression and decompression +services. + +For more ``QATzip`` introduction, please refer to `QATzip Introduction +`_ + +:: + + +----------------+ + | MultiFd Thread | + +-------+--------+ + | + | compress/decompress + +-------+--------+ + | QATzip library | + +-------+--------+ + | + +-------+--------+ + | QAT library | + +-------+--------+ + | user space + --------+--------------------- + | kernel space + +------+-------+ + | QAT Driver | + +------+-------+ + | + +------+-------+ + | QAT Devices | + +--------------+ + + +QATzip Installation +------------------- + +The ``QATzip`` installation package has been integrated into some Linux +distributions and can be installed directly. For example, the Ubuntu Server +24.04 LTS system can be installed using below command + +.. code-block:: shell + + #apt search qatzip + libqatzip-dev/noble 1.2.0-0ubuntu3 amd64 + Intel QuickAssist user space library development files + + libqatzip3/noble 1.2.0-0ubuntu3 amd64 + Intel QuickAssist user space library + + qatzip/noble,now 1.2.0-0ubuntu3 amd64 [installed] + Compression user-space tool for Intel QuickAssist Technology + + #sudo apt install libqatzip-dev libqatzip3 qatzip + +If your system does not support the ``QATzip`` installation package, you can +use the source code to build and install, please refer to `QATzip source code installation +`_ + +QAT Hardware Deployment +----------------------- + +``QAT`` supports physical functions(PFs) and virtual functions(VFs) for +deployment, and users can configure ``QAT`` resources for migration according +to actual needs. For more details about ``QAT`` deployment, please refer to +`Intel QuickAssist Technology Documentation +`_ + +For more ``QAT`` hardware introduction, please refer to `intel-quick-assist-technology-overview +`_ + +How To Use QATzip Compression +============================= + +1 - Install ``QATzip`` library + +2 - Build ``QEMU`` with ``--enable-qatzip`` parameter + + E.g. configure --target-list=x86_64-softmmu --enable-kvm ``--enable-qatzip`` + +3 - Set ``migrate_set_parameter multifd-compression qatzip`` + +4 - Set ``migrate_set_parameter multifd-qatzip-level comp_level``, the default +comp_level value is 1, and it supports levels from 1 to 9 + +QAT Memory Requirements +======================= + +The user needs to reserve system memory for the QAT memory management to +allocate DMA memory. The size of the reserved system memory depends on the +number of devices used for migration and the number of multifd channels. + +Because memory usage depends on QAT configuration, please refer to `QAT Memory +Driver Queries +`_ +for memory usage calculation. + +.. list-table:: An example of a PF used for migration + :header-rows: 1 + + * - Number of channels + - Sender memory usage + - Receiver memory usage + * - 2 + - 10M + - 10M + * - 4 + - 12M + - 14M + * - 8 + - 16M + - 20M + +How To Choose Between QATzip and QPL +==================================== +Starting from 4th Gen Intel Xeon Scalable processors, codenamed Sapphire Rapids +processor(``SPR``), multiple built-in accelerators are supported including +``QAT`` and ``IAA``. The former can accelerate ``QATzip`` and the latter is +used to accelerate ``QPL``. + +Here are some suggestions: + +1 - If the live migration scenario is limited by network bandwidth and ``QAT`` +hardware resources exceed ``IAA``, use the ``QATzip`` method, which can save a +lot of host CPU resources for compression. + +2 - If the system cannot support shared virtual memory (SVM) technology, use +the ``QATzip`` method because ``QPL`` performance is not good without SVM +support. + +3 - For other scenarios, use the ``QPL`` method first. -- Gitee From a7fbc45c78315e46e26060a41eff1f6834430051 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:19 -0700 Subject: [PATCH 80/90] meson: Introduce 'qatzip' feature to the build system commit e28ed313c268aeb4e0cefb66dcd215c30e4443fe upstream. Add a 'qatzip' feature, which is automatically disabled, and which depends on the QATzip library if enabled. Intel-SIG: commit e28ed313c268 meson: Introduce 'qatzip' feature to the build system Reviewed-by: Fabiano Rosas Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-3-yichen.wang@bytedance.com Signed-off-by: Peter Xu Conflicts: meson.build meson_options.txt scripts/meson-buildoptions.sh [jz: resolve simple context conflicts] Signed-off-by: Jason Zeng --- meson.build | 9 +++++++++ meson_options.txt | 2 ++ scripts/meson-buildoptions.sh | 3 +++ 3 files changed, 14 insertions(+) diff --git a/meson.build b/meson.build index c833cd2d47a..273a8941473 100644 --- a/meson.build +++ b/meson.build @@ -1057,6 +1057,13 @@ if not get_option('qpl').auto() or have_system required: get_option('qpl'), method: 'pkg-config') endif +qatzip = not_found +if not get_option('qatzip').auto() or have_system + qatzip = dependency('qatzip', version: '>=1.1.2', + required: get_option('qatzip'), + method: 'pkg-config') +endif + virgl = not_found have_vhost_user_gpu = have_tools and targetos == 'linux' and pixman.found() @@ -2225,6 +2232,7 @@ config_host_data.set('CONFIG_STATX', has_statx) config_host_data.set('CONFIG_STATX_MNT_ID', has_statx_mnt_id) config_host_data.set('CONFIG_ZSTD', zstd.found()) config_host_data.set('CONFIG_QPL', qpl.found()) +config_host_data.set('CONFIG_QATZIP', qatzip.found()) config_host_data.set('CONFIG_FUSE', fuse.found()) config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found()) config_host_data.set('CONFIG_SPICE_PROTOCOL', spice_protocol.found()) @@ -4401,6 +4409,7 @@ summary_info += {'bzip2 support': libbzip2} summary_info += {'lzfse support': liblzfse} summary_info += {'zstd support': zstd} summary_info += {'Query Processing Library support': qpl} +summary_info += {'qatzip support': qatzip} summary_info += {'NUMA host support': numa} summary_info += {'capstone': capstone} summary_info += {'libpmem support': libpmem} diff --git a/meson_options.txt b/meson_options.txt index e16aa8c823b..6a2d8351fdb 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -261,6 +261,8 @@ option('zstd', type : 'feature', value : 'auto', description: 'zstd compression support') option('qpl', type : 'feature', value : 'auto', description: 'Query Processing Library support') +option('qatzip', type: 'feature', value: 'auto', + description: 'QATzip compression support') option('fuse', type: 'feature', value: 'auto', description: 'FUSE block device export') option('fuse_lseek', type : 'feature', value : 'auto', diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh index 784f74fde90..1f04c31bd00 100644 --- a/scripts/meson-buildoptions.sh +++ b/scripts/meson-buildoptions.sh @@ -164,6 +164,7 @@ meson_options_help() { printf "%s\n" ' plugins TCG plugins via shared library loading' printf "%s\n" ' png PNG support with libpng' printf "%s\n" ' pvrdma Enable PVRDMA support' + printf "%s\n" ' qatzip QATzip compression support' printf "%s\n" ' qcow1 qcow1 image format support' printf "%s\n" ' qed qed image format support' printf "%s\n" ' qga-vss build QGA VSS support (broken with MinGW)' @@ -431,6 +432,8 @@ _meson_option_parse() { --prefix=*) quote_sh "-Dprefix=$2" ;; --enable-pvrdma) printf "%s" -Dpvrdma=enabled ;; --disable-pvrdma) printf "%s" -Dpvrdma=disabled ;; + --enable-qatzip) printf "%s" -Dqatzip=enabled ;; + --disable-qatzip) printf "%s" -Dqatzip=disabled ;; --enable-qcow1) printf "%s" -Dqcow1=enabled ;; --disable-qcow1) printf "%s" -Dqcow1=disabled ;; --enable-qed) printf "%s" -Dqed=enabled ;; -- Gitee From ab95ed3f85b47d0b45ade0ecc70121df738fce5c Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:20 -0700 Subject: [PATCH 81/90] migration: Add migration parameters for QATzip commit 86c6eb1f39cbb7eb0467c114469e98ef699fb515 upstream. Adds support for migration parameters to control QATzip compression level. Intel-SIG: commit 86c6eb1f39cb migration: Add migration parameters for QATzip Acked-by: Markus Armbruster Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Reviewed-by: Fabiano Rosas Reviewed-by: Prasad Pandit Link: https://lore.kernel.org/r/20240830232722.58272-4-yichen.wang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- migration/migration-hmp-cmds.c | 4 ++++ migration/options.c | 34 ++++++++++++++++++++++++++++++++++ migration/options.h | 1 + qapi/migration.json | 18 ++++++++++++++++++ 4 files changed, 57 insertions(+) diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c index 16070c5eb3f..1e14adb9e14 100644 --- a/migration/migration-hmp-cmds.c +++ b/migration/migration-hmp-cmds.c @@ -652,6 +652,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict) p->has_multifd_zlib_level = true; visit_type_uint8(v, param, &p->multifd_zlib_level, &err); break; + case MIGRATION_PARAMETER_MULTIFD_QATZIP_LEVEL: + p->has_multifd_qatzip_level = true; + visit_type_uint8(v, param, &p->multifd_qatzip_level, &err); + break; case MIGRATION_PARAMETER_MULTIFD_ZSTD_LEVEL: p->has_multifd_zstd_level = true; visit_type_uint8(v, param, &p->multifd_zstd_level, &err); diff --git a/migration/options.c b/migration/options.c index 68ff81b8853..39ba3e0c0f2 100644 --- a/migration/options.c +++ b/migration/options.c @@ -62,6 +62,13 @@ #define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE /* 0: means nocompress, 1: best speed, ... 9: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1 +/* + * 1: best speed, ... 9: best compress ratio + * There is some nuance here. Refer to QATzip documentation to understand + * the mapping of QATzip levels to standard deflate levels. + */ +#define DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL 1 + /* 0: means nocompress, 1: best speed, ... 20: best compress ratio */ #define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1 @@ -143,6 +150,9 @@ Property migration_properties[] = { DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState, parameters.multifd_zlib_level, DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL), + DEFINE_PROP_UINT8("multifd-qatzip-level", MigrationState, + parameters.multifd_qatzip_level, + DEFAULT_MIGRATE_MULTIFD_QATZIP_LEVEL), DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState, parameters.multifd_zstd_level, DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL), @@ -865,6 +875,13 @@ int migrate_multifd_zlib_level(void) return s->parameters.multifd_zlib_level; } +int migrate_multifd_qatzip_level(void) +{ + MigrationState *s = migrate_get_current(); + + return s->parameters.multifd_qatzip_level; +} + int migrate_multifd_zstd_level(void) { MigrationState *s = migrate_get_current(); @@ -994,6 +1011,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) params->multifd_compression = s->parameters.multifd_compression; params->has_multifd_zlib_level = true; params->multifd_zlib_level = s->parameters.multifd_zlib_level; + params->has_multifd_qatzip_level = true; + params->multifd_qatzip_level = s->parameters.multifd_qatzip_level; params->has_multifd_zstd_level = true; params->multifd_zstd_level = s->parameters.multifd_zstd_level; params->has_xbzrle_cache_size = true; @@ -1054,6 +1073,7 @@ void migrate_params_init(MigrationParameters *params) params->has_multifd_channels = true; params->has_multifd_compression = true; params->has_multifd_zlib_level = true; + params->has_multifd_qatzip_level = true; params->has_multifd_zstd_level = true; params->has_xbzrle_cache_size = true; params->has_max_postcopy_bandwidth = true; @@ -1168,6 +1188,14 @@ bool migrate_params_check(MigrationParameters *params, Error **errp) return false; } + if (params->has_multifd_qatzip_level && + ((params->multifd_qatzip_level > 9) || + (params->multifd_qatzip_level < 1))) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_qatzip_level", + "a value between 1 and 9"); + return false; + } + if (params->has_multifd_zstd_level && (params->multifd_zstd_level > 20)) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level", @@ -1333,6 +1361,9 @@ static void migrate_params_test_apply(MigrateSetParameters *params, if (params->has_multifd_compression) { dest->multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + dest->multifd_qatzip_level = params->multifd_qatzip_level; + } if (params->has_multifd_zlib_level) { dest->multifd_zlib_level = params->multifd_zlib_level; } @@ -1491,6 +1522,9 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp) if (params->has_multifd_compression) { s->parameters.multifd_compression = params->multifd_compression; } + if (params->has_multifd_qatzip_level) { + s->parameters.multifd_qatzip_level = params->multifd_qatzip_level; + } if (params->has_multifd_zlib_level) { s->parameters.multifd_zlib_level = params->multifd_zlib_level; } diff --git a/migration/options.h b/migration/options.h index b7c4fb38615..1f1629a04e7 100644 --- a/migration/options.h +++ b/migration/options.h @@ -87,6 +87,7 @@ MigMode migrate_mode(void); int migrate_multifd_channels(void); MultiFDCompression migrate_multifd_compression(void); int migrate_multifd_zlib_level(void); +int migrate_multifd_qatzip_level(void); int migrate_multifd_zstd_level(void); uint8_t migrate_throttle_trigger_threshold(void); const char *migrate_tls_authz(void); diff --git a/qapi/migration.json b/qapi/migration.json index 720988007dc..6e7c03aa600 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -865,6 +865,11 @@ # speed, and 9 means best compression ratio which will consume # more CPU. Defaults to 1. (Since 5.0) # +# @multifd-qatzip-level: Set the compression level to be used in live +# migration. The level is an integer between 1 and 9, where 1 means +# the best compression speed, and 9 means the best compression +# ratio which will consume more CPU. Defaults to 1. (Since 9.2) +# # @multifd-zstd-level: Set the compression level to be used in live # migration, the compression level is an integer between 0 and 20, # where 0 means no compression, 1 means the best compression @@ -942,6 +947,7 @@ 'xbzrle-cache-size', 'max-postcopy-bandwidth', 'max-cpu-throttle', 'multifd-compression', 'multifd-zlib-level', 'multifd-zstd-level', + 'multifd-qatzip-level', 'block-bitmap-mapping', { 'name': 'x-vcpu-dirty-limit-period', 'features': ['unstable'] }, 'vcpu-dirty-limit', @@ -1070,6 +1076,11 @@ # speed, and 9 means best compression ratio which will consume # more CPU. Defaults to 1. (Since 5.0) # +# @multifd-qatzip-level: Set the compression level to be used in live +# migration. The level is an integer between 1 and 9, where 1 means +# the best compression speed, and 9 means the best compression +# ratio which will consume more CPU. Defaults to 1. (Since 9.2) +# # @multifd-zstd-level: Set the compression level to be used in live # migration, the compression level is an integer between 0 and 20, # where 0 means no compression, 1 means the best compression @@ -1165,6 +1176,7 @@ '*max-cpu-throttle': 'uint8', '*multifd-compression': 'MultiFDCompression', '*multifd-zlib-level': 'uint8', + '*multifd-qatzip-level': 'uint8', '*multifd-zstd-level': 'uint8', '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ], '*x-vcpu-dirty-limit-period': { 'type': 'uint64', @@ -1317,6 +1329,11 @@ # speed, and 9 means best compression ratio which will consume # more CPU. Defaults to 1. (Since 5.0) # +# @multifd-qatzip-level: Set the compression level to be used in live +# migration. The level is an integer between 1 and 9, where 1 means +# the best compression speed, and 9 means the best compression +# ratio which will consume more CPU. Defaults to 1. (Since 9.2) +# # @multifd-zstd-level: Set the compression level to be used in live # migration, the compression level is an integer between 0 and 20, # where 0 means no compression, 1 means the best compression @@ -1409,6 +1426,7 @@ '*max-cpu-throttle': 'uint8', '*multifd-compression': 'MultiFDCompression', '*multifd-zlib-level': 'uint8', + '*multifd-qatzip-level': 'uint8', '*multifd-zstd-level': 'uint8', '*block-bitmap-mapping': [ 'BitmapMigrationNodeAlias' ], '*x-vcpu-dirty-limit-period': { 'type': 'uint64', -- Gitee From 458a130bbebc495bdbe0e921f4fdf993f37c6dd6 Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:21 -0700 Subject: [PATCH 82/90] migration: Introduce 'qatzip' compression method commit 80484f945989988091c5cd729c3e8bde6c14907a upstream. Adds support for 'qatzip' as an option for the multifd compression method parameter, and implements using QAT for 'qatzip' compression and decompression. Intel-SIG: commit 80484f945989 migration: Introduce 'qatzip' compression method Acked-by: Markus Armbruster Reviewed-by: Fabiano Rosas Reviewed-by: Prasad Pandit Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-5-yichen.wang@bytedance.com Signed-off-by: Peter Xu Conflicts: hw/core/qdev-properties-system.c migration/meson.build migration/multifd.h qapi/migration.json [jz: resolve context conflicts due to uadk which is not backported] Signed-off-by: Jason Zeng --- hw/core/qdev-properties-system.c | 2 +- migration/meson.build | 1 + migration/multifd-qatzip.c | 394 +++++++++++++++++++++++++++++++ migration/multifd.h | 5 +- qapi/migration.json | 3 + 5 files changed, 402 insertions(+), 3 deletions(-) create mode 100644 migration/multifd-qatzip.c diff --git a/hw/core/qdev-properties-system.c b/hw/core/qdev-properties-system.c index 523b3a9bae4..25ee188df7b 100644 --- a/hw/core/qdev-properties-system.c +++ b/hw/core/qdev-properties-system.c @@ -666,7 +666,7 @@ const PropertyInfo qdev_prop_fdc_drive_type = { const PropertyInfo qdev_prop_multifd_compression = { .name = "MultiFDCompression", .description = "multifd_compression values, " - "none/zlib/zstd/qpl", + "none/zlib/zstd/qpl/qatzip", .enum_table = &MultiFDCompression_lookup, .get = qdev_propinfo_get_enum, .set = qdev_propinfo_set_enum, diff --git a/migration/meson.build b/migration/meson.build index cb177de1d2f..e3a0b896518 100644 --- a/migration/meson.build +++ b/migration/meson.build @@ -42,6 +42,7 @@ if get_option('live_block_migration').allowed() endif system_ss.add(when: zstd, if_true: files('multifd-zstd.c')) system_ss.add(when: qpl, if_true: files('multifd-qpl.c')) +system_ss.add(when: qatzip, if_true: files('multifd-qatzip.c')) specific_ss.add(when: 'CONFIG_SYSTEM_ONLY', if_true: files('ram.c', diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c new file mode 100644 index 00000000000..3c787ed8792 --- /dev/null +++ b/migration/multifd-qatzip.c @@ -0,0 +1,394 @@ +/* + * Multifd QATzip compression implementation + * + * Copyright (c) Bytedance + * + * Authors: + * Bryan Zhang + * Hao Xiang + * Yichen Wang + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "exec/ramblock.h" +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qapi/qapi-types-migration.h" +#include "options.h" +#include "multifd.h" +#include + +typedef struct { + /* + * Unique session for use with QATzip API + */ + QzSession_T sess; + + /* + * For compression: Buffer for pages to compress + * For decompression: Buffer for data to decompress + */ + uint8_t *in_buf; + uint32_t in_len; + + /* + * For compression: Output buffer of compressed data + * For decompression: Output buffer of decompressed data + */ + uint8_t *out_buf; + uint32_t out_len; +} QatzipData; + +/** + * qatzip_send_setup: Set up QATzip session and private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_setup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + /* We need one extra place for the packet header */ + p->iov = g_new0(struct iovec, 2); + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. + */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + /* Make sure to use configured QATzip compression level. */ + params.common_params.comp_lvl = migrate_multifd_qatzip_level(); + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + if (MULTIFD_PACKET_SIZE > UINT32_MAX) { + err_msg = "packet size too large for QAT"; + goto err; + } + + q->in_len = MULTIFD_PACKET_SIZE; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. + */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = qzMaxCompressedLength(MULTIFD_PACKET_SIZE, &q->sess); + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [sender] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_send_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return None + */ +static void qatzip_send_cleanup(MultiFDSendParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + + g_free(p->iov); + p->iov = NULL; + p->compress_data = NULL; +} + +/** + * qatzip_send_prepare: Compress pages and update IO channel info. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_send_prepare(MultiFDSendParams *p, Error **errp) +{ + MultiFDPages_t *pages = p->pages; + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + + if (!multifd_send_prepare_common(p)) { + goto out; + } + + /* + * Unlike other multifd compression implementations, we use a non-streaming + * API and place all the data into one buffer, rather than sending each + * page to the compression API at a time. Based on initial benchmarks, the + * non-streaming API outperforms the streaming API. Plus, the logic in QEMU + * is friendly to using the non-streaming API anyway. If either of these + * statements becomes no longer true, we can revisit adding a streaming + * implementation. + */ + for (int i = 0; i < pages->normal_num; i++) { + memcpy(q->in_buf + (i * p->page_size), + pages->block->host + pages->offset[i], + p->page_size); + } + + in_len = pages->normal_num * p->page_size; + if (in_len > q->in_len) { + error_setg(errp, "multifd %u: unexpectedly large input", p->id); + return -1; + } + out_len = q->out_len; + + ret = qzCompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len, 1); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: QATzip returned %d instead of QZ_OK", + p->id, ret); + return -1; + } + if (in_len != pages->normal_num * p->page_size) { + error_setg(errp, "multifd %u: QATzip failed to compress all input", + p->id); + return -1; + } + + p->iov[p->iovs_num].iov_base = q->out_buf; + p->iov[p->iovs_num].iov_len = out_len; + p->iovs_num++; + p->next_packet_size = out_len; + +out: + p->flags |= MULTIFD_FLAG_QATZIP; + multifd_send_fill_packet(p); + return 0; +} + +/** + * qatzip_recv_setup: Set up QATzip session and allocate private buffers. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv_setup(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q; + QzSessionParamsDeflate_T params; + const char *err_msg; + int ret; + + q = g_new0(QatzipData, 1); + p->compress_data = q; + + /* + * Initialize QAT device with software fallback by default. This allows + * QATzip to use CPU path when QAT hardware reaches maximum throughput. + */ + ret = qzInit(&q->sess, true); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzInit failed"; + goto err; + } + + ret = qzGetDefaultsDeflate(¶ms); + if (ret != QZ_OK) { + err_msg = "qzGetDefaultsDeflate failed"; + goto err; + } + + ret = qzSetupSessionDeflate(&q->sess, ¶ms); + if (ret != QZ_OK && ret != QZ_DUPLICATE) { + err_msg = "qzSetupSessionDeflate failed"; + goto err; + } + + /* + * Reserve extra spaces for the incoming packets. Current implementation + * doesn't send uncompressed pages in case the compression gets too big. + */ + q->in_len = MULTIFD_PACKET_SIZE * 2; + /* + * PINNED_MEM is an enum from qatzip headers, which means to use + * kzalloc_node() to allocate memory for QAT DMA purposes. When QAT device + * is not available or software fallback is used, the malloc flag needs to + * be set as COMMON_MEM. + */ + q->in_buf = qzMalloc(q->in_len, 0, PINNED_MEM); + if (!q->in_buf) { + q->in_buf = qzMalloc(q->in_len, 0, COMMON_MEM); + if (!q->in_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + q->out_len = MULTIFD_PACKET_SIZE; + q->out_buf = qzMalloc(q->out_len, 0, PINNED_MEM); + if (!q->out_buf) { + q->out_buf = qzMalloc(q->out_len, 0, COMMON_MEM); + if (!q->out_buf) { + err_msg = "qzMalloc failed"; + goto err; + } + } + + return 0; + +err: + error_setg(errp, "multifd %u: [receiver] %s", p->id, err_msg); + return -1; +} + +/** + * qatzip_recv_cleanup: Tear down QATzip session and release private buffers. + * + * @param p Multifd channel params + * @return None + */ +static void qatzip_recv_cleanup(MultiFDRecvParams *p) +{ + QatzipData *q = p->compress_data; + + if (q) { + if (q->in_buf) { + qzFree(q->in_buf); + } + if (q->out_buf) { + qzFree(q->out_buf); + } + (void)qzTeardownSession(&q->sess); + (void)qzClose(&q->sess); + g_free(q); + } + p->compress_data = NULL; +} + + +/** + * qatzip_recv: Decompress pages and copy them to the appropriate + * locations. + * + * @param p Multifd channel params + * @param errp Pointer to error, which will be set in case of error + * @return 0 on success, -1 on error (and *errp will be set) + */ +static int qatzip_recv(MultiFDRecvParams *p, Error **errp) +{ + QatzipData *q = p->compress_data; + int ret; + unsigned int in_len, out_len; + uint32_t in_size = p->next_packet_size; + uint32_t expected_size = p->normal_num * p->page_size; + uint32_t flags = p->flags & MULTIFD_FLAG_COMPRESSION_MASK; + + if (in_size > q->in_len) { + error_setg(errp, "multifd %u: received unexpectedly large packet", + p->id); + return -1; + } + + if (flags != MULTIFD_FLAG_QATZIP) { + error_setg(errp, "multifd %u: flags received %x flags expected %x", + p->id, flags, MULTIFD_FLAG_QATZIP); + return -1; + } + + multifd_recv_zero_page_process(p); + if (!p->normal_num) { + assert(in_size == 0); + return 0; + } + + ret = qio_channel_read_all(p->c, (void *)q->in_buf, in_size, errp); + if (ret != 0) { + return ret; + } + + in_len = in_size; + out_len = q->out_len; + ret = qzDecompress(&q->sess, q->in_buf, &in_len, q->out_buf, &out_len); + if (ret != QZ_OK) { + error_setg(errp, "multifd %u: qzDecompress failed", p->id); + return -1; + } + if (out_len != expected_size) { + error_setg(errp, "multifd %u: packet size received %u size expected %u", + p->id, out_len, expected_size); + return -1; + } + + /* Copy each page to its appropriate location. */ + for (int i = 0; i < p->normal_num; i++) { + memcpy(p->host + p->normal[i], + q->out_buf + p->page_size * i, + p->page_size); + } + return 0; +} + +static MultiFDMethods multifd_qatzip_ops = { + .send_setup = qatzip_send_setup, + .send_cleanup = qatzip_send_cleanup, + .send_prepare = qatzip_send_prepare, + .recv_setup = qatzip_recv_setup, + .recv_cleanup = qatzip_recv_cleanup, + .recv = qatzip_recv +}; + +static void multifd_qatzip_register(void) +{ + multifd_register_ops(MULTIFD_COMPRESSION_QATZIP, &multifd_qatzip_ops); +} + +migration_init(multifd_qatzip_register); diff --git a/migration/multifd.h b/migration/multifd.h index 41965df7a94..dedb1f1c14d 100644 --- a/migration/multifd.h +++ b/migration/multifd.h @@ -29,13 +29,14 @@ bool multifd_queue_page(RAMBlock *block, ram_addr_t offset); /* Multifd Compression flags */ #define MULTIFD_FLAG_SYNC (1 << 0) -/* We reserve 3 bits for compression methods */ -#define MULTIFD_FLAG_COMPRESSION_MASK (7 << 1) +/* We reserve 5 bits for compression methods */ +#define MULTIFD_FLAG_COMPRESSION_MASK (0x1f << 1) /* we need to be compatible. Before compression value was 0 */ #define MULTIFD_FLAG_NOCOMP (0 << 1) #define MULTIFD_FLAG_ZLIB (1 << 1) #define MULTIFD_FLAG_ZSTD (2 << 1) #define MULTIFD_FLAG_QPL (4 << 1) +#define MULTIFD_FLAG_QATZIP (16 << 1) /* This value needs to be a multiple of qemu_target_page_size() */ #define MULTIFD_PACKET_SIZE (512 * 1024) diff --git a/qapi/migration.json b/qapi/migration.json index 6e7c03aa600..5186755d60a 100644 --- a/qapi/migration.json +++ b/qapi/migration.json @@ -625,6 +625,8 @@ # # @zstd: use zstd compression method. # +# @qatzip: use qatzip compression method. (Since 9.2) +# # @qpl: use qpl compression method. Query Processing Library(qpl) is # based on the deflate compression algorithm and use the Intel # In-Memory Analytics Accelerator(IAA) accelerated compression @@ -635,6 +637,7 @@ { 'enum': 'MultiFDCompression', 'data': [ 'none', 'zlib', { 'name': 'zstd', 'if': 'CONFIG_ZSTD' }, + { 'name': 'qatzip', 'if': 'CONFIG_QATZIP'}, { 'name': 'qpl', 'if': 'CONFIG_QPL' } ] } ## -- Gitee From 8e6f0610e7bb4e8c6d7754e542e05bd4b946f01f Mon Sep 17 00:00:00 2001 From: Bryan Zhang Date: Fri, 30 Aug 2024 16:27:22 -0700 Subject: [PATCH 83/90] tests/migration: Add integration test for 'qatzip' compression method commit afe166d4e8bc33bc448cd573b55d0ac094187d48 upstream. Adds an integration test for 'qatzip'. Intel-SIG: commit afe166d4e8bc tests/migration: Add integration test for 'qatzip' compression method Reviewed-by: Fabiano Rosas Signed-off-by: Bryan Zhang Signed-off-by: Hao Xiang Signed-off-by: Yichen Wang Link: https://lore.kernel.org/r/20240830232722.58272-6-yichen.wang@bytedance.com Signed-off-by: Peter Xu Signed-off-by: Jason Zeng --- tests/qtest/migration-test.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c index 10a3f99d6c5..5998e7a50cb 100644 --- a/tests/qtest/migration-test.c +++ b/tests/qtest/migration-test.c @@ -2582,6 +2582,18 @@ test_migrate_precopy_tcp_multifd_zstd_start(QTestState *from, } #endif /* CONFIG_ZSTD */ +#ifdef CONFIG_QATZIP +static void * +test_migrate_precopy_tcp_multifd_qatzip_start(QTestState *from, + QTestState *to) +{ + migrate_set_parameter_int(from, "multifd-qatzip-level", 2); + migrate_set_parameter_int(to, "multifd-qatzip-level", 2); + + return test_migrate_precopy_tcp_multifd_start_common(from, to, "qatzip"); +} +#endif + #ifdef CONFIG_QPL static void * test_migrate_precopy_tcp_multifd_qpl_start(QTestState *from, @@ -2626,6 +2638,17 @@ static void test_multifd_tcp_zstd(void) } #endif +#ifdef CONFIG_QATZIP +static void test_multifd_tcp_qatzip(void) +{ + MigrateCommon args = { + .listen_uri = "defer", + .start_hook = test_migrate_precopy_tcp_multifd_qatzip_start, + }; + test_precopy_common(&args); +} +#endif + #ifdef CONFIG_QPL static void test_multifd_tcp_qpl(void) { @@ -3513,6 +3536,10 @@ int main(int argc, char **argv) migration_test_add("/migration/multifd/tcp/plain/zstd", test_multifd_tcp_zstd); #endif +#ifdef CONFIG_QATZIP + migration_test_add("/migration/multifd/tcp/plain/qatzip", + test_multifd_tcp_qatzip); +#endif #ifdef CONFIG_QPL migration_test_add("/migration/multifd/tcp/plain/qpl", test_multifd_tcp_qpl); -- Gitee From 5c86088a1f9cbdbcb6e531fbf49fd526119a6590 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Tue, 17 Sep 2024 15:58:02 -0300 Subject: [PATCH 84/90] migration/multifd: Fix rb->receivedmap cleanup race commit 4ce56229087860805877075ddb29dd44578365a9 upstream. Fix a segmentation fault in multifd when rb->receivedmap is cleared too early. After commit 5ef7e26bdb ("migration/multifd: solve zero page causing multiple page faults"), multifd started using the rb->receivedmap bitmap, which belongs to ram.c and is initialized and *freed* from the ram SaveVMHandlers. Multifd threads are live until migration_incoming_state_destroy(), which is called after qemu_loadvm_state_cleanup(), leading to a crash when accessing rb->receivedmap. process_incoming_migration_co() ... qemu_loadvm_state() multifd_nocomp_recv() qemu_loadvm_state_cleanup() ramblock_recv_bitmap_set_offset() rb->receivedmap = NULL set_bit_atomic(..., rb->receivedmap) ... migration_incoming_state_destroy() multifd_recv_cleanup() multifd_recv_terminate_threads(NULL) Move the loadvm cleanup into migration_incoming_state_destroy(), after multifd_recv_cleanup() to ensure multifd threads have already exited when rb->receivedmap is cleared. Adjust the postcopy listen thread comment to indicate that we still want to skip the cpu synchronization. Intel-SIG: commit 4ce562290878 migration/multifd: Fix rb->receivedmap cleanup race CC: qemu-stable@nongnu.org Fixes: 5ef7e26bdb ("migration/multifd: solve zero page causing multiple page faults") Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240917185802.15619-3-farosas@suse.de [peterx: added comment in migration_incoming_state_destroy()] Signed-off-by: Peter Xu Conflicts: migration/migration.c [jz: resolve context conflict due to non-multifd compression which is already deleted in upstream while still in anolis] Signed-off-by: Jason Zeng --- migration/migration.c | 5 +++++ migration/savevm.c | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/migration/migration.c b/migration/migration.c index e7a513cb182..d1c8ec3be6f 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -278,6 +278,11 @@ void migration_incoming_state_destroy(void) multifd_recv_cleanup(); compress_threads_load_cleanup(); + /* + * RAM state cleanup needs to happen after multifd cleanup, because + * multifd threads can use some of its states (receivedmap). + */ + qemu_loadvm_state_cleanup(); if (mis->to_src_file) { /* Tell source that we are done */ diff --git a/migration/savevm.c b/migration/savevm.c index cf88057efac..44f85874098 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2957,7 +2957,10 @@ int qemu_loadvm_state(QEMUFile *f) trace_qemu_loadvm_state_post_main(ret); if (mis->have_listen_thread) { - /* Listen thread still going, can't clean up yet */ + /* + * Postcopy listen thread still going, don't synchronize the + * cpus yet. + */ return ret; } @@ -3000,7 +3003,6 @@ int qemu_loadvm_state(QEMUFile *f) } } - qemu_loadvm_state_cleanup(); cpu_synchronize_all_post_init(); return ret; -- Gitee From dc83b3eec1d48af61acb57d4e5f3e28fe026778a Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 10 Sep 2024 07:41:38 +0200 Subject: [PATCH 85/90] migration/multifd: Fix loop conditions in multifd_zstd_send_prepare and multifd_zstd_recv commit cb0ed522a51a7d4b1fde535972d4aeeb82447928 upstream. GitHub's CodeQL reports four critical errors which are fixed by this commit: Unsigned difference expression compared to zero An expression (u - v > 0) with unsigned values u, v is only false if u == v, so all changed expressions did not work as expected. Intel-SIG: commit cb0ed522a51a migration/multifd: Fix loop conditions in multifd_zstd_send_prepare and multifd_zstd_recv Signed-off-by: Stefan Weil Link: https://lore.kernel.org/r/20240910054138.1458555-1-sw@weilnetz.de [peterx: Fix mangled email for author] Signed-off-by: Peter Xu Conflicts: migration/multifd-zstd.c [jz: resolve context conflict due to p->page which not renamed to page yet] Signed-off-by: Jason Zeng --- migration/multifd-zstd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c index ca17b7e310f..46ee68b6cef 100644 --- a/migration/multifd-zstd.c +++ b/migration/multifd-zstd.c @@ -152,9 +152,9 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp) */ do { ret = ZSTD_compressStream2(z->zcs, &z->out, &z->in, flush); - } while (ret > 0 && (z->in.size - z->in.pos > 0) - && (z->out.size - z->out.pos > 0)); - if (ret > 0 && (z->in.size - z->in.pos > 0)) { + } while (ret > 0 && (z->in.size > z->in.pos) + && (z->out.size > z->out.pos)); + if (ret > 0 && (z->in.size > z->in.pos)) { error_setg(errp, "multifd %u: compressStream buffer too small", p->id); return -1; @@ -299,7 +299,7 @@ static int zstd_recv(MultiFDRecvParams *p, Error **errp) */ do { ret = ZSTD_decompressStream(z->zds, &z->out, &z->in); - } while (ret > 0 && (z->in.size - z->in.pos > 0) + } while (ret > 0 && (z->in.size > z->in.pos) && (z->out.pos < p->page_size)); if (ret > 0 && (z->out.pos < p->page_size)) { error_setg(errp, "multifd %u: decompressStream buffer too small", -- Gitee From 77987d9d6ef1885dee409f76b4fc11165b4c23bf Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 19 Sep 2024 12:06:11 -0300 Subject: [PATCH 86/90] migration/multifd: Ensure packet->ramblock is null-terminated commit 68e0fca625912c7c63a8bfbc784f53d4fefa1a13 upstream. Coverity points out that the current usage of strncpy to write the ramblock name allows the field to not have an ending '\0' in case idstr is already not null-terminated (e.g. if it's larger than 256 bytes). This is currently harmless because the packet->ramblock field is never touched again on the source side. The destination side reads only up to the field's size from the stream and forces the last byte to be 0. We're still open to a programming error in the future in case this field is ever passed into a function that expects a null-terminated string. Change from strncpy to QEMU's pstrcpy, which puts a '\0' at the end of the string and doesn't fill the extra space with zeros. (there's no spillage between iterations of fill_packet because after commit 87bb9e953e ("migration/multifd: Isolate ram pages packet data") the packet is always zeroed before filling) Intel-SIG: commit 68e0fca62591 migration/multifd: Ensure packet->ramblock is null-terminated Resolves: Coverity CID 1560071 Reported-by: Peter Maydell Signed-off-by: Fabiano Rosas Link: https://lore.kernel.org/r/20240919150611.17074-1-farosas@suse.de Signed-off-by: Peter Xu Conflicts: migration/multifd-nocomp.c [jz: upstream has split nocomp code into multifd-nocomp.c, while we havenot yet. The function that needs to be fixed is still named multifd_send_fill_packet in multifd.c, so we fix it in multifd.c] Signed-off-by: Jason Zeng --- migration/multifd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index eae70400399..e77aaeeb300 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -16,6 +16,7 @@ #include "exec/target_page.h" #include "sysemu/sysemu.h" #include "exec/ramblock.h" +#include "qemu/cutils.h" #include "qemu/error-report.h" #include "qapi/error.h" #include "ram.h" @@ -399,7 +400,8 @@ void multifd_send_fill_packet(MultiFDSendParams *p) packet->packet_num = cpu_to_be64(packet_num); if (pages->block) { - strncpy(packet->ramblock, pages->block->idstr, 256); + pstrcpy(packet->ramblock, sizeof(packet->ramblock), + pages->block->idstr); } for (i = 0; i < pages->num; i++) { -- Gitee From d9fbf1bcec83c26b7e78b75917fd8826281e17d7 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 29 Oct 2024 15:58:15 +0100 Subject: [PATCH 87/90] migration/multifd: Zero p->flags before starting filling a packet commit 00b4b216534d84ace7b0583cec70a3aaf256cb25 upstream. This way there aren't stale flags there. p->flags can't contain SYNC to be sent at the next RAM packet since syncs are now handled separately in multifd_send_thread. Intel-SIG: commit 00b4b216534d migration/multifd: Zero p->flags before starting filling a packet Reviewed-by: Fabiano Rosas Reviewed-by: Peter Xu Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/r/1c96b6cdb797e6f035eb1a4ad9bfc24f4c7f5df8.1730203967.git.maciej.szmigiero@oracle.com Signed-off-by: Peter Xu Conflicts: migration/multifd.c [jz: resolve simple context conflict] Signed-off-by: Jason Zeng --- migration/multifd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/migration/multifd.c b/migration/multifd.c index e77aaeeb300..f35f4e11202 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -894,6 +894,7 @@ static void *multifd_send_thread(void *opaque) if (qatomic_load_acquire(&p->pending_job)) { MultiFDPages_t *pages = p->pages; + p->flags = 0; p->iovs_num = 0; assert(pages->num); @@ -940,7 +941,6 @@ static void *multifd_send_thread(void *opaque) } /* p->next_packet_size will always be zero for a SYNC packet */ stat64_add(&mig_stats.multifd_bytes, p->packet_len); - p->flags = 0; } qatomic_set(&p->pending_sync, false); -- Gitee From 73b540ec9c69f636ef2dcd4116d75995416764af Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Wed, 18 Dec 2024 17:14:11 +0800 Subject: [PATCH 88/90] multifd: bugfix for migration using compression methods commit cdc3970f8597ebdc1a4c2090cfb4d11e297329ed upstream. When compression is enabled on the migration channel and the pages processed are all zero pages, these pages will not be sent and updated on the target side, resulting in incorrect memory data on the source and target sides. The root cause is that all compression methods call multifd_send_prepare_common to determine whether to compress dirty pages, but multifd_send_prepare_common does not update the IOV of MultiFDPacket_t when all dirty pages are zero pages. The solution is to always update the IOV of MultiFDPacket_t regardless of whether the dirty pages are all zero pages. Intel-SIG: commit cdc3970f8597 multifd: bugfix for migration using compression methods Fixes: 303e6f54f9 ("migration/multifd: Implement zero page transmission on the multifd thread.") Cc: qemu-stable@nongnu.org #9.0+ Signed-off-by: Yuan Liu Reviewed-by: Jason Zeng Reviewed-by: Peter Xu Message-Id: <20241218091413.140396-2-yuan1.liu@intel.com> Signed-off-by: Fabiano Rosas Conflicts: migration/multifd-nocomp.c [jz: upstream has split nocomp code into multifd-nocomp.c, while we havenot yet. The function that needs to be fixed is still in multifd.c, so we fix it in multifd.c] Signed-off-by: Jason Zeng --- migration/multifd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/migration/multifd.c b/migration/multifd.c index f35f4e11202..9c95d36d83e 100644 --- a/migration/multifd.c +++ b/migration/multifd.c @@ -1484,6 +1484,7 @@ void multifd_recv_new_channel(QIOChannel *ioc, Error **errp) bool multifd_send_prepare_common(MultiFDSendParams *p) { + multifd_send_prepare_header(p); multifd_send_zero_page_detect(p); if (!p->pages->normal_num) { @@ -1491,7 +1492,5 @@ bool multifd_send_prepare_common(MultiFDSendParams *p) return false; } - multifd_send_prepare_header(p); - return true; } -- Gitee From 2d85cbea6ba784ee10622cbbcfeeba3f1df9d73f Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Wed, 18 Dec 2024 17:14:12 +0800 Subject: [PATCH 89/90] multifd: bugfix for incorrect migration data with QPL compression commit 2588a5f99b0c3493b4690e3ff01ed36f80e830cc upstream. When QPL compression is enabled on the migration channel and the same dirty page changes from a normal page to a zero page in the iterative memory copy, the dirty page will not be updated to a zero page again on the target side, resulting in incorrect memory data on the source and target sides. The root cause is that the target side does not record the normal pages to the receivedmap. The solution is to add ramblock_recv_bitmap_set_offset in target side to record the normal pages. Intel-SIG: commit 2588a5f99b0c multifd: bugfix for incorrect migration data with QPL compression Signed-off-by: Yuan Liu Reviewed-by: Jason Zeng Reviewed-by: Peter Xu Message-Id: <20241218091413.140396-3-yuan1.liu@intel.com> Signed-off-by: Fabiano Rosas Signed-off-by: Jason Zeng --- migration/multifd-qpl.c | 1 + 1 file changed, 1 insertion(+) diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c index 9265098ee79..fea60e39379 100644 --- a/migration/multifd-qpl.c +++ b/migration/multifd-qpl.c @@ -730,6 +730,7 @@ static int multifd_qpl_recv(MultiFDRecvParams *p, Error **errp) qpl->zlen[i] = be32_to_cpu(qpl->zlen[i]); assert(qpl->zlen[i] <= p->page_size); zbuf_len += qpl->zlen[i]; + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } /* read compressed pages */ -- Gitee From 07c4c3f072e326f32757b7d5d784bd0c0665c2fc Mon Sep 17 00:00:00 2001 From: Yuan Liu Date: Wed, 18 Dec 2024 17:14:13 +0800 Subject: [PATCH 90/90] multifd: bugfix for incorrect migration data with qatzip compression commit a523bc52166c80d8a04d46584f9f3868bd53ef69 upstream. When QPL compression is enabled on the migration channel and the same dirty page changes from a normal page to a zero page in the iterative memory copy, the dirty page will not be updated to a zero page again on the target side, resulting in incorrect memory data on the source and target sides. The root cause is that the target side does not record the normal pages to the receivedmap. The solution is to add ramblock_recv_bitmap_set_offset in target side to record the normal pages. Intel-SIG: commit a523bc52166c multifd: bugfix for incorrect migration data with qatzip compression Signed-off-by: Yuan Liu Reviewed-by: Jason Zeng Reviewed-by: Peter Xu Message-Id: <20241218091413.140396-4-yuan1.liu@intel.com> Signed-off-by: Fabiano Rosas Conflicts: migration/multifd-qatzip.c [jz: resolve simple context conflict] Signed-off-by: Jason Zeng --- migration/multifd-qatzip.c | 1 + 1 file changed, 1 insertion(+) diff --git a/migration/multifd-qatzip.c b/migration/multifd-qatzip.c index 3c787ed8792..88b6fb44ad6 100644 --- a/migration/multifd-qatzip.c +++ b/migration/multifd-qatzip.c @@ -373,6 +373,7 @@ static int qatzip_recv(MultiFDRecvParams *p, Error **errp) memcpy(p->host + p->normal[i], q->out_buf + p->page_size * i, p->page_size); + ramblock_recv_bitmap_set_offset(p->block, p->normal[i]); } return 0; } -- Gitee